scitex 2.0.0__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (704):
  1. scitex/__init__.py +53 -15
  2. scitex/__main__.py +72 -26
  3. scitex/__version__.py +1 -1
  4. scitex/_sh.py +145 -23
  5. scitex/ai/__init__.py +30 -16
  6. scitex/ai/_gen_ai/_Anthropic.py +5 -7
  7. scitex/ai/_gen_ai/_BaseGenAI.py +2 -2
  8. scitex/ai/_gen_ai/_DeepSeek.py +10 -2
  9. scitex/ai/_gen_ai/_Google.py +2 -2
  10. scitex/ai/_gen_ai/_Llama.py +2 -2
  11. scitex/ai/_gen_ai/_OpenAI.py +2 -2
  12. scitex/ai/_gen_ai/_PARAMS.py +51 -65
  13. scitex/ai/_gen_ai/_Perplexity.py +2 -2
  14. scitex/ai/_gen_ai/__init__.py +25 -14
  15. scitex/ai/_gen_ai/_format_output_func.py +4 -4
  16. scitex/ai/classification/{classifier_server.py → Classifier.py} +5 -5
  17. scitex/ai/classification/CrossValidationExperiment.py +374 -0
  18. scitex/ai/classification/__init__.py +43 -4
  19. scitex/ai/classification/reporters/_BaseClassificationReporter.py +281 -0
  20. scitex/ai/classification/reporters/_ClassificationReporter.py +773 -0
  21. scitex/ai/classification/reporters/_MultiClassificationReporter.py +406 -0
  22. scitex/ai/classification/reporters/_SingleClassificationReporter.py +1834 -0
  23. scitex/ai/classification/reporters/__init__.py +11 -0
  24. scitex/ai/classification/reporters/reporter_utils/_Plotter.py +1028 -0
  25. scitex/ai/classification/reporters/reporter_utils/__init__.py +80 -0
  26. scitex/ai/classification/reporters/reporter_utils/aggregation.py +457 -0
  27. scitex/ai/classification/reporters/reporter_utils/data_models.py +313 -0
  28. scitex/ai/classification/reporters/reporter_utils/reporting.py +1056 -0
  29. scitex/ai/classification/reporters/reporter_utils/storage.py +221 -0
  30. scitex/ai/classification/reporters/reporter_utils/validation.py +395 -0
  31. scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +568 -0
  32. scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +688 -0
  33. scitex/ai/classification/timeseries/_TimeSeriesMetadata.py +139 -0
  34. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +1716 -0
  35. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +1685 -0
  36. scitex/ai/classification/timeseries/_TimeSeriesStrategy.py +84 -0
  37. scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +610 -0
  38. scitex/ai/classification/timeseries/__init__.py +39 -0
  39. scitex/ai/classification/timeseries/_normalize_timestamp.py +436 -0
  40. scitex/ai/clustering/_umap.py +2 -2
  41. scitex/ai/feature_extraction/vit.py +1 -0
  42. scitex/ai/feature_selection/__init__.py +30 -0
  43. scitex/ai/feature_selection/feature_selection.py +364 -0
  44. scitex/ai/loss/multi_task_loss.py +1 -1
  45. scitex/ai/metrics/__init__.py +51 -4
  46. scitex/ai/metrics/_calc_bacc.py +61 -0
  47. scitex/ai/metrics/_calc_bacc_from_conf_mat.py +38 -0
  48. scitex/ai/metrics/_calc_clf_report.py +78 -0
  49. scitex/ai/metrics/_calc_conf_mat.py +93 -0
  50. scitex/ai/metrics/_calc_feature_importance.py +183 -0
  51. scitex/ai/metrics/_calc_mcc.py +61 -0
  52. scitex/ai/metrics/_calc_pre_rec_auc.py +116 -0
  53. scitex/ai/metrics/_calc_roc_auc.py +110 -0
  54. scitex/ai/metrics/_calc_seizure_prediction_metrics.py +490 -0
  55. scitex/ai/metrics/{silhoute_score_block.py → _calc_silhouette_score.py} +15 -8
  56. scitex/ai/metrics/_normalize_labels.py +83 -0
  57. scitex/ai/plt/__init__.py +47 -8
  58. scitex/ai/plt/{_conf_mat.py → _plot_conf_mat.py} +158 -87
  59. scitex/ai/plt/_plot_feature_importance.py +323 -0
  60. scitex/ai/plt/_plot_learning_curve.py +345 -0
  61. scitex/ai/plt/_plot_optuna_study.py +225 -0
  62. scitex/ai/plt/_plot_pre_rec_curve.py +290 -0
  63. scitex/ai/plt/_plot_roc_curve.py +255 -0
  64. scitex/ai/training/{learning_curve_logger.py → _LearningCurveLogger.py} +197 -213
  65. scitex/ai/training/__init__.py +2 -2
  66. scitex/ai/utils/grid_search.py +3 -3
  67. scitex/benchmark/__init__.py +52 -0
  68. scitex/benchmark/benchmark.py +400 -0
  69. scitex/benchmark/monitor.py +370 -0
  70. scitex/benchmark/profiler.py +297 -0
  71. scitex/browser/__init__.py +48 -0
  72. scitex/browser/automation/CookieHandler.py +216 -0
  73. scitex/browser/automation/__init__.py +7 -0
  74. scitex/browser/collaboration/__init__.py +55 -0
  75. scitex/browser/collaboration/auth_helpers.py +94 -0
  76. scitex/browser/collaboration/collaborative_agent.py +136 -0
  77. scitex/browser/collaboration/credential_manager.py +188 -0
  78. scitex/browser/collaboration/interactive_panel.py +400 -0
  79. scitex/browser/collaboration/persistent_browser.py +170 -0
  80. scitex/browser/collaboration/shared_session.py +383 -0
  81. scitex/browser/collaboration/standard_interactions.py +246 -0
  82. scitex/browser/collaboration/visual_feedback.py +181 -0
  83. scitex/browser/core/BrowserMixin.py +326 -0
  84. scitex/browser/core/ChromeProfileManager.py +446 -0
  85. scitex/browser/core/__init__.py +9 -0
  86. scitex/browser/debugging/__init__.py +18 -0
  87. scitex/browser/debugging/_browser_logger.py +657 -0
  88. scitex/browser/debugging/_highlight_element.py +143 -0
  89. scitex/browser/debugging/_show_grid.py +154 -0
  90. scitex/browser/interaction/__init__.py +24 -0
  91. scitex/browser/interaction/click_center.py +149 -0
  92. scitex/browser/interaction/click_with_fallbacks.py +206 -0
  93. scitex/browser/interaction/close_popups.py +498 -0
  94. scitex/browser/interaction/fill_with_fallbacks.py +209 -0
  95. scitex/browser/pdf/__init__.py +14 -0
  96. scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +200 -0
  97. scitex/browser/pdf/detect_chrome_pdf_viewer.py +198 -0
  98. scitex/browser/remote/CaptchaHandler.py +434 -0
  99. scitex/browser/remote/ZenRowsAPIClient.py +347 -0
  100. scitex/browser/remote/ZenRowsBrowserManager.py +570 -0
  101. scitex/browser/remote/__init__.py +11 -0
  102. scitex/browser/stealth/HumanBehavior.py +344 -0
  103. scitex/browser/stealth/StealthManager.py +1008 -0
  104. scitex/browser/stealth/__init__.py +9 -0
  105. scitex/browser/template.py +122 -0
  106. scitex/capture/__init__.py +110 -0
  107. scitex/capture/__main__.py +25 -0
  108. scitex/capture/capture.py +848 -0
  109. scitex/capture/cli.py +233 -0
  110. scitex/capture/gif.py +344 -0
  111. scitex/capture/mcp_server.py +961 -0
  112. scitex/capture/session.py +70 -0
  113. scitex/capture/utils.py +705 -0
  114. scitex/cli/__init__.py +17 -0
  115. scitex/cli/cloud.py +447 -0
  116. scitex/cli/main.py +42 -0
  117. scitex/cli/scholar.py +280 -0
  118. scitex/context/_suppress_output.py +5 -3
  119. scitex/db/__init__.py +30 -3
  120. scitex/db/__main__.py +75 -0
  121. scitex/db/_check_health.py +381 -0
  122. scitex/db/_delete_duplicates.py +25 -386
  123. scitex/db/_inspect.py +335 -114
  124. scitex/db/_inspect_optimized.py +301 -0
  125. scitex/db/{_PostgreSQL.py → _postgresql/_PostgreSQL.py} +3 -3
  126. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BackupMixin.py +1 -1
  127. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BatchMixin.py +1 -1
  128. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BlobMixin.py +1 -1
  129. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_ConnectionMixin.py +1 -1
  130. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_MaintenanceMixin.py +1 -1
  131. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_QueryMixin.py +1 -1
  132. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_SchemaMixin.py +1 -1
  133. scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_TransactionMixin.py +1 -1
  134. scitex/db/_postgresql/__init__.py +6 -0
  135. scitex/db/_sqlite3/_SQLite3.py +210 -0
  136. scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin.py +581 -0
  137. scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin_v01-need-_hash-col.py +517 -0
  138. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_BatchMixin.py +1 -1
  139. scitex/db/_sqlite3/_SQLite3Mixins/_BlobMixin.py +281 -0
  140. scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin.py +548 -0
  141. scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin_v01-indentation-issues.py +583 -0
  142. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_ConnectionMixin.py +29 -13
  143. scitex/db/_sqlite3/_SQLite3Mixins/_GitMixin.py +583 -0
  144. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_ImportExportMixin.py +1 -1
  145. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_IndexMixin.py +1 -1
  146. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_MaintenanceMixin.py +2 -1
  147. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_QueryMixin.py +37 -10
  148. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_RowMixin.py +46 -6
  149. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_TableMixin.py +56 -10
  150. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_TransactionMixin.py +1 -1
  151. scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/__init__.py +14 -2
  152. scitex/db/_sqlite3/__init__.py +7 -0
  153. scitex/db/_sqlite3/_delete_duplicates.py +274 -0
  154. scitex/decorators/__init__.py +2 -0
  155. scitex/decorators/_cache_disk.py +13 -5
  156. scitex/decorators/_cache_disk_async.py +49 -0
  157. scitex/decorators/_deprecated.py +175 -10
  158. scitex/decorators/_timeout.py +1 -1
  159. scitex/dev/_analyze_code_flow.py +2 -2
  160. scitex/dict/_DotDict.py +73 -15
  161. scitex/dict/_DotDict_v01-not-handling-recursive-instantiations.py +442 -0
  162. scitex/dict/_DotDict_v02-not-serializing-Path-object.py +446 -0
  163. scitex/dict/__init__.py +2 -0
  164. scitex/dict/_flatten.py +27 -0
  165. scitex/dsp/_crop.py +2 -2
  166. scitex/dsp/_demo_sig.py +2 -2
  167. scitex/dsp/_detect_ripples.py +2 -2
  168. scitex/dsp/_hilbert.py +2 -2
  169. scitex/dsp/_listen.py +6 -6
  170. scitex/dsp/_modulation_index.py +2 -2
  171. scitex/dsp/_pac.py +1 -1
  172. scitex/dsp/_psd.py +2 -2
  173. scitex/dsp/_resample.py +2 -1
  174. scitex/dsp/_time.py +3 -2
  175. scitex/dsp/_wavelet.py +3 -2
  176. scitex/dsp/add_noise.py +2 -2
  177. scitex/dsp/example.py +1 -0
  178. scitex/dsp/filt.py +10 -9
  179. scitex/dsp/template.py +3 -2
  180. scitex/dsp/utils/_differential_bandpass_filters.py +1 -1
  181. scitex/dsp/utils/pac.py +2 -2
  182. scitex/dt/_normalize_timestamp.py +432 -0
  183. scitex/errors.py +572 -0
  184. scitex/gen/_DimHandler.py +2 -2
  185. scitex/gen/__init__.py +37 -7
  186. scitex/gen/_deprecated_close.py +80 -0
  187. scitex/gen/_deprecated_start.py +26 -0
  188. scitex/gen/_detect_environment.py +152 -0
  189. scitex/gen/_detect_notebook_path.py +169 -0
  190. scitex/gen/_embed.py +6 -2
  191. scitex/gen/_get_notebook_path.py +257 -0
  192. scitex/gen/_less.py +1 -1
  193. scitex/gen/_list_packages.py +2 -2
  194. scitex/gen/_norm.py +44 -9
  195. scitex/gen/_norm_cache.py +269 -0
  196. scitex/gen/_src.py +3 -5
  197. scitex/gen/_title_case.py +3 -3
  198. scitex/io/__init__.py +28 -6
  199. scitex/io/_glob.py +13 -7
  200. scitex/io/_load.py +108 -21
  201. scitex/io/_load_cache.py +303 -0
  202. scitex/io/_load_configs.py +40 -15
  203. scitex/io/{_H5Explorer.py → _load_modules/_H5Explorer.py} +80 -17
  204. scitex/io/_load_modules/_ZarrExplorer.py +114 -0
  205. scitex/io/_load_modules/_bibtex.py +207 -0
  206. scitex/io/_load_modules/_hdf5.py +53 -178
  207. scitex/io/_load_modules/_json.py +5 -3
  208. scitex/io/_load_modules/_pdf.py +871 -16
  209. scitex/io/_load_modules/_sqlite3.py +15 -0
  210. scitex/io/_load_modules/_txt.py +41 -12
  211. scitex/io/_load_modules/_yaml.py +4 -3
  212. scitex/io/_load_modules/_zarr.py +126 -0
  213. scitex/io/_save.py +429 -171
  214. scitex/io/_save_modules/__init__.py +6 -0
  215. scitex/io/_save_modules/_bibtex.py +194 -0
  216. scitex/io/_save_modules/_csv.py +8 -4
  217. scitex/io/_save_modules/_excel.py +174 -15
  218. scitex/io/_save_modules/_hdf5.py +251 -226
  219. scitex/io/_save_modules/_image.py +1 -3
  220. scitex/io/_save_modules/_json.py +49 -4
  221. scitex/io/_save_modules/_listed_dfs_as_csv.py +1 -3
  222. scitex/io/_save_modules/_listed_scalars_as_csv.py +1 -3
  223. scitex/io/_save_modules/_tex.py +277 -0
  224. scitex/io/_save_modules/_yaml.py +42 -3
  225. scitex/io/_save_modules/_zarr.py +160 -0
  226. scitex/io/utils/__init__.py +20 -0
  227. scitex/io/utils/h5_to_zarr.py +616 -0
  228. scitex/linalg/_geometric_median.py +6 -2
  229. scitex/{gen/_tee.py → logging/_Tee.py} +43 -84
  230. scitex/logging/__init__.py +122 -0
  231. scitex/logging/_config.py +158 -0
  232. scitex/logging/_context.py +103 -0
  233. scitex/logging/_formatters.py +128 -0
  234. scitex/logging/_handlers.py +64 -0
  235. scitex/logging/_levels.py +35 -0
  236. scitex/logging/_logger.py +163 -0
  237. scitex/logging/_print_capture.py +95 -0
  238. scitex/ml/__init__.py +69 -0
  239. scitex/{ai/genai/anthropic.py → ml/_gen_ai/_Anthropic.py} +13 -19
  240. scitex/{ai/genai/base_genai.py → ml/_gen_ai/_BaseGenAI.py} +5 -5
  241. scitex/{ai/genai/deepseek.py → ml/_gen_ai/_DeepSeek.py} +11 -16
  242. scitex/{ai/genai/google.py → ml/_gen_ai/_Google.py} +7 -15
  243. scitex/{ai/genai/groq.py → ml/_gen_ai/_Groq.py} +1 -8
  244. scitex/{ai/genai/llama.py → ml/_gen_ai/_Llama.py} +3 -16
  245. scitex/{ai/genai/openai.py → ml/_gen_ai/_OpenAI.py} +3 -3
  246. scitex/{ai/genai/params.py → ml/_gen_ai/_PARAMS.py} +51 -65
  247. scitex/{ai/genai/perplexity.py → ml/_gen_ai/_Perplexity.py} +3 -14
  248. scitex/ml/_gen_ai/__init__.py +43 -0
  249. scitex/{ai/genai/calc_cost.py → ml/_gen_ai/_calc_cost.py} +1 -1
  250. scitex/{ai/genai/format_output_func.py → ml/_gen_ai/_format_output_func.py} +4 -4
  251. scitex/{ai/genai/genai_factory.py → ml/_gen_ai/_genai_factory.py} +8 -8
  252. scitex/ml/activation/__init__.py +8 -0
  253. scitex/ml/activation/_define.py +11 -0
  254. scitex/{ai/classifier_server.py → ml/classification/Classifier.py} +5 -5
  255. scitex/ml/classification/CrossValidationExperiment.py +374 -0
  256. scitex/ml/classification/__init__.py +46 -0
  257. scitex/ml/classification/reporters/_BaseClassificationReporter.py +281 -0
  258. scitex/ml/classification/reporters/_ClassificationReporter.py +773 -0
  259. scitex/ml/classification/reporters/_MultiClassificationReporter.py +406 -0
  260. scitex/ml/classification/reporters/_SingleClassificationReporter.py +1834 -0
  261. scitex/ml/classification/reporters/__init__.py +11 -0
  262. scitex/ml/classification/reporters/reporter_utils/_Plotter.py +1028 -0
  263. scitex/ml/classification/reporters/reporter_utils/__init__.py +80 -0
  264. scitex/ml/classification/reporters/reporter_utils/aggregation.py +457 -0
  265. scitex/ml/classification/reporters/reporter_utils/data_models.py +313 -0
  266. scitex/ml/classification/reporters/reporter_utils/reporting.py +1056 -0
  267. scitex/ml/classification/reporters/reporter_utils/storage.py +221 -0
  268. scitex/ml/classification/reporters/reporter_utils/validation.py +395 -0
  269. scitex/ml/classification/timeseries/_TimeSeriesBlockingSplit.py +568 -0
  270. scitex/ml/classification/timeseries/_TimeSeriesCalendarSplit.py +688 -0
  271. scitex/ml/classification/timeseries/_TimeSeriesMetadata.py +139 -0
  272. scitex/ml/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +1716 -0
  273. scitex/ml/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +1685 -0
  274. scitex/ml/classification/timeseries/_TimeSeriesStrategy.py +84 -0
  275. scitex/ml/classification/timeseries/_TimeSeriesStratifiedSplit.py +610 -0
  276. scitex/ml/classification/timeseries/__init__.py +39 -0
  277. scitex/ml/classification/timeseries/_normalize_timestamp.py +436 -0
  278. scitex/ml/clustering/__init__.py +11 -0
  279. scitex/ml/clustering/_pca.py +115 -0
  280. scitex/ml/clustering/_umap.py +376 -0
  281. scitex/ml/feature_extraction/__init__.py +56 -0
  282. scitex/ml/feature_extraction/vit.py +149 -0
  283. scitex/ml/feature_selection/__init__.py +30 -0
  284. scitex/ml/feature_selection/feature_selection.py +364 -0
  285. scitex/ml/loss/_L1L2Losses.py +34 -0
  286. scitex/ml/loss/__init__.py +12 -0
  287. scitex/ml/loss/multi_task_loss.py +47 -0
  288. scitex/ml/metrics/__init__.py +56 -0
  289. scitex/ml/metrics/_calc_bacc.py +61 -0
  290. scitex/ml/metrics/_calc_bacc_from_conf_mat.py +38 -0
  291. scitex/ml/metrics/_calc_clf_report.py +78 -0
  292. scitex/ml/metrics/_calc_conf_mat.py +93 -0
  293. scitex/ml/metrics/_calc_feature_importance.py +183 -0
  294. scitex/ml/metrics/_calc_mcc.py +61 -0
  295. scitex/ml/metrics/_calc_pre_rec_auc.py +116 -0
  296. scitex/ml/metrics/_calc_roc_auc.py +110 -0
  297. scitex/ml/metrics/_calc_seizure_prediction_metrics.py +490 -0
  298. scitex/ml/metrics/_calc_silhouette_score.py +503 -0
  299. scitex/ml/metrics/_normalize_labels.py +83 -0
  300. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/__init__.py +0 -0
  301. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/__init__.py +3 -0
  302. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger.py +207 -0
  303. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger2020.py +238 -0
  304. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger913A.py +215 -0
  305. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/rangerqh.py +184 -0
  306. scitex/ml/optim/Ranger_Deep_Learning_Optimizer/setup.py +24 -0
  307. scitex/ml/optim/__init__.py +13 -0
  308. scitex/ml/optim/_get_set.py +31 -0
  309. scitex/ml/optim/_optimizers.py +71 -0
  310. scitex/ml/plt/__init__.py +60 -0
  311. scitex/ml/plt/_plot_conf_mat.py +663 -0
  312. scitex/ml/plt/_plot_feature_importance.py +323 -0
  313. scitex/ml/plt/_plot_learning_curve.py +345 -0
  314. scitex/ml/plt/_plot_optuna_study.py +225 -0
  315. scitex/ml/plt/_plot_pre_rec_curve.py +290 -0
  316. scitex/ml/plt/_plot_roc_curve.py +255 -0
  317. scitex/ml/sk/__init__.py +11 -0
  318. scitex/ml/sk/_clf.py +58 -0
  319. scitex/ml/sk/_to_sktime.py +100 -0
  320. scitex/ml/sklearn/__init__.py +26 -0
  321. scitex/ml/sklearn/clf.py +58 -0
  322. scitex/ml/sklearn/to_sktime.py +100 -0
  323. scitex/{ai/training/early_stopping.py → ml/training/_EarlyStopping.py} +1 -2
  324. scitex/{ai → ml/training}/_LearningCurveLogger.py +198 -242
  325. scitex/ml/training/__init__.py +7 -0
  326. scitex/ml/utils/__init__.py +22 -0
  327. scitex/ml/utils/_check_params.py +50 -0
  328. scitex/ml/utils/_default_dataset.py +46 -0
  329. scitex/ml/utils/_format_samples_for_sktime.py +26 -0
  330. scitex/ml/utils/_label_encoder.py +134 -0
  331. scitex/ml/utils/_merge_labels.py +22 -0
  332. scitex/ml/utils/_sliding_window_data_augmentation.py +11 -0
  333. scitex/ml/utils/_under_sample.py +51 -0
  334. scitex/ml/utils/_verify_n_gpus.py +16 -0
  335. scitex/ml/utils/grid_search.py +148 -0
  336. scitex/nn/_BNet.py +15 -9
  337. scitex/nn/_Filters.py +2 -2
  338. scitex/nn/_ModulationIndex.py +2 -2
  339. scitex/nn/_PAC.py +1 -1
  340. scitex/nn/_Spectrogram.py +12 -3
  341. scitex/nn/__init__.py +9 -10
  342. scitex/path/__init__.py +18 -0
  343. scitex/path/_clean.py +4 -0
  344. scitex/path/_find.py +9 -4
  345. scitex/path/_symlink.py +348 -0
  346. scitex/path/_version.py +4 -3
  347. scitex/pd/__init__.py +2 -0
  348. scitex/pd/_get_unique.py +99 -0
  349. scitex/plt/__init__.py +114 -5
  350. scitex/plt/_subplots/_AxesWrapper.py +1 -3
  351. scitex/plt/_subplots/_AxisWrapper.py +7 -3
  352. scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +47 -13
  353. scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +160 -2
  354. scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +26 -4
  355. scitex/plt/_subplots/_AxisWrapperMixins/_UnitAwareMixin.py +322 -0
  356. scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +1 -0
  357. scitex/plt/_subplots/_FigWrapper.py +62 -6
  358. scitex/plt/_subplots/_export_as_csv.py +43 -27
  359. scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +5 -4
  360. scitex/plt/_subplots/_export_as_csv_formatters/_format_annotate.py +81 -0
  361. scitex/plt/_subplots/_export_as_csv_formatters/_format_bar.py +1 -3
  362. scitex/plt/_subplots/_export_as_csv_formatters/_format_barh.py +20 -5
  363. scitex/plt/_subplots/_export_as_csv_formatters/_format_boxplot.py +1 -3
  364. scitex/plt/_subplots/_export_as_csv_formatters/_format_contour.py +1 -3
  365. scitex/plt/_subplots/_export_as_csv_formatters/_format_errorbar.py +35 -18
  366. scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +1 -3
  367. scitex/plt/_subplots/_export_as_csv_formatters/_format_fill.py +1 -3
  368. scitex/plt/_subplots/_export_as_csv_formatters/_format_fill_between.py +1 -3
  369. scitex/plt/_subplots/_export_as_csv_formatters/_format_hist.py +1 -3
  370. scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow.py +1 -3
  371. scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +1 -3
  372. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +15 -3
  373. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_box.py +1 -3
  374. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_conf_mat.py +1 -3
  375. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_ecdf.py +1 -3
  376. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_fillv.py +1 -3
  377. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_heatmap.py +1 -3
  378. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_image.py +1 -3
  379. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_joyplot.py +1 -3
  380. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +1 -3
  381. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_line.py +1 -3
  382. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_ci.py +1 -3
  383. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_std.py +1 -3
  384. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_median_iqr.py +1 -3
  385. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_raster.py +1 -3
  386. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_rectangle.py +1 -3
  387. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter.py +35 -0
  388. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter_hist.py +1 -3
  389. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_shaded_line.py +1 -3
  390. scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_violin.py +1 -3
  391. scitex/plt/_subplots/_export_as_csv_formatters/_format_scatter.py +6 -4
  392. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_barplot.py +1 -3
  393. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_boxplot.py +1 -3
  394. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_heatmap.py +1 -3
  395. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_histplot.py +1 -3
  396. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +1 -3
  397. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_kdeplot.py +1 -3
  398. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +1 -3
  399. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +1 -3
  400. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_scatterplot.py +1 -3
  401. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_stripplot.py +1 -3
  402. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_swarmplot.py +1 -3
  403. scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_violinplot.py +1 -3
  404. scitex/plt/_subplots/_export_as_csv_formatters/_format_text.py +60 -0
  405. scitex/plt/_subplots/_export_as_csv_formatters/_format_violin.py +1 -3
  406. scitex/plt/_subplots/_export_as_csv_formatters/_format_violinplot.py +1 -3
  407. scitex/plt/_subplots/_export_as_csv_formatters/test_formatters.py +1 -3
  408. scitex/plt/_subplots/_export_as_csv_formatters.py +56 -59
  409. scitex/plt/ax/_style/_hide_spines.py +1 -3
  410. scitex/plt/ax/_style/_rotate_labels.py +180 -76
  411. scitex/plt/ax/_style/_rotate_labels_v01.py +248 -0
  412. scitex/plt/ax/_style/_set_meta.py +11 -4
  413. scitex/plt/ax/_style/_set_supxyt.py +3 -3
  414. scitex/plt/ax/_style/_set_xyt.py +3 -3
  415. scitex/plt/ax/_style/_share_axes.py +2 -2
  416. scitex/plt/color/__init__.py +4 -4
  417. scitex/plt/color/{_get_colors_from_cmap.py → _get_colors_from_conf_matap.py} +7 -7
  418. scitex/plt/utils/_configure_mpl.py +99 -86
  419. scitex/plt/utils/_histogram_utils.py +1 -3
  420. scitex/plt/utils/_is_valid_axis.py +1 -3
  421. scitex/plt/utils/_scitex_config.py +1 -0
  422. scitex/repro/__init__.py +75 -0
  423. scitex/{reproduce → repro}/_gen_ID.py +1 -1
  424. scitex/{reproduce → repro}/_gen_timestamp.py +1 -1
  425. scitex/repro_rng/_RandomStateManager.py +590 -0
  426. scitex/repro_rng/_RandomStateManager_v01-no-verbose-options.py +414 -0
  427. scitex/repro_rng/__init__.py +39 -0
  428. scitex/reproduce/__init__.py +25 -13
  429. scitex/reproduce/_hash_array.py +22 -0
  430. scitex/resource/_get_processor_usages.py +4 -4
  431. scitex/resource/_get_specs.py +2 -2
  432. scitex/resource/_log_processor_usages.py +2 -2
  433. scitex/rng/_RandomStateManager.py +590 -0
  434. scitex/rng/_RandomStateManager_v01-no-verbose-options.py +414 -0
  435. scitex/rng/__init__.py +39 -0
  436. scitex/scholar/__init__.py +309 -19
  437. scitex/scholar/__main__.py +319 -0
  438. scitex/scholar/auth/ScholarAuthManager.py +308 -0
  439. scitex/scholar/auth/__init__.py +12 -0
  440. scitex/scholar/auth/core/AuthenticationGateway.py +473 -0
  441. scitex/scholar/auth/core/BrowserAuthenticator.py +386 -0
  442. scitex/scholar/auth/core/StrategyResolver.py +309 -0
  443. scitex/scholar/auth/core/__init__.py +16 -0
  444. scitex/scholar/auth/gateway/_OpenURLLinkFinder.py +120 -0
  445. scitex/scholar/auth/gateway/_OpenURLResolver.py +209 -0
  446. scitex/scholar/auth/gateway/__init__.py +38 -0
  447. scitex/scholar/auth/gateway/_resolve_functions.py +101 -0
  448. scitex/scholar/auth/providers/BaseAuthenticator.py +166 -0
  449. scitex/scholar/auth/providers/EZProxyAuthenticator.py +484 -0
  450. scitex/scholar/auth/providers/OpenAthensAuthenticator.py +619 -0
  451. scitex/scholar/auth/providers/ShibbolethAuthenticator.py +686 -0
  452. scitex/scholar/auth/providers/__init__.py +18 -0
  453. scitex/scholar/auth/session/AuthCacheManager.py +189 -0
  454. scitex/scholar/auth/session/SessionManager.py +159 -0
  455. scitex/scholar/auth/session/__init__.py +11 -0
  456. scitex/scholar/auth/sso/BaseSSOAutomator.py +373 -0
  457. scitex/scholar/auth/sso/OpenAthensSSOAutomator.py +378 -0
  458. scitex/scholar/auth/sso/SSOAutomator.py +180 -0
  459. scitex/scholar/auth/sso/UniversityOfMelbourneSSOAutomator.py +380 -0
  460. scitex/scholar/auth/sso/__init__.py +15 -0
  461. scitex/scholar/browser/ScholarBrowserManager.py +705 -0
  462. scitex/scholar/browser/__init__.py +38 -0
  463. scitex/scholar/browser/utils/__init__.py +13 -0
  464. scitex/scholar/browser/utils/click_and_wait.py +205 -0
  465. scitex/scholar/browser/utils/close_unwanted_pages.py +140 -0
  466. scitex/scholar/browser/utils/wait_redirects.py +732 -0
  467. scitex/scholar/config/PublisherRules.py +132 -0
  468. scitex/scholar/config/ScholarConfig.py +126 -0
  469. scitex/scholar/config/__init__.py +17 -0
  470. scitex/scholar/core/Paper.py +627 -0
  471. scitex/scholar/core/Papers.py +722 -0
  472. scitex/scholar/core/Scholar.py +1975 -0
  473. scitex/scholar/core/__init__.py +9 -0
  474. scitex/scholar/impact_factor/ImpactFactorEngine.py +204 -0
  475. scitex/scholar/impact_factor/__init__.py +20 -0
  476. scitex/scholar/impact_factor/estimation/ImpactFactorEstimationEngine.py +0 -0
  477. scitex/scholar/impact_factor/estimation/__init__.py +40 -0
  478. scitex/scholar/impact_factor/estimation/build_database.py +0 -0
  479. scitex/scholar/impact_factor/estimation/core/__init__.py +28 -0
  480. scitex/scholar/impact_factor/estimation/core/cache_manager.py +523 -0
  481. scitex/scholar/impact_factor/estimation/core/calculator.py +355 -0
  482. scitex/scholar/impact_factor/estimation/core/journal_matcher.py +428 -0
  483. scitex/scholar/integration/__init__.py +59 -0
  484. scitex/scholar/integration/base.py +502 -0
  485. scitex/scholar/integration/mendeley/__init__.py +22 -0
  486. scitex/scholar/integration/mendeley/exporter.py +166 -0
  487. scitex/scholar/integration/mendeley/importer.py +236 -0
  488. scitex/scholar/integration/mendeley/linker.py +79 -0
  489. scitex/scholar/integration/mendeley/mapper.py +212 -0
  490. scitex/scholar/integration/zotero/__init__.py +27 -0
  491. scitex/scholar/integration/zotero/__main__.py +264 -0
  492. scitex/scholar/integration/zotero/exporter.py +351 -0
  493. scitex/scholar/integration/zotero/importer.py +372 -0
  494. scitex/scholar/integration/zotero/linker.py +415 -0
  495. scitex/scholar/integration/zotero/mapper.py +286 -0
  496. scitex/scholar/metadata_engines/ScholarEngine.py +588 -0
  497. scitex/scholar/metadata_engines/__init__.py +21 -0
  498. scitex/scholar/metadata_engines/individual/ArXivEngine.py +397 -0
  499. scitex/scholar/metadata_engines/individual/CrossRefEngine.py +274 -0
  500. scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +263 -0
  501. scitex/scholar/metadata_engines/individual/OpenAlexEngine.py +350 -0
  502. scitex/scholar/metadata_engines/individual/PubMedEngine.py +329 -0
  503. scitex/scholar/metadata_engines/individual/SemanticScholarEngine.py +438 -0
  504. scitex/scholar/metadata_engines/individual/URLDOIEngine.py +410 -0
  505. scitex/scholar/metadata_engines/individual/_BaseDOIEngine.py +487 -0
  506. scitex/scholar/metadata_engines/individual/__init__.py +7 -0
  507. scitex/scholar/metadata_engines/utils/_PubMedConverter.py +469 -0
  508. scitex/scholar/metadata_engines/utils/_URLDOIExtractor.py +283 -0
  509. scitex/scholar/metadata_engines/utils/__init__.py +30 -0
  510. scitex/scholar/metadata_engines/utils/_metadata2bibtex.py +103 -0
  511. scitex/scholar/metadata_engines/utils/_standardize_metadata.py +376 -0
  512. scitex/scholar/pdf_download/ScholarPDFDownloader.py +579 -0
  513. scitex/scholar/pdf_download/__init__.py +5 -0
  514. scitex/scholar/pdf_download/strategies/__init__.py +38 -0
  515. scitex/scholar/pdf_download/strategies/chrome_pdf_viewer.py +376 -0
  516. scitex/scholar/pdf_download/strategies/direct_download.py +131 -0
  517. scitex/scholar/pdf_download/strategies/manual_download_fallback.py +167 -0
  518. scitex/scholar/pdf_download/strategies/manual_download_utils.py +996 -0
  519. scitex/scholar/pdf_download/strategies/response_body.py +207 -0
  520. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +364 -0
  521. scitex/scholar/pipelines/ScholarPipelineParallel.py +478 -0
  522. scitex/scholar/pipelines/ScholarPipelineSingle.py +767 -0
  523. scitex/scholar/pipelines/__init__.py +49 -0
  524. scitex/scholar/storage/BibTeXHandler.py +1018 -0
  525. scitex/scholar/storage/PaperIO.py +468 -0
  526. scitex/scholar/storage/ScholarLibrary.py +182 -0
  527. scitex/scholar/storage/_DeduplicationManager.py +548 -0
  528. scitex/scholar/storage/_LibraryCacheManager.py +724 -0
  529. scitex/scholar/storage/_LibraryManager.py +1835 -0
  530. scitex/scholar/storage/__init__.py +28 -0
  531. scitex/scholar/url_finder/ScholarURLFinder.py +379 -0
  532. scitex/scholar/url_finder/__init__.py +7 -0
  533. scitex/scholar/url_finder/strategies/__init__.py +33 -0
  534. scitex/scholar/url_finder/strategies/find_pdf_urls_by_direct_links.py +261 -0
  535. scitex/scholar/url_finder/strategies/find_pdf_urls_by_dropdown.py +67 -0
  536. scitex/scholar/url_finder/strategies/find_pdf_urls_by_href.py +204 -0
  537. scitex/scholar/url_finder/strategies/find_pdf_urls_by_navigation.py +256 -0
  538. scitex/scholar/url_finder/strategies/find_pdf_urls_by_publisher_patterns.py +165 -0
  539. scitex/scholar/url_finder/strategies/find_pdf_urls_by_zotero_translators.py +163 -0
  540. scitex/scholar/url_finder/strategies/find_supplementary_urls_by_href.py +70 -0
  541. scitex/scholar/utils/__init__.py +22 -0
  542. scitex/scholar/utils/bibtex/__init__.py +9 -0
  543. scitex/scholar/utils/bibtex/_parse_bibtex.py +71 -0
  544. scitex/scholar/utils/cleanup/__init__.py +8 -0
  545. scitex/scholar/utils/cleanup/_cleanup_scholar_processes.py +96 -0
  546. scitex/scholar/utils/cleanup/cleanup_old_extractions.py +117 -0
  547. scitex/scholar/utils/text/_TextNormalizer.py +407 -0
  548. scitex/scholar/utils/text/__init__.py +9 -0
  549. scitex/scholar/zotero/__init__.py +38 -0
  550. scitex/session/__init__.py +51 -0
  551. scitex/session/_lifecycle.py +736 -0
  552. scitex/session/_manager.py +102 -0
  553. scitex/session/template.py +122 -0
  554. scitex/stats/__init__.py +30 -26
  555. scitex/stats/correct/__init__.py +21 -0
  556. scitex/stats/correct/_correct_bonferroni.py +551 -0
  557. scitex/stats/correct/_correct_fdr.py +634 -0
  558. scitex/stats/correct/_correct_holm.py +548 -0
  559. scitex/stats/correct/_correct_sidak.py +499 -0
  560. scitex/stats/descriptive/__init__.py +85 -0
  561. scitex/stats/descriptive/_circular.py +540 -0
  562. scitex/stats/descriptive/_describe.py +219 -0
  563. scitex/stats/descriptive/_nan.py +518 -0
  564. scitex/stats/descriptive/_real.py +189 -0
  565. scitex/stats/effect_sizes/__init__.py +41 -0
  566. scitex/stats/effect_sizes/_cliffs_delta.py +325 -0
  567. scitex/stats/effect_sizes/_cohens_d.py +342 -0
  568. scitex/stats/effect_sizes/_epsilon_squared.py +315 -0
  569. scitex/stats/effect_sizes/_eta_squared.py +302 -0
  570. scitex/stats/effect_sizes/_prob_superiority.py +296 -0
  571. scitex/stats/posthoc/__init__.py +19 -0
  572. scitex/stats/posthoc/_dunnett.py +463 -0
  573. scitex/stats/posthoc/_games_howell.py +383 -0
  574. scitex/stats/posthoc/_tukey_hsd.py +367 -0
  575. scitex/stats/power/__init__.py +19 -0
  576. scitex/stats/power/_power.py +433 -0
  577. scitex/stats/template.py +119 -0
  578. scitex/stats/utils/__init__.py +62 -0
  579. scitex/stats/utils/_effect_size.py +985 -0
  580. scitex/stats/utils/_formatters.py +270 -0
  581. scitex/stats/utils/_normalizers.py +927 -0
  582. scitex/stats/utils/_power.py +433 -0
  583. scitex/stats_v01/_EffectSizeCalculator.py +488 -0
  584. scitex/stats_v01/_StatisticalValidator.py +411 -0
  585. scitex/stats_v01/__init__.py +60 -0
  586. scitex/stats_v01/_additional_tests.py +415 -0
  587. scitex/{stats → stats_v01}/_p2stars.py +19 -5
  588. scitex/stats_v01/_two_sample_tests.py +141 -0
  589. scitex/stats_v01/desc/__init__.py +83 -0
  590. scitex/stats_v01/desc/_circular.py +540 -0
  591. scitex/stats_v01/desc/_describe.py +219 -0
  592. scitex/stats_v01/desc/_nan.py +518 -0
  593. scitex/{stats/desc/_nan.py → stats_v01/desc/_nan_v01-20250920_145731.py} +23 -12
  594. scitex/stats_v01/desc/_real.py +189 -0
  595. scitex/stats_v01/tests/__corr_test_optimized.py +221 -0
  596. scitex/stats_v01/tests/_corr_test_optimized.py +179 -0
  597. scitex/str/__init__.py +1 -3
  598. scitex/str/_clean_path.py +6 -2
  599. scitex/str/_latex_fallback.py +267 -160
  600. scitex/str/_parse.py +44 -36
  601. scitex/str/_printc.py +1 -3
  602. scitex/template/__init__.py +87 -0
  603. scitex/template/_create_project.py +267 -0
  604. scitex/template/create_pip_project.py +80 -0
  605. scitex/template/create_research.py +80 -0
  606. scitex/template/create_singularity.py +80 -0
  607. scitex/units.py +291 -0
  608. scitex/utils/_compress_hdf5.py +14 -3
  609. scitex/utils/_email.py +21 -2
  610. scitex/utils/_grid.py +6 -4
  611. scitex/utils/_notify.py +13 -10
  612. scitex/utils/_verify_scitex_format.py +589 -0
  613. scitex/utils/_verify_scitex_format_v01.py +370 -0
  614. scitex/utils/template.py +122 -0
  615. scitex/web/_search_pubmed.py +62 -16
  616. scitex-2.1.0.dist-info/LICENSE +21 -0
  617. scitex-2.1.0.dist-info/METADATA +677 -0
  618. scitex-2.1.0.dist-info/RECORD +919 -0
  619. {scitex-2.0.0.dist-info → scitex-2.1.0.dist-info}/WHEEL +1 -1
  620. scitex-2.1.0.dist-info/entry_points.txt +3 -0
  621. scitex/ai/__Classifiers.py +0 -101
  622. scitex/ai/classification/classification_reporter.py +0 -1137
  623. scitex/ai/classification/classifiers.py +0 -101
  624. scitex/ai/classification_reporter.py +0 -1161
  625. scitex/ai/genai/__init__.py +0 -277
  626. scitex/ai/genai/anthropic_provider.py +0 -320
  627. scitex/ai/genai/anthropic_refactored.py +0 -109
  628. scitex/ai/genai/auth_manager.py +0 -200
  629. scitex/ai/genai/base_provider.py +0 -291
  630. scitex/ai/genai/chat_history.py +0 -307
  631. scitex/ai/genai/cost_tracker.py +0 -276
  632. scitex/ai/genai/deepseek_provider.py +0 -251
  633. scitex/ai/genai/google_provider.py +0 -228
  634. scitex/ai/genai/groq_provider.py +0 -248
  635. scitex/ai/genai/image_processor.py +0 -250
  636. scitex/ai/genai/llama_provider.py +0 -214
  637. scitex/ai/genai/mock_provider.py +0 -127
  638. scitex/ai/genai/model_registry.py +0 -304
  639. scitex/ai/genai/openai_provider.py +0 -293
  640. scitex/ai/genai/perplexity_provider.py +0 -205
  641. scitex/ai/genai/provider_base.py +0 -302
  642. scitex/ai/genai/provider_factory.py +0 -370
  643. scitex/ai/genai/response_handler.py +0 -235
  644. scitex/ai/layer/_Pass.py +0 -21
  645. scitex/ai/layer/__init__.py +0 -10
  646. scitex/ai/layer/_switch.py +0 -8
  647. scitex/ai/metrics/_bACC.py +0 -51
  648. scitex/ai/plt/_learning_curve.py +0 -194
  649. scitex/ai/plt/_optuna_study.py +0 -111
  650. scitex/ai/plt/aucs/__init__.py +0 -2
  651. scitex/ai/plt/aucs/example.py +0 -60
  652. scitex/ai/plt/aucs/pre_rec_auc.py +0 -223
  653. scitex/ai/plt/aucs/roc_auc.py +0 -246
  654. scitex/ai/sampling/undersample.py +0 -29
  655. scitex/db/_SQLite3.py +0 -2136
  656. scitex/db/_SQLite3Mixins/_BlobMixin.py +0 -229
  657. scitex/gen/_close.py +0 -222
  658. scitex/gen/_start.py +0 -451
  659. scitex/general/__init__.py +0 -5
  660. scitex/io/_load_modules/_db.py +0 -24
  661. scitex/life/__init__.py +0 -10
  662. scitex/life/_monitor_rain.py +0 -49
  663. scitex/reproduce/_fix_seeds.py +0 -45
  664. scitex/res/__init__.py +0 -5
  665. scitex/scholar/_local_search.py +0 -454
  666. scitex/scholar/_paper.py +0 -244
  667. scitex/scholar/_pdf_downloader.py +0 -325
  668. scitex/scholar/_search.py +0 -393
  669. scitex/scholar/_vector_search.py +0 -370
  670. scitex/scholar/_web_sources.py +0 -457
  671. scitex/stats/desc/__init__.py +0 -40
  672. scitex-2.0.0.dist-info/METADATA +0 -307
  673. scitex-2.0.0.dist-info/RECORD +0 -572
  674. scitex-2.0.0.dist-info/licenses/LICENSE +0 -7
  675. /scitex/ai/{act → activation}/__init__.py +0 -0
  676. /scitex/ai/{act → activation}/_define.py +0 -0
  677. /scitex/ai/{early_stopping.py → training/_EarlyStopping.py} +0 -0
  678. /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_ImportExportMixin.py +0 -0
  679. /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_IndexMixin.py +0 -0
  680. /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_RowMixin.py +0 -0
  681. /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_TableMixin.py +0 -0
  682. /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/__init__.py +0 -0
  683. /scitex/{stats → stats_v01}/_calc_partial_corr.py +0 -0
  684. /scitex/{stats → stats_v01}/_corr_test_multi.py +0 -0
  685. /scitex/{stats → stats_v01}/_corr_test_wrapper.py +0 -0
  686. /scitex/{stats → stats_v01}/_describe_wrapper.py +0 -0
  687. /scitex/{stats → stats_v01}/_multiple_corrections.py +0 -0
  688. /scitex/{stats → stats_v01}/_nan_stats.py +0 -0
  689. /scitex/{stats → stats_v01}/_p2stars_wrapper.py +0 -0
  690. /scitex/{stats → stats_v01}/_statistical_tests.py +0 -0
  691. /scitex/{stats/desc/_describe.py → stats_v01/desc/_describe_v01-20250920_145731.py} +0 -0
  692. /scitex/{stats/desc/_real.py → stats_v01/desc/_real_v01-20250920_145731.py} +0 -0
  693. /scitex/{stats → stats_v01}/multiple/__init__.py +0 -0
  694. /scitex/{stats → stats_v01}/multiple/_bonferroni_correction.py +0 -0
  695. /scitex/{stats → stats_v01}/multiple/_fdr_correction.py +0 -0
  696. /scitex/{stats → stats_v01}/multiple/_multicompair.py +0 -0
  697. /scitex/{stats → stats_v01}/tests/__corr_test.py +0 -0
  698. /scitex/{stats → stats_v01}/tests/__corr_test_multi.py +0 -0
  699. /scitex/{stats → stats_v01}/tests/__corr_test_single.py +0 -0
  700. /scitex/{stats → stats_v01}/tests/__init__.py +0 -0
  701. /scitex/{stats → stats_v01}/tests/_brunner_munzel_test.py +0 -0
  702. /scitex/{stats → stats_v01}/tests/_nocorrelation_test.py +0 -0
  703. /scitex/{stats → stats_v01}/tests/_smirnov_grubbs.py +0 -0
  704. {scitex-2.0.0.dist-info → scitex-2.1.0.dist-info}/top_level.txt +0 -0
@@ -1,31 +1,886 @@
1
1
  #!/usr/bin/env python3
2
2
  # -*- coding: utf-8 -*-
3
- # Time-stamp: "2024-11-14 07:55:46 (ywatanabe)"
4
- # File: ./scitex_repo/src/scitex/io/_load_modules/_pdf.py
3
+ # Timestamp: "2025-10-06 10:27:52 (ywatanabe)"
4
+ # File: /home/ywatanabe/proj/scitex_repo/src/scitex/io/_load_modules/_pdf.py
5
+ # ----------------------------------------
6
+ from __future__ import annotations
7
+ import os
8
+ __FILE__ = __file__
9
+ __DIR__ = os.path.dirname(__FILE__)
10
+ # ----------------------------------------
11
+
12
+ """
13
+ Enhanced PDF loading module with comprehensive extraction capabilities.
14
+
15
+ This module provides advanced PDF extraction for scientific papers, including:
16
+ - Text extraction with formatting preservation
17
+ - Table extraction as pandas DataFrames
18
+ - Image extraction with metadata
19
+ - Section-aware text parsing
20
+ - Multiple extraction modes for different use cases
21
+ """
22
+
23
+ import hashlib
24
+ import re
25
+ import tempfile
26
+ from typing import Any, Dict, List
27
+
28
+ from scitex import logging
29
+ from scitex.dict import DotDict
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ # Try to import PDF libraries in order of preference
34
+ try:
35
+ import fitz # PyMuPDF - preferred for text and images
36
+
37
+ FITZ_AVAILABLE = True
38
+ except ImportError:
39
+ FITZ_AVAILABLE = False
40
+
41
+ try:
42
+ import pdfplumber # Best for table extraction
43
+
44
+ PDFPLUMBER_AVAILABLE = True
45
+ except ImportError:
46
+ PDFPLUMBER_AVAILABLE = False
5
47
 
6
48
  try:
7
- import PyPDF2
49
+ import PyPDF2 # Fallback option
50
+
51
+ PYPDF2_AVAILABLE = True
8
52
  except ImportError:
9
- PyPDF2 = None
53
+ PYPDF2_AVAILABLE = False
54
+
55
+ try:
56
+ import pandas as pd
57
+
58
+ PANDAS_AVAILABLE = True
59
+ except ImportError:
60
+ PANDAS_AVAILABLE = False
61
+
62
+
63
+ def _load_pdf(lpath: str, mode: str = "full", **kwargs) -> Any:
64
+ """
65
+ Load PDF file with comprehensive extraction capabilities.
66
+
67
+ Args:
68
+ lpath: Path to PDF file
69
+ mode: Extraction mode (default: 'full')
70
+ - 'full': Complete extraction including text, sections, metadata, pages, tables, and images
71
+ - 'scientific': Optimized for scientific papers (text + sections + tables + images + stats)
72
+ - 'text': Plain text extraction only
73
+ - 'sections': Section-aware text extraction
74
+ - 'tables': Extract tables as DataFrames
75
+ - 'images': Extract images with metadata
76
+ - 'metadata': PDF metadata only
77
+ - 'pages': Page-by-page extraction
78
+ **kwargs: Additional arguments
79
+ - backend: 'auto' (default), 'fitz', 'pdfplumber', or 'pypdf2'
80
+ - clean_text: Clean extracted text (default: True)
81
+ - extract_images: Extract images to files (default: False for 'full' mode, True for 'scientific')
82
+ - output_dir: Directory for extracted images/tables (default: temp dir)
83
+ - save_as_jpg: Convert all extracted images to JPG format (default: True)
84
+ - table_settings: Dict of pdfplumber table extraction settings
85
+
86
+ Returns:
87
+ Extracted content based on mode:
88
+ - 'text': str
89
+ - 'sections': Dict[str, str]
90
+ - 'tables': Dict[int, List[pd.DataFrame]]
91
+ - 'images': List[Dict] with image metadata
92
+ - 'metadata': Dict with PDF metadata
93
+ - 'pages': List[Dict] with page content
94
+ - 'full': Dict with comprehensive extraction (text, sections, metadata, pages, tables, images, stats)
95
+ - 'scientific': Dict with scientific paper extraction (text, sections, metadata, tables, images, stats)
96
+
97
+ Examples:
98
+ >>> import scitex.io as stx
99
+
100
+ >>> # Full extraction (default) - everything included
101
+ >>> data = stx.load("paper.pdf")
102
+ >>> print(data['full_text']) # Complete text
103
+ >>> print(data['sections']) # Parsed sections
104
+ >>> print(data['tables']) # All tables as DataFrames
105
+ >>> print(data['metadata']) # PDF metadata
106
+ >>> print(data['pages']) # Page-by-page content
107
+ >>> print(data['stats']) # Statistics
108
+
109
+ >>> # Scientific mode (recommended for papers) - optimized for research
110
+ >>> paper = stx.load("paper.pdf", mode="scientific")
111
+ >>> print(paper['text']) # Full text
112
+ >>> print(paper['sections']) # Sections (Abstract, Methods, etc.)
113
+ >>> print(paper['tables']) # All tables as DataFrames
114
+ >>> print(paper['images']) # Image metadata
115
+ >>> print(paper['stats']) # Content statistics
116
+
117
+ >>> # Simple text extraction only
118
+ >>> text = stx.load("paper.pdf", mode="text")
119
+
120
+ >>> # Extract tables only
121
+ >>> tables = stx.load("paper.pdf", mode="tables")
122
+ """
123
+ mode = kwargs.get("mode", mode)
124
+ backend = kwargs.get("backend", "auto")
125
+ clean_text = kwargs.get("clean_text", True)
126
+ extract_images = kwargs.get("extract_images", False)
127
+ output_dir = kwargs.get("output_dir", None)
128
+ table_settings = kwargs.get("table_settings", {})
129
+
130
+ # Validate file exists
131
+ if not os.path.exists(lpath):
132
+ raise FileNotFoundError(f"PDF file not found: {lpath}")
133
+
134
+ # Extension validation removed - handled by load() function
135
+ # This allows loading files without extensions when ext='pdf' is specified
136
+
137
+ # Select backend based on mode and availability
138
+ backend = _select_backend(mode, backend)
139
+
140
+ # Create output directory if needed
141
+ if output_dir is None and (
142
+ extract_images or mode in ["images", "scientific", "full"]
143
+ ):
144
+ output_dir = tempfile.mkdtemp(prefix="pdf_extract_")
145
+ logger.debug(f"Using temporary directory: {output_dir}")
146
+
147
+ # Extract based on mode
148
+ if mode == "text":
149
+ return _extract_text(lpath, backend, clean_text)
150
+ elif mode == "sections":
151
+ return _extract_sections(lpath, backend, clean_text)
152
+ elif mode == "tables":
153
+ return _extract_tables(lpath, table_settings)
154
+ elif mode == "images":
155
+ save_as_jpg = kwargs.get("save_as_jpg", True)
156
+ return _extract_images(lpath, output_dir, save_as_jpg)
157
+ elif mode == "metadata":
158
+ return _extract_metadata(lpath, backend)
159
+ elif mode == "pages":
160
+ return _extract_pages(lpath, backend, clean_text)
161
+ elif mode == "scientific":
162
+ save_as_jpg = kwargs.get("save_as_jpg", True)
163
+ return _extract_scientific(
164
+ lpath, clean_text, output_dir, table_settings, save_as_jpg
165
+ )
166
+ elif mode == "full":
167
+ save_as_jpg = kwargs.get("save_as_jpg", True)
168
+ return _extract_full(
169
+ lpath,
170
+ backend,
171
+ clean_text,
172
+ extract_images,
173
+ output_dir,
174
+ table_settings,
175
+ save_as_jpg,
176
+ )
177
+ else:
178
+ raise ValueError(f"Unknown extraction mode: {mode}")
179
+
180
+
181
+ def _select_backend(mode: str, requested: str) -> str:
182
+ """Select appropriate backend based on mode and availability."""
183
+ if requested != "auto":
184
+ return requested
185
+
186
+ # Mode-specific backend selection
187
+ if mode in ["tables"]:
188
+ if PDFPLUMBER_AVAILABLE:
189
+ return "pdfplumber"
190
+ else:
191
+ logger.warning(
192
+ "pdfplumber not available for table extraction. Install with: pip install pdfplumber"
193
+ )
194
+ return "fitz" if FITZ_AVAILABLE else "pypdf2"
195
+
196
+ elif mode in ["images", "scientific", "full"]:
197
+ if FITZ_AVAILABLE:
198
+ return "fitz"
199
+ else:
200
+ logger.warning(
201
+ "PyMuPDF (fitz) recommended for image extraction. Install with: pip install PyMuPDF"
202
+ )
203
+ return "pdfplumber" if PDFPLUMBER_AVAILABLE else "pypdf2"
204
+
205
+ else: # text, sections, metadata, pages
206
+ if FITZ_AVAILABLE:
207
+ return "fitz"
208
+ elif PDFPLUMBER_AVAILABLE:
209
+ return "pdfplumber"
210
+ elif PYPDF2_AVAILABLE:
211
+ return "pypdf2"
212
+ else:
213
+ raise ImportError(
214
+ "No PDF library available. Install one of:\n"
215
+ " pip install PyMuPDF # Recommended\n"
216
+ " pip install pdfplumber # Best for tables\n"
217
+ " pip install PyPDF2 # Basic fallback"
218
+ )
219
+
220
+
221
def _extract_text(lpath: str, backend: str, clean: bool) -> str:
    """
    Extract plain text from a PDF with the given backend.

    "fitz" and "pdfplumber" use their dedicated extractors; any other
    backend value falls through to the PyPDF2 extractor.
    """
    extractor = (
        _extract_text_fitz
        if backend == "fitz"
        else _extract_text_pdfplumber
        if backend == "pdfplumber"
        else _extract_text_pypdf2
    )
    return extractor(lpath, clean)
229
+
230
+
231
def _extract_text_fitz(lpath: str, clean: bool) -> str:
    """
    Extract text using PyMuPDF.

    Args:
        lpath: Path to the PDF file.
        clean: If True, pass the joined text through ``_clean_pdf_text``.

    Returns:
        Text of all pages that contain non-whitespace text, joined by
        newlines.

    Raises:
        ImportError: If PyMuPDF is not installed.
        Exception: Any error raised while opening or reading the document is
            logged and re-raised.
    """
    if not FITZ_AVAILABLE:
        raise ImportError("PyMuPDF (fitz) not available")

    try:
        doc = fitz.open(lpath)
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document even if a page fails to extract.
            doc.close()

        # Skip pages that hold only whitespace.
        full_text = "\n".join(text for text in page_texts if text.strip())

        if clean:
            full_text = _clean_pdf_text(full_text)

        return full_text

    except Exception as e:
        logger.error(f"Error extracting text with fitz from {lpath}: {e}")
        raise
257
+
258
+
259
def _extract_text_pdfplumber(lpath: str, clean: bool) -> str:
    """
    Extract text using pdfplumber.

    Pages for which pdfplumber returns no text are skipped; the remaining
    page texts are joined with newlines and optionally cleaned. Errors are
    logged and re-raised.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    if not PDFPLUMBER_AVAILABLE:
        raise ImportError("pdfplumber not available")

    try:
        import pdfplumber

        with pdfplumber.open(lpath) as pdf:
            extracted = [page.extract_text() for page in pdf.pages]

        joined = "\n".join(chunk for chunk in extracted if chunk)
        return _clean_pdf_text(joined) if clean else joined

    except Exception as e:
        logger.error(
            f"Error extracting text with pdfplumber from {lpath}: {e}"
        )
        raise
286
+
287
+
288
def _extract_text_pypdf2(lpath: str, clean: bool) -> str:
    """
    Extract text using PyPDF2.

    Pages whose text is empty or whitespace-only are skipped; the remaining
    page texts are joined with newlines and optionally cleaned. Errors are
    logged and re-raised.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    if not PYPDF2_AVAILABLE:
        raise ImportError("PyPDF2 not available")

    try:
        reader = PyPDF2.PdfReader(lpath)

        kept = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text.strip():
                kept.append(page_text)

        joined = "\n".join(kept)
        return _clean_pdf_text(joined) if clean else joined

    except Exception as e:
        logger.error(f"Error extracting text with PyPDF2 from {lpath}: {e}")
        raise
313
+
314
+
315
def _extract_tables(
    lpath: str, table_settings: Dict = None
) -> Dict[int, List["pd.DataFrame"]]:
    """
    Extract tables from PDF as pandas DataFrames.

    Args:
        lpath: Path to the PDF file.
        table_settings: pdfplumber table-extraction settings, e.g.
            ``{"vertical_strategy": "lines"}``. ``None`` or ``{}`` uses the
            pdfplumber defaults.

    Returns:
        Dict mapping page indices to the list of non-empty DataFrames found
        on that page. Keys are zero-based (they come from ``enumerate``);
        pages without tables are omitted.

    Raises:
        ImportError: If pdfplumber or pandas is not installed.
        Exception: Any extraction error is logged and re-raised.
    """
    if not PDFPLUMBER_AVAILABLE:
        raise ImportError(
            "pdfplumber required for table extraction. Install with:\n"
            " pip install pdfplumber pandas"
        )

    if not PANDAS_AVAILABLE:
        raise ImportError("pandas required for table extraction")

    import pandas as pd
    import pdfplumber

    table_settings = table_settings or {}

    # Earlier versions expanded the dict as keyword arguments, so the only
    # non-empty form that worked was {"table_settings": {...}}. Keep accepting
    # that wrapper by unwrapping it.
    if set(table_settings) == {"table_settings"}:
        table_settings = table_settings["table_settings"] or {}

    tables_dict = {}

    try:
        with pdfplumber.open(lpath) as pdf:
            for page_num, page in enumerate(pdf.pages):
                # pdfplumber takes the settings as ONE dict argument, not as
                # keyword arguments.
                tables = page.extract_tables(table_settings) or []

                dfs = []
                for table in tables:
                    if not table:
                        continue

                    # Treat the first row as the header when there is at
                    # least one data row and every non-empty cell in the
                    # first row is a string.
                    has_header = len(table) > 1 and all(
                        isinstance(cell, str) for cell in table[0] if cell
                    )
                    if has_header:
                        df = pd.DataFrame(table[1:], columns=table[0])
                    else:
                        df = pd.DataFrame(table)

                    # Blank cells become missing values, then fully empty
                    # columns and rows are dropped.
                    df = (
                        df.replace("", None)
                        .dropna(how="all", axis=1)
                        .dropna(how="all", axis=0)
                    )

                    if not df.empty:
                        dfs.append(df)

                if dfs:
                    tables_dict[page_num] = dfs

        logger.info(f"Extracted tables from {len(tables_dict)} pages")
        return tables_dict

    except Exception as e:
        logger.error(f"Error extracting tables: {e}")
        raise
379
+
380
+
381
def _extract_images(
    lpath: str, output_dir: str = None, save_as_jpg: bool = True
) -> List[Dict[str, Any]]:
    """
    Extract images from PDF with metadata.

    Args:
        lpath: Path to PDF file
        output_dir: Directory to save images (optional). When falsy, only
            metadata is collected and nothing is written to disk.
        save_as_jpg: If True, convert non-JPEG images to JPG with Pillow
            (default: True). If Pillow is missing, the image is written in
            its original format instead.

    Returns:
        List of dicts, one per image, with keys "page" (1-based), "index",
        "width", "height", "colorspace", "bpc", "original_ext" and
        "size_bytes"; when ``output_dir`` is given also "ext", "filepath"
        and "filename".

    Raises:
        ImportError: If PyMuPDF is not installed.
        Exception: Any extraction error is logged and re-raised.
    """
    if not FITZ_AVAILABLE:
        raise ImportError(
            "PyMuPDF (fitz) required for image extraction. Install with:\n"
            " pip install PyMuPDF"
        )

    images_info = []

    try:
        doc = fitz.open(lpath)
        try:
            for page_num, page in enumerate(doc):
                for img_index, img in enumerate(page.get_images()):
                    # The first element of each image entry is its xref.
                    xref = img[0]

                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    original_ext = base_image["ext"]

                    image_info = {
                        "page": page_num + 1,
                        "index": img_index,
                        "width": base_image["width"],
                        "height": base_image["height"],
                        "colorspace": base_image["colorspace"],
                        "bpc": base_image["bpc"],  # bits per component
                        "original_ext": original_ext,
                        "size_bytes": len(image_bytes),
                    }

                    if output_dir:
                        os.makedirs(output_dir, exist_ok=True)

                        if save_as_jpg and original_ext not in ["jpg", "jpeg"]:
                            try:
                                from PIL import Image
                                import io

                                img_pil = Image.open(io.BytesIO(image_bytes))

                                # JPEG has no alpha channel or palette:
                                # flatten such images onto a white background.
                                if img_pil.mode in ('RGBA', 'LA', 'P'):
                                    background = Image.new('RGB', img_pil.size, (255, 255, 255))
                                    if img_pil.mode == 'P':
                                        img_pil = img_pil.convert('RGBA')
                                    background.paste(
                                        img_pil,
                                        mask=img_pil.split()[-1] if img_pil.mode == 'RGBA' else None,
                                    )
                                    img_pil = background
                                elif img_pil.mode != 'RGB':
                                    img_pil = img_pil.convert('RGB')

                                filename = f"page_{page_num + 1}_img_{img_index}.jpg"
                                filepath = os.path.join(output_dir, filename)
                                img_pil.save(filepath, 'JPEG', quality=95)

                                image_info["ext"] = "jpg"
                            except ImportError:
                                logger.warning("PIL not available for image conversion. Install with: pip install Pillow")
                                # Fall back to writing the raw bytes in the
                                # original format.
                                filename = f"page_{page_num + 1}_img_{img_index}.{original_ext}"
                                filepath = os.path.join(output_dir, filename)
                                with open(filepath, "wb") as img_file:
                                    img_file.write(image_bytes)
                                image_info["ext"] = original_ext
                        else:
                            # Keep the original bytes; normalise the "jpeg"
                            # extension to "jpg".
                            ext = "jpg" if original_ext == "jpeg" else original_ext
                            filename = f"page_{page_num + 1}_img_{img_index}.{ext}"
                            filepath = os.path.join(output_dir, filename)
                            with open(filepath, "wb") as img_file:
                                img_file.write(image_bytes)
                            image_info["ext"] = ext

                        image_info["filepath"] = filepath
                        image_info["filename"] = filename

                    images_info.append(image_info)
        finally:
            # Release the document even when extraction or saving fails.
            doc.close()

        logger.info(f"Extracted {len(images_info)} images from PDF")
        return images_info

    except Exception as e:
        logger.error(f"Error extracting images: {e}")
        raise
488
+
489
+
490
def _extract_sections(lpath: str, backend: str, clean: bool) -> Dict[str, str]:
    """
    Extract text organized by sections.

    The raw (uncleaned) text is parsed into sections first; cleaning, when
    requested, is applied to each section body afterwards.
    """
    raw_text = _extract_text(lpath, backend, clean=False)
    parsed = _parse_sections(raw_text)

    if not clean:
        return parsed

    for name in parsed:
        parsed[name] = _clean_pdf_text(parsed[name])
    return parsed
504
+
505
+
506
+ def _parse_sections(text: str) -> Dict[str, str]:
507
+ """
508
+ Parse text into sections based on IMRaD structure.
509
+
510
+ Follows the standard scientific paper structure:
511
+ - frontpage: Title, authors, affiliations, keywords
512
+ - abstract: Paper summary
513
+ - introduction: Background and motivation
514
+ - methods: Methodology (materials and methods, experimental design)
515
+ - results: Findings
516
+ - discussion: Interpretation and implications
517
+ - references: Citations
518
+ """
519
+ sections = {}
520
+ current_section = "frontpage"
521
+ current_text = []
522
+
523
+ # Simplified section patterns - IMRaD + frontpage only
524
+ # Only match standalone section headers (exact matches)
525
+ section_patterns = [
526
+ r"^abstract\s*$",
527
+ r"^summary\s*$",
528
+ r"^introduction\s*$",
529
+ r"^background\s*$",
530
+ r"^methods?\s*$",
531
+ r"^materials?\s+and\s+methods?\s*$",
532
+ r"^methodology\s*$",
533
+ r"^results?\s*$",
534
+ r"^discussion\s*$",
535
+ r"^references?\s*$",
536
+ ]
537
+
538
+ lines = text.split("\n")
539
+
540
+ for line in lines:
541
+ line_lower = line.lower().strip()
542
+ line_stripped = line.strip()
543
+
544
+ # Check if this line is a section header
545
+ is_header = False
546
+ for pattern in section_patterns:
547
+ if re.match(pattern, line_lower):
548
+ # Additional validation: header lines should be short (< 50 chars)
549
+ # and not contain numbers/punctuation (except spaces)
550
+ if len(line_stripped) < 50:
551
+ # Save previous section
552
+ if current_text:
553
+ sections[current_section] = "\n".join(current_text).strip()
554
+
555
+ # Start new section
556
+ current_section = line_lower.strip()
557
+ current_text = []
558
+ is_header = True
559
+ break
560
+
561
+ if not is_header:
562
+ current_text.append(line)
563
+
564
+ # Save last section
565
+ if current_text:
566
+ sections[current_section] = "\n".join(current_text).strip()
567
+
568
+ return sections
569
+
570
+
571
def _extract_metadata(lpath: str, backend: str) -> Dict[str, Any]:
    """
    Extract PDF metadata.

    File-level information (path, name, size, backend, MD5 hash) is always
    returned. Document-level fields are added on a best-effort basis: a
    backend failure is logged and the fields read so far are kept.

    Args:
        lpath: Path to the PDF file.
        backend: "fitz", "pdfplumber" or "pypdf2"; any other value (or an
            uninstalled backend) yields only the file-level information.

    Returns:
        Dict with the metadata described above.
    """
    metadata = {
        "file_path": lpath,
        "file_name": os.path.basename(lpath),
        "file_size": os.path.getsize(lpath),
        "backend": backend,
    }

    if backend == "fitz" and FITZ_AVAILABLE:
        try:
            doc = fitz.open(lpath)
            try:
                # The metadata attribute may be None (e.g. for a document
                # that still needs a password); fall back to an empty dict
                # so page count and encryption flag are still recorded.
                pdf_metadata = doc.metadata or {}

                metadata.update(
                    {
                        "title": pdf_metadata.get("title", ""),
                        "author": pdf_metadata.get("author", ""),
                        "subject": pdf_metadata.get("subject", ""),
                        "keywords": pdf_metadata.get("keywords", ""),
                        "creator": pdf_metadata.get("creator", ""),
                        "producer": pdf_metadata.get("producer", ""),
                        "creation_date": str(pdf_metadata.get("creationDate", "")),
                        "modification_date": str(pdf_metadata.get("modDate", "")),
                        "pages": len(doc),
                        "encrypted": doc.is_encrypted,
                    }
                )
            finally:
                # Release the document even if reading a field fails.
                doc.close()

        except Exception as e:
            logger.error(f"Error extracting metadata with fitz: {e}")

    elif backend == "pdfplumber" and PDFPLUMBER_AVAILABLE:
        try:
            import pdfplumber

            with pdfplumber.open(lpath) as pdf:
                metadata["pages"] = len(pdf.pages)
                if hasattr(pdf, "metadata"):
                    # Keys are taken over exactly as pdfplumber reports them.
                    metadata.update(pdf.metadata)
        except Exception as e:
            logger.error(f"Error extracting metadata with pdfplumber: {e}")

    elif backend == "pypdf2" and PYPDF2_AVAILABLE:
        try:
            reader = PyPDF2.PdfReader(lpath)

            if reader.metadata:
                metadata.update(
                    {
                        "title": reader.metadata.get("/Title", ""),
                        "author": reader.metadata.get("/Author", ""),
                        "subject": reader.metadata.get("/Subject", ""),
                        "creator": reader.metadata.get("/Creator", ""),
                        "producer": reader.metadata.get("/Producer", ""),
                        "creation_date": str(
                            reader.metadata.get("/CreationDate", "")
                        ),
                        "modification_date": str(
                            reader.metadata.get("/ModDate", "")
                        ),
                    }
                )

            metadata["pages"] = len(reader.pages)
            metadata["encrypted"] = reader.is_encrypted

        except Exception as e:
            logger.error(f"Error extracting metadata with PyPDF2: {e}")

    # Generate file hash
    metadata["md5_hash"] = _calculate_file_hash(lpath)

    return metadata
647
+
648
+
649
+ def _extract_pages(
650
+ lpath: str, backend: str, clean: bool
651
+ ) -> List[Dict[str, Any]]:
652
+ """Extract content page by page."""
653
+ pages = []
654
+
655
+ if backend == "fitz" and FITZ_AVAILABLE:
656
+ doc = fitz.open(lpath)
657
+
658
+ for page_num, page in enumerate(doc):
659
+ text = page.get_text()
660
+ if clean:
661
+ text = _clean_pdf_text(text)
662
+
663
+ pages.append(
664
+ {
665
+ "page_number": page_num + 1,
666
+ "text": text,
667
+ "char_count": len(text),
668
+ "word_count": len(text.split()),
669
+ }
670
+ )
671
+
672
+ doc.close()
673
+
674
+ elif backend == "pdfplumber" and PDFPLUMBER_AVAILABLE:
675
+ import pdfplumber
676
+
677
+ with pdfplumber.open(lpath) as pdf:
678
+ for page_num, page in enumerate(pdf.pages):
679
+ text = page.extract_text() or ""
680
+ if clean:
681
+ text = _clean_pdf_text(text)
682
+
683
+ pages.append(
684
+ {
685
+ "page_number": page_num + 1,
686
+ "text": text,
687
+ "char_count": len(text),
688
+ "word_count": len(text.split()),
689
+ }
690
+ )
691
+
692
+ elif backend == "pypdf2" and PYPDF2_AVAILABLE:
21
693
  reader = PyPDF2.PdfReader(lpath)
22
- full_text = []
694
+
23
695
  for page_num in range(len(reader.pages)):
24
696
  page = reader.pages[page_num]
25
- full_text.append(page.extract_text())
26
- return "\n".join(full_text)
27
- except (ValueError, FileNotFoundError, PyPDF2.PdfReadError) as e:
28
- raise ValueError(f"Error loading PDF {lpath}: {str(e)}")
697
+ text = page.extract_text()
698
+ if clean:
699
+ text = _clean_pdf_text(text)
700
+
701
+ pages.append(
702
+ {
703
+ "page_number": page_num + 1,
704
+ "text": text,
705
+ "char_count": len(text),
706
+ "word_count": len(text.split()),
707
+ }
708
+ )
709
+
710
+ return pages
711
+
712
+
713
def _extract_scientific(
    lpath: str, clean_text: bool, output_dir: str, table_settings: Dict, save_as_jpg: bool = True
) -> DotDict:
    """
    Extraction preset for scientific papers.

    Collects text, sections, metadata, tables, images and summary statistics
    into one structure. Table and image extraction are optional: if the
    required library is missing or extraction fails, the corresponding entry
    is left empty. Any other failure is logged and recorded under "error",
    and whatever was gathered up to that point is still returned.

    Returns:
        DotDict with "pdf_path", "filename", "extraction_mode" and, when
        extraction succeeds, "text", "sections", "metadata", "tables",
        "images" and "stats".
    """
    paper = {
        "pdf_path": lpath,
        "filename": os.path.basename(lpath),
        "extraction_mode": "scientific",
    }

    try:
        # Text-like content uses the generic backend preference.
        text_backend = _select_backend("text", "auto")
        paper["text"] = _extract_text(lpath, text_backend, clean_text)
        paper["sections"] = _extract_sections(lpath, text_backend, clean_text)
        paper["metadata"] = _extract_metadata(lpath, text_backend)

        # Tables need both pdfplumber and pandas.
        tables = {}
        if not (PDFPLUMBER_AVAILABLE and PANDAS_AVAILABLE):
            logger.info("Table extraction requires pdfplumber and pandas")
        else:
            try:
                tables = _extract_tables(lpath, table_settings)
            except Exception as e:
                logger.warning(f"Could not extract tables: {e}")
        paper["tables"] = tables

        # Images need PyMuPDF.
        images = []
        if not FITZ_AVAILABLE:
            logger.info("Image extraction requires PyMuPDF (fitz)")
        else:
            try:
                images = _extract_images(lpath, output_dir, save_as_jpg)
            except Exception as e:
                logger.warning(f"Could not extract images: {e}")
        paper["images"] = images

        stats = {
            "total_chars": len(paper["text"]),
            "total_words": len(paper["text"].split()),
            "total_pages": paper["metadata"].get("pages", 0),
            "num_sections": len(paper["sections"]),
            "num_tables": sum(
                len(page_tables) for page_tables in paper["tables"].values()
            ),
            "num_images": len(paper["images"]),
        }
        paper["stats"] = stats

        logger.info(
            f"Scientific extraction complete: "
            f"{stats['total_pages']} pages, "
            f"{stats['num_sections']} sections, "
            f"{stats['num_tables']} tables, "
            f"{stats['num_images']} images"
        )

    except Exception as e:
        logger.error(f"Error in scientific extraction: {e}")
        paper["error"] = str(e)

    return DotDict(paper)
782
+
783
+
784
def _extract_full(
    lpath: str,
    backend: str,
    clean: bool,
    extract_images: bool,
    output_dir: str,
    table_settings: Dict,
    save_as_jpg: bool = True,
) -> DotDict:
    """
    Collect every available kind of data from a PDF into one record.

    Args:
        lpath: Path of the PDF file.
        backend: Backend name handed to the text, section, metadata and
            page extractors.
        clean: Forwarded to the text, section and page extractors.
        extract_images: When true (and PyMuPDF is available) images are
            extracted as well.
        output_dir: Forwarded to the image extractor.
        table_settings: Forwarded to the table extractor.
        save_as_jpg: Forwarded to the image extractor.

    Returns:
        DotDict with ``pdf_path``, ``filename``, ``backend`` and
        ``extraction_params``; on success also ``full_text``, ``sections``,
        ``metadata``, ``pages`` and ``stats``. ``tables`` is present only
        when pdfplumber and pandas are available, ``images`` only when
        requested and PyMuPDF is available. If a non-optional step raises,
        the exception text is stored under ``error``.
    """
    report = {
        "pdf_path": lpath,
        "filename": os.path.basename(lpath),
        "backend": backend,
        "extraction_params": {
            "clean_text": clean,
            "extract_images": extract_images,
        },
    }

    try:
        # Core components, all using the caller-selected backend.
        report["full_text"] = _extract_text(lpath, backend, clean)
        report["sections"] = _extract_sections(lpath, backend, clean)
        report["metadata"] = _extract_metadata(lpath, backend)
        report["pages"] = _extract_pages(lpath, backend, clean)

        # Optional: tables. The key is left unset when the dependencies are
        # missing.
        if PDFPLUMBER_AVAILABLE and PANDAS_AVAILABLE:
            try:
                report["tables"] = _extract_tables(lpath, table_settings)
            except Exception as table_err:
                logger.warning(f"Could not extract tables: {table_err}")
                report["tables"] = {}

        # Optional: images. The key is left unset when not requested or
        # when PyMuPDF is unavailable.
        if extract_images and FITZ_AVAILABLE:
            try:
                report["images"] = _extract_images(lpath, output_dir, save_as_jpg)
            except Exception as image_err:
                logger.warning(f"Could not extract images: {image_err}")
                report["images"] = []

        # Summary statistics (same key order as before).
        full_text = report["full_text"]
        page_list = report["pages"]
        char_total = len(full_text)
        word_total = len(full_text.split())
        page_total = len(page_list)
        section_total = len(report["sections"])
        table_total = sum(
            len(page_tables) for page_tables in report.get("tables", {}).values()
        )
        image_total = len(report.get("images", []))
        if page_list:
            words_per_page = word_total / len(page_list)
        else:
            words_per_page = 0

        report["stats"] = {
            "total_chars": char_total,
            "total_words": word_total,
            "total_pages": page_total,
            "num_sections": section_total,
            "num_tables": table_total,
            "num_images": image_total,
            "avg_words_per_page": words_per_page,
        }

    except Exception as err:
        # Best effort: report the failure in-band instead of raising.
        logger.error(f"Error in full extraction: {err}")
        report["error"] = str(err)

    return DotDict(report)
849
+
850
+
851
+ def _clean_pdf_text(text: str) -> str:
852
+ """Clean extracted PDF text."""
853
+ # Remove excessive whitespace
854
+ text = re.sub(r"\s+", " ", text)
855
+
856
+ # Fix hyphenated words at line breaks
857
+ text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)
858
+
859
+ # Remove page numbers (common patterns)
860
+ text = re.sub(r"\n\s*\d+\s*\n", "\n", text)
861
+ text = re.sub(r"Page\s+\d+\s+of\s+\d+", "", text, flags=re.IGNORECASE)
862
+
863
+ # Clean up common PDF artifacts
864
+ text = text.replace("\x00", "") # Null bytes
865
+ text = re.sub(r"[\x01-\x1f\x7f-\x9f]", "", text) # Control characters
866
+
867
+ # Normalize quotes and dashes
868
+ text = text.replace('"', '"').replace('"', '"')
869
+ text = text.replace(""", "'").replace(""", "'")
870
+ text = text.replace("–", "-").replace("—", "-")
871
+
872
+ # Remove multiple consecutive newlines
873
+ text = re.sub(r"\n{3,}", "\n\n", text)
874
+
875
+ return text.strip()
876
+
29
877
 
878
+ def _calculate_file_hash(lpath: str) -> str:
879
+ """Calculate MD5 hash of file."""
880
+ hash_md5 = hashlib.md5()
881
+ with open(lpath, "rb") as f:
882
+ for chunk in iter(lambda: f.read(4096), b""):
883
+ hash_md5.update(chunk)
884
+ return hash_md5.hexdigest()
30
885
 
31
886
  # EOF