scitex 2.0.0__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +53 -15
- scitex/__main__.py +72 -26
- scitex/__version__.py +1 -1
- scitex/_sh.py +145 -23
- scitex/ai/__init__.py +30 -16
- scitex/ai/_gen_ai/_Anthropic.py +5 -7
- scitex/ai/_gen_ai/_BaseGenAI.py +2 -2
- scitex/ai/_gen_ai/_DeepSeek.py +10 -2
- scitex/ai/_gen_ai/_Google.py +2 -2
- scitex/ai/_gen_ai/_Llama.py +2 -2
- scitex/ai/_gen_ai/_OpenAI.py +2 -2
- scitex/ai/_gen_ai/_PARAMS.py +51 -65
- scitex/ai/_gen_ai/_Perplexity.py +2 -2
- scitex/ai/_gen_ai/__init__.py +25 -14
- scitex/ai/_gen_ai/_format_output_func.py +4 -4
- scitex/ai/classification/{classifier_server.py → Classifier.py} +5 -5
- scitex/ai/classification/CrossValidationExperiment.py +374 -0
- scitex/ai/classification/__init__.py +43 -4
- scitex/ai/classification/reporters/_BaseClassificationReporter.py +281 -0
- scitex/ai/classification/reporters/_ClassificationReporter.py +773 -0
- scitex/ai/classification/reporters/_MultiClassificationReporter.py +406 -0
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +1834 -0
- scitex/ai/classification/reporters/__init__.py +11 -0
- scitex/ai/classification/reporters/reporter_utils/_Plotter.py +1028 -0
- scitex/ai/classification/reporters/reporter_utils/__init__.py +80 -0
- scitex/ai/classification/reporters/reporter_utils/aggregation.py +457 -0
- scitex/ai/classification/reporters/reporter_utils/data_models.py +313 -0
- scitex/ai/classification/reporters/reporter_utils/reporting.py +1056 -0
- scitex/ai/classification/reporters/reporter_utils/storage.py +221 -0
- scitex/ai/classification/reporters/reporter_utils/validation.py +395 -0
- scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +568 -0
- scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +688 -0
- scitex/ai/classification/timeseries/_TimeSeriesMetadata.py +139 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +1716 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +1685 -0
- scitex/ai/classification/timeseries/_TimeSeriesStrategy.py +84 -0
- scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +610 -0
- scitex/ai/classification/timeseries/__init__.py +39 -0
- scitex/ai/classification/timeseries/_normalize_timestamp.py +436 -0
- scitex/ai/clustering/_umap.py +2 -2
- scitex/ai/feature_extraction/vit.py +1 -0
- scitex/ai/feature_selection/__init__.py +30 -0
- scitex/ai/feature_selection/feature_selection.py +364 -0
- scitex/ai/loss/multi_task_loss.py +1 -1
- scitex/ai/metrics/__init__.py +51 -4
- scitex/ai/metrics/_calc_bacc.py +61 -0
- scitex/ai/metrics/_calc_bacc_from_conf_mat.py +38 -0
- scitex/ai/metrics/_calc_clf_report.py +78 -0
- scitex/ai/metrics/_calc_conf_mat.py +93 -0
- scitex/ai/metrics/_calc_feature_importance.py +183 -0
- scitex/ai/metrics/_calc_mcc.py +61 -0
- scitex/ai/metrics/_calc_pre_rec_auc.py +116 -0
- scitex/ai/metrics/_calc_roc_auc.py +110 -0
- scitex/ai/metrics/_calc_seizure_prediction_metrics.py +490 -0
- scitex/ai/metrics/{silhoute_score_block.py → _calc_silhouette_score.py} +15 -8
- scitex/ai/metrics/_normalize_labels.py +83 -0
- scitex/ai/plt/__init__.py +47 -8
- scitex/ai/plt/{_conf_mat.py → _plot_conf_mat.py} +158 -87
- scitex/ai/plt/_plot_feature_importance.py +323 -0
- scitex/ai/plt/_plot_learning_curve.py +345 -0
- scitex/ai/plt/_plot_optuna_study.py +225 -0
- scitex/ai/plt/_plot_pre_rec_curve.py +290 -0
- scitex/ai/plt/_plot_roc_curve.py +255 -0
- scitex/ai/training/{learning_curve_logger.py → _LearningCurveLogger.py} +197 -213
- scitex/ai/training/__init__.py +2 -2
- scitex/ai/utils/grid_search.py +3 -3
- scitex/benchmark/__init__.py +52 -0
- scitex/benchmark/benchmark.py +400 -0
- scitex/benchmark/monitor.py +370 -0
- scitex/benchmark/profiler.py +297 -0
- scitex/browser/__init__.py +48 -0
- scitex/browser/automation/CookieHandler.py +216 -0
- scitex/browser/automation/__init__.py +7 -0
- scitex/browser/collaboration/__init__.py +55 -0
- scitex/browser/collaboration/auth_helpers.py +94 -0
- scitex/browser/collaboration/collaborative_agent.py +136 -0
- scitex/browser/collaboration/credential_manager.py +188 -0
- scitex/browser/collaboration/interactive_panel.py +400 -0
- scitex/browser/collaboration/persistent_browser.py +170 -0
- scitex/browser/collaboration/shared_session.py +383 -0
- scitex/browser/collaboration/standard_interactions.py +246 -0
- scitex/browser/collaboration/visual_feedback.py +181 -0
- scitex/browser/core/BrowserMixin.py +326 -0
- scitex/browser/core/ChromeProfileManager.py +446 -0
- scitex/browser/core/__init__.py +9 -0
- scitex/browser/debugging/__init__.py +18 -0
- scitex/browser/debugging/_browser_logger.py +657 -0
- scitex/browser/debugging/_highlight_element.py +143 -0
- scitex/browser/debugging/_show_grid.py +154 -0
- scitex/browser/interaction/__init__.py +24 -0
- scitex/browser/interaction/click_center.py +149 -0
- scitex/browser/interaction/click_with_fallbacks.py +206 -0
- scitex/browser/interaction/close_popups.py +498 -0
- scitex/browser/interaction/fill_with_fallbacks.py +209 -0
- scitex/browser/pdf/__init__.py +14 -0
- scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +200 -0
- scitex/browser/pdf/detect_chrome_pdf_viewer.py +198 -0
- scitex/browser/remote/CaptchaHandler.py +434 -0
- scitex/browser/remote/ZenRowsAPIClient.py +347 -0
- scitex/browser/remote/ZenRowsBrowserManager.py +570 -0
- scitex/browser/remote/__init__.py +11 -0
- scitex/browser/stealth/HumanBehavior.py +344 -0
- scitex/browser/stealth/StealthManager.py +1008 -0
- scitex/browser/stealth/__init__.py +9 -0
- scitex/browser/template.py +122 -0
- scitex/capture/__init__.py +110 -0
- scitex/capture/__main__.py +25 -0
- scitex/capture/capture.py +848 -0
- scitex/capture/cli.py +233 -0
- scitex/capture/gif.py +344 -0
- scitex/capture/mcp_server.py +961 -0
- scitex/capture/session.py +70 -0
- scitex/capture/utils.py +705 -0
- scitex/cli/__init__.py +17 -0
- scitex/cli/cloud.py +447 -0
- scitex/cli/main.py +42 -0
- scitex/cli/scholar.py +280 -0
- scitex/context/_suppress_output.py +5 -3
- scitex/db/__init__.py +30 -3
- scitex/db/__main__.py +75 -0
- scitex/db/_check_health.py +381 -0
- scitex/db/_delete_duplicates.py +25 -386
- scitex/db/_inspect.py +335 -114
- scitex/db/_inspect_optimized.py +301 -0
- scitex/db/{_PostgreSQL.py → _postgresql/_PostgreSQL.py} +3 -3
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BackupMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BatchMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BlobMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_ConnectionMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_MaintenanceMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_QueryMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_SchemaMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_TransactionMixin.py +1 -1
- scitex/db/_postgresql/__init__.py +6 -0
- scitex/db/_sqlite3/_SQLite3.py +210 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin.py +581 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin_v01-need-_hash-col.py +517 -0
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_BatchMixin.py +1 -1
- scitex/db/_sqlite3/_SQLite3Mixins/_BlobMixin.py +281 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin.py +548 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin_v01-indentation-issues.py +583 -0
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_ConnectionMixin.py +29 -13
- scitex/db/_sqlite3/_SQLite3Mixins/_GitMixin.py +583 -0
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_ImportExportMixin.py +1 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_IndexMixin.py +1 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_MaintenanceMixin.py +2 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_QueryMixin.py +37 -10
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_RowMixin.py +46 -6
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_TableMixin.py +56 -10
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_TransactionMixin.py +1 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/__init__.py +14 -2
- scitex/db/_sqlite3/__init__.py +7 -0
- scitex/db/_sqlite3/_delete_duplicates.py +274 -0
- scitex/decorators/__init__.py +2 -0
- scitex/decorators/_cache_disk.py +13 -5
- scitex/decorators/_cache_disk_async.py +49 -0
- scitex/decorators/_deprecated.py +175 -10
- scitex/decorators/_timeout.py +1 -1
- scitex/dev/_analyze_code_flow.py +2 -2
- scitex/dict/_DotDict.py +73 -15
- scitex/dict/_DotDict_v01-not-handling-recursive-instantiations.py +442 -0
- scitex/dict/_DotDict_v02-not-serializing-Path-object.py +446 -0
- scitex/dict/__init__.py +2 -0
- scitex/dict/_flatten.py +27 -0
- scitex/dsp/_crop.py +2 -2
- scitex/dsp/_demo_sig.py +2 -2
- scitex/dsp/_detect_ripples.py +2 -2
- scitex/dsp/_hilbert.py +2 -2
- scitex/dsp/_listen.py +6 -6
- scitex/dsp/_modulation_index.py +2 -2
- scitex/dsp/_pac.py +1 -1
- scitex/dsp/_psd.py +2 -2
- scitex/dsp/_resample.py +2 -1
- scitex/dsp/_time.py +3 -2
- scitex/dsp/_wavelet.py +3 -2
- scitex/dsp/add_noise.py +2 -2
- scitex/dsp/example.py +1 -0
- scitex/dsp/filt.py +10 -9
- scitex/dsp/template.py +3 -2
- scitex/dsp/utils/_differential_bandpass_filters.py +1 -1
- scitex/dsp/utils/pac.py +2 -2
- scitex/dt/_normalize_timestamp.py +432 -0
- scitex/errors.py +572 -0
- scitex/gen/_DimHandler.py +2 -2
- scitex/gen/__init__.py +37 -7
- scitex/gen/_deprecated_close.py +80 -0
- scitex/gen/_deprecated_start.py +26 -0
- scitex/gen/_detect_environment.py +152 -0
- scitex/gen/_detect_notebook_path.py +169 -0
- scitex/gen/_embed.py +6 -2
- scitex/gen/_get_notebook_path.py +257 -0
- scitex/gen/_less.py +1 -1
- scitex/gen/_list_packages.py +2 -2
- scitex/gen/_norm.py +44 -9
- scitex/gen/_norm_cache.py +269 -0
- scitex/gen/_src.py +3 -5
- scitex/gen/_title_case.py +3 -3
- scitex/io/__init__.py +28 -6
- scitex/io/_glob.py +13 -7
- scitex/io/_load.py +108 -21
- scitex/io/_load_cache.py +303 -0
- scitex/io/_load_configs.py +40 -15
- scitex/io/{_H5Explorer.py → _load_modules/_H5Explorer.py} +80 -17
- scitex/io/_load_modules/_ZarrExplorer.py +114 -0
- scitex/io/_load_modules/_bibtex.py +207 -0
- scitex/io/_load_modules/_hdf5.py +53 -178
- scitex/io/_load_modules/_json.py +5 -3
- scitex/io/_load_modules/_pdf.py +871 -16
- scitex/io/_load_modules/_sqlite3.py +15 -0
- scitex/io/_load_modules/_txt.py +41 -12
- scitex/io/_load_modules/_yaml.py +4 -3
- scitex/io/_load_modules/_zarr.py +126 -0
- scitex/io/_save.py +429 -171
- scitex/io/_save_modules/__init__.py +6 -0
- scitex/io/_save_modules/_bibtex.py +194 -0
- scitex/io/_save_modules/_csv.py +8 -4
- scitex/io/_save_modules/_excel.py +174 -15
- scitex/io/_save_modules/_hdf5.py +251 -226
- scitex/io/_save_modules/_image.py +1 -3
- scitex/io/_save_modules/_json.py +49 -4
- scitex/io/_save_modules/_listed_dfs_as_csv.py +1 -3
- scitex/io/_save_modules/_listed_scalars_as_csv.py +1 -3
- scitex/io/_save_modules/_tex.py +277 -0
- scitex/io/_save_modules/_yaml.py +42 -3
- scitex/io/_save_modules/_zarr.py +160 -0
- scitex/io/utils/__init__.py +20 -0
- scitex/io/utils/h5_to_zarr.py +616 -0
- scitex/linalg/_geometric_median.py +6 -2
- scitex/{gen/_tee.py → logging/_Tee.py} +43 -84
- scitex/logging/__init__.py +122 -0
- scitex/logging/_config.py +158 -0
- scitex/logging/_context.py +103 -0
- scitex/logging/_formatters.py +128 -0
- scitex/logging/_handlers.py +64 -0
- scitex/logging/_levels.py +35 -0
- scitex/logging/_logger.py +163 -0
- scitex/logging/_print_capture.py +95 -0
- scitex/ml/__init__.py +69 -0
- scitex/{ai/genai/anthropic.py → ml/_gen_ai/_Anthropic.py} +13 -19
- scitex/{ai/genai/base_genai.py → ml/_gen_ai/_BaseGenAI.py} +5 -5
- scitex/{ai/genai/deepseek.py → ml/_gen_ai/_DeepSeek.py} +11 -16
- scitex/{ai/genai/google.py → ml/_gen_ai/_Google.py} +7 -15
- scitex/{ai/genai/groq.py → ml/_gen_ai/_Groq.py} +1 -8
- scitex/{ai/genai/llama.py → ml/_gen_ai/_Llama.py} +3 -16
- scitex/{ai/genai/openai.py → ml/_gen_ai/_OpenAI.py} +3 -3
- scitex/{ai/genai/params.py → ml/_gen_ai/_PARAMS.py} +51 -65
- scitex/{ai/genai/perplexity.py → ml/_gen_ai/_Perplexity.py} +3 -14
- scitex/ml/_gen_ai/__init__.py +43 -0
- scitex/{ai/genai/calc_cost.py → ml/_gen_ai/_calc_cost.py} +1 -1
- scitex/{ai/genai/format_output_func.py → ml/_gen_ai/_format_output_func.py} +4 -4
- scitex/{ai/genai/genai_factory.py → ml/_gen_ai/_genai_factory.py} +8 -8
- scitex/ml/activation/__init__.py +8 -0
- scitex/ml/activation/_define.py +11 -0
- scitex/{ai/classifier_server.py → ml/classification/Classifier.py} +5 -5
- scitex/ml/classification/CrossValidationExperiment.py +374 -0
- scitex/ml/classification/__init__.py +46 -0
- scitex/ml/classification/reporters/_BaseClassificationReporter.py +281 -0
- scitex/ml/classification/reporters/_ClassificationReporter.py +773 -0
- scitex/ml/classification/reporters/_MultiClassificationReporter.py +406 -0
- scitex/ml/classification/reporters/_SingleClassificationReporter.py +1834 -0
- scitex/ml/classification/reporters/__init__.py +11 -0
- scitex/ml/classification/reporters/reporter_utils/_Plotter.py +1028 -0
- scitex/ml/classification/reporters/reporter_utils/__init__.py +80 -0
- scitex/ml/classification/reporters/reporter_utils/aggregation.py +457 -0
- scitex/ml/classification/reporters/reporter_utils/data_models.py +313 -0
- scitex/ml/classification/reporters/reporter_utils/reporting.py +1056 -0
- scitex/ml/classification/reporters/reporter_utils/storage.py +221 -0
- scitex/ml/classification/reporters/reporter_utils/validation.py +395 -0
- scitex/ml/classification/timeseries/_TimeSeriesBlockingSplit.py +568 -0
- scitex/ml/classification/timeseries/_TimeSeriesCalendarSplit.py +688 -0
- scitex/ml/classification/timeseries/_TimeSeriesMetadata.py +139 -0
- scitex/ml/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +1716 -0
- scitex/ml/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +1685 -0
- scitex/ml/classification/timeseries/_TimeSeriesStrategy.py +84 -0
- scitex/ml/classification/timeseries/_TimeSeriesStratifiedSplit.py +610 -0
- scitex/ml/classification/timeseries/__init__.py +39 -0
- scitex/ml/classification/timeseries/_normalize_timestamp.py +436 -0
- scitex/ml/clustering/__init__.py +11 -0
- scitex/ml/clustering/_pca.py +115 -0
- scitex/ml/clustering/_umap.py +376 -0
- scitex/ml/feature_extraction/__init__.py +56 -0
- scitex/ml/feature_extraction/vit.py +149 -0
- scitex/ml/feature_selection/__init__.py +30 -0
- scitex/ml/feature_selection/feature_selection.py +364 -0
- scitex/ml/loss/_L1L2Losses.py +34 -0
- scitex/ml/loss/__init__.py +12 -0
- scitex/ml/loss/multi_task_loss.py +47 -0
- scitex/ml/metrics/__init__.py +56 -0
- scitex/ml/metrics/_calc_bacc.py +61 -0
- scitex/ml/metrics/_calc_bacc_from_conf_mat.py +38 -0
- scitex/ml/metrics/_calc_clf_report.py +78 -0
- scitex/ml/metrics/_calc_conf_mat.py +93 -0
- scitex/ml/metrics/_calc_feature_importance.py +183 -0
- scitex/ml/metrics/_calc_mcc.py +61 -0
- scitex/ml/metrics/_calc_pre_rec_auc.py +116 -0
- scitex/ml/metrics/_calc_roc_auc.py +110 -0
- scitex/ml/metrics/_calc_seizure_prediction_metrics.py +490 -0
- scitex/ml/metrics/_calc_silhouette_score.py +503 -0
- scitex/ml/metrics/_normalize_labels.py +83 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/__init__.py +0 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/__init__.py +3 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger.py +207 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger2020.py +238 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger913A.py +215 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/rangerqh.py +184 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/setup.py +24 -0
- scitex/ml/optim/__init__.py +13 -0
- scitex/ml/optim/_get_set.py +31 -0
- scitex/ml/optim/_optimizers.py +71 -0
- scitex/ml/plt/__init__.py +60 -0
- scitex/ml/plt/_plot_conf_mat.py +663 -0
- scitex/ml/plt/_plot_feature_importance.py +323 -0
- scitex/ml/plt/_plot_learning_curve.py +345 -0
- scitex/ml/plt/_plot_optuna_study.py +225 -0
- scitex/ml/plt/_plot_pre_rec_curve.py +290 -0
- scitex/ml/plt/_plot_roc_curve.py +255 -0
- scitex/ml/sk/__init__.py +11 -0
- scitex/ml/sk/_clf.py +58 -0
- scitex/ml/sk/_to_sktime.py +100 -0
- scitex/ml/sklearn/__init__.py +26 -0
- scitex/ml/sklearn/clf.py +58 -0
- scitex/ml/sklearn/to_sktime.py +100 -0
- scitex/{ai/training/early_stopping.py → ml/training/_EarlyStopping.py} +1 -2
- scitex/{ai → ml/training}/_LearningCurveLogger.py +198 -242
- scitex/ml/training/__init__.py +7 -0
- scitex/ml/utils/__init__.py +22 -0
- scitex/ml/utils/_check_params.py +50 -0
- scitex/ml/utils/_default_dataset.py +46 -0
- scitex/ml/utils/_format_samples_for_sktime.py +26 -0
- scitex/ml/utils/_label_encoder.py +134 -0
- scitex/ml/utils/_merge_labels.py +22 -0
- scitex/ml/utils/_sliding_window_data_augmentation.py +11 -0
- scitex/ml/utils/_under_sample.py +51 -0
- scitex/ml/utils/_verify_n_gpus.py +16 -0
- scitex/ml/utils/grid_search.py +148 -0
- scitex/nn/_BNet.py +15 -9
- scitex/nn/_Filters.py +2 -2
- scitex/nn/_ModulationIndex.py +2 -2
- scitex/nn/_PAC.py +1 -1
- scitex/nn/_Spectrogram.py +12 -3
- scitex/nn/__init__.py +9 -10
- scitex/path/__init__.py +18 -0
- scitex/path/_clean.py +4 -0
- scitex/path/_find.py +9 -4
- scitex/path/_symlink.py +348 -0
- scitex/path/_version.py +4 -3
- scitex/pd/__init__.py +2 -0
- scitex/pd/_get_unique.py +99 -0
- scitex/plt/__init__.py +114 -5
- scitex/plt/_subplots/_AxesWrapper.py +1 -3
- scitex/plt/_subplots/_AxisWrapper.py +7 -3
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +47 -13
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +160 -2
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +26 -4
- scitex/plt/_subplots/_AxisWrapperMixins/_UnitAwareMixin.py +322 -0
- scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +1 -0
- scitex/plt/_subplots/_FigWrapper.py +62 -6
- scitex/plt/_subplots/_export_as_csv.py +43 -27
- scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +5 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_annotate.py +81 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_bar.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_barh.py +20 -5
- scitex/plt/_subplots/_export_as_csv_formatters/_format_boxplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_contour.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_errorbar.py +35 -18
- scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill_between.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hist.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +15 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_box.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_conf_mat.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_ecdf.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_fillv.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_heatmap.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_image.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_joyplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_line.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_ci.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_std.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_median_iqr.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_raster.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_rectangle.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter.py +35 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter_hist.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_shaded_line.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_violin.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_scatter.py +6 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_barplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_boxplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_heatmap.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_histplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_kdeplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_scatterplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_stripplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_swarmplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_violinplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_text.py +60 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violin.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violinplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/test_formatters.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters.py +56 -59
- scitex/plt/ax/_style/_hide_spines.py +1 -3
- scitex/plt/ax/_style/_rotate_labels.py +180 -76
- scitex/plt/ax/_style/_rotate_labels_v01.py +248 -0
- scitex/plt/ax/_style/_set_meta.py +11 -4
- scitex/plt/ax/_style/_set_supxyt.py +3 -3
- scitex/plt/ax/_style/_set_xyt.py +3 -3
- scitex/plt/ax/_style/_share_axes.py +2 -2
- scitex/plt/color/__init__.py +4 -4
- scitex/plt/color/{_get_colors_from_cmap.py → _get_colors_from_conf_matap.py} +7 -7
- scitex/plt/utils/_configure_mpl.py +99 -86
- scitex/plt/utils/_histogram_utils.py +1 -3
- scitex/plt/utils/_is_valid_axis.py +1 -3
- scitex/plt/utils/_scitex_config.py +1 -0
- scitex/repro/__init__.py +75 -0
- scitex/{reproduce → repro}/_gen_ID.py +1 -1
- scitex/{reproduce → repro}/_gen_timestamp.py +1 -1
- scitex/repro_rng/_RandomStateManager.py +590 -0
- scitex/repro_rng/_RandomStateManager_v01-no-verbose-options.py +414 -0
- scitex/repro_rng/__init__.py +39 -0
- scitex/reproduce/__init__.py +25 -13
- scitex/reproduce/_hash_array.py +22 -0
- scitex/resource/_get_processor_usages.py +4 -4
- scitex/resource/_get_specs.py +2 -2
- scitex/resource/_log_processor_usages.py +2 -2
- scitex/rng/_RandomStateManager.py +590 -0
- scitex/rng/_RandomStateManager_v01-no-verbose-options.py +414 -0
- scitex/rng/__init__.py +39 -0
- scitex/scholar/__init__.py +309 -19
- scitex/scholar/__main__.py +319 -0
- scitex/scholar/auth/ScholarAuthManager.py +308 -0
- scitex/scholar/auth/__init__.py +12 -0
- scitex/scholar/auth/core/AuthenticationGateway.py +473 -0
- scitex/scholar/auth/core/BrowserAuthenticator.py +386 -0
- scitex/scholar/auth/core/StrategyResolver.py +309 -0
- scitex/scholar/auth/core/__init__.py +16 -0
- scitex/scholar/auth/gateway/_OpenURLLinkFinder.py +120 -0
- scitex/scholar/auth/gateway/_OpenURLResolver.py +209 -0
- scitex/scholar/auth/gateway/__init__.py +38 -0
- scitex/scholar/auth/gateway/_resolve_functions.py +101 -0
- scitex/scholar/auth/providers/BaseAuthenticator.py +166 -0
- scitex/scholar/auth/providers/EZProxyAuthenticator.py +484 -0
- scitex/scholar/auth/providers/OpenAthensAuthenticator.py +619 -0
- scitex/scholar/auth/providers/ShibbolethAuthenticator.py +686 -0
- scitex/scholar/auth/providers/__init__.py +18 -0
- scitex/scholar/auth/session/AuthCacheManager.py +189 -0
- scitex/scholar/auth/session/SessionManager.py +159 -0
- scitex/scholar/auth/session/__init__.py +11 -0
- scitex/scholar/auth/sso/BaseSSOAutomator.py +373 -0
- scitex/scholar/auth/sso/OpenAthensSSOAutomator.py +378 -0
- scitex/scholar/auth/sso/SSOAutomator.py +180 -0
- scitex/scholar/auth/sso/UniversityOfMelbourneSSOAutomator.py +380 -0
- scitex/scholar/auth/sso/__init__.py +15 -0
- scitex/scholar/browser/ScholarBrowserManager.py +705 -0
- scitex/scholar/browser/__init__.py +38 -0
- scitex/scholar/browser/utils/__init__.py +13 -0
- scitex/scholar/browser/utils/click_and_wait.py +205 -0
- scitex/scholar/browser/utils/close_unwanted_pages.py +140 -0
- scitex/scholar/browser/utils/wait_redirects.py +732 -0
- scitex/scholar/config/PublisherRules.py +132 -0
- scitex/scholar/config/ScholarConfig.py +126 -0
- scitex/scholar/config/__init__.py +17 -0
- scitex/scholar/core/Paper.py +627 -0
- scitex/scholar/core/Papers.py +722 -0
- scitex/scholar/core/Scholar.py +1975 -0
- scitex/scholar/core/__init__.py +9 -0
- scitex/scholar/impact_factor/ImpactFactorEngine.py +204 -0
- scitex/scholar/impact_factor/__init__.py +20 -0
- scitex/scholar/impact_factor/estimation/ImpactFactorEstimationEngine.py +0 -0
- scitex/scholar/impact_factor/estimation/__init__.py +40 -0
- scitex/scholar/impact_factor/estimation/build_database.py +0 -0
- scitex/scholar/impact_factor/estimation/core/__init__.py +28 -0
- scitex/scholar/impact_factor/estimation/core/cache_manager.py +523 -0
- scitex/scholar/impact_factor/estimation/core/calculator.py +355 -0
- scitex/scholar/impact_factor/estimation/core/journal_matcher.py +428 -0
- scitex/scholar/integration/__init__.py +59 -0
- scitex/scholar/integration/base.py +502 -0
- scitex/scholar/integration/mendeley/__init__.py +22 -0
- scitex/scholar/integration/mendeley/exporter.py +166 -0
- scitex/scholar/integration/mendeley/importer.py +236 -0
- scitex/scholar/integration/mendeley/linker.py +79 -0
- scitex/scholar/integration/mendeley/mapper.py +212 -0
- scitex/scholar/integration/zotero/__init__.py +27 -0
- scitex/scholar/integration/zotero/__main__.py +264 -0
- scitex/scholar/integration/zotero/exporter.py +351 -0
- scitex/scholar/integration/zotero/importer.py +372 -0
- scitex/scholar/integration/zotero/linker.py +415 -0
- scitex/scholar/integration/zotero/mapper.py +286 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +588 -0
- scitex/scholar/metadata_engines/__init__.py +21 -0
- scitex/scholar/metadata_engines/individual/ArXivEngine.py +397 -0
- scitex/scholar/metadata_engines/individual/CrossRefEngine.py +274 -0
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +263 -0
- scitex/scholar/metadata_engines/individual/OpenAlexEngine.py +350 -0
- scitex/scholar/metadata_engines/individual/PubMedEngine.py +329 -0
- scitex/scholar/metadata_engines/individual/SemanticScholarEngine.py +438 -0
- scitex/scholar/metadata_engines/individual/URLDOIEngine.py +410 -0
- scitex/scholar/metadata_engines/individual/_BaseDOIEngine.py +487 -0
- scitex/scholar/metadata_engines/individual/__init__.py +7 -0
- scitex/scholar/metadata_engines/utils/_PubMedConverter.py +469 -0
- scitex/scholar/metadata_engines/utils/_URLDOIExtractor.py +283 -0
- scitex/scholar/metadata_engines/utils/__init__.py +30 -0
- scitex/scholar/metadata_engines/utils/_metadata2bibtex.py +103 -0
- scitex/scholar/metadata_engines/utils/_standardize_metadata.py +376 -0
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +579 -0
- scitex/scholar/pdf_download/__init__.py +5 -0
- scitex/scholar/pdf_download/strategies/__init__.py +38 -0
- scitex/scholar/pdf_download/strategies/chrome_pdf_viewer.py +376 -0
- scitex/scholar/pdf_download/strategies/direct_download.py +131 -0
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +167 -0
- scitex/scholar/pdf_download/strategies/manual_download_utils.py +996 -0
- scitex/scholar/pdf_download/strategies/response_body.py +207 -0
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +364 -0
- scitex/scholar/pipelines/ScholarPipelineParallel.py +478 -0
- scitex/scholar/pipelines/ScholarPipelineSingle.py +767 -0
- scitex/scholar/pipelines/__init__.py +49 -0
- scitex/scholar/storage/BibTeXHandler.py +1018 -0
- scitex/scholar/storage/PaperIO.py +468 -0
- scitex/scholar/storage/ScholarLibrary.py +182 -0
- scitex/scholar/storage/_DeduplicationManager.py +548 -0
- scitex/scholar/storage/_LibraryCacheManager.py +724 -0
- scitex/scholar/storage/_LibraryManager.py +1835 -0
- scitex/scholar/storage/__init__.py +28 -0
- scitex/scholar/url_finder/ScholarURLFinder.py +379 -0
- scitex/scholar/url_finder/__init__.py +7 -0
- scitex/scholar/url_finder/strategies/__init__.py +33 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_direct_links.py +261 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_dropdown.py +67 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_href.py +204 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_navigation.py +256 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_publisher_patterns.py +165 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_zotero_translators.py +163 -0
- scitex/scholar/url_finder/strategies/find_supplementary_urls_by_href.py +70 -0
- scitex/scholar/utils/__init__.py +22 -0
- scitex/scholar/utils/bibtex/__init__.py +9 -0
- scitex/scholar/utils/bibtex/_parse_bibtex.py +71 -0
- scitex/scholar/utils/cleanup/__init__.py +8 -0
- scitex/scholar/utils/cleanup/_cleanup_scholar_processes.py +96 -0
- scitex/scholar/utils/cleanup/cleanup_old_extractions.py +117 -0
- scitex/scholar/utils/text/_TextNormalizer.py +407 -0
- scitex/scholar/utils/text/__init__.py +9 -0
- scitex/scholar/zotero/__init__.py +38 -0
- scitex/session/__init__.py +51 -0
- scitex/session/_lifecycle.py +736 -0
- scitex/session/_manager.py +102 -0
- scitex/session/template.py +122 -0
- scitex/stats/__init__.py +30 -26
- scitex/stats/correct/__init__.py +21 -0
- scitex/stats/correct/_correct_bonferroni.py +551 -0
- scitex/stats/correct/_correct_fdr.py +634 -0
- scitex/stats/correct/_correct_holm.py +548 -0
- scitex/stats/correct/_correct_sidak.py +499 -0
- scitex/stats/descriptive/__init__.py +85 -0
- scitex/stats/descriptive/_circular.py +540 -0
- scitex/stats/descriptive/_describe.py +219 -0
- scitex/stats/descriptive/_nan.py +518 -0
- scitex/stats/descriptive/_real.py +189 -0
- scitex/stats/effect_sizes/__init__.py +41 -0
- scitex/stats/effect_sizes/_cliffs_delta.py +325 -0
- scitex/stats/effect_sizes/_cohens_d.py +342 -0
- scitex/stats/effect_sizes/_epsilon_squared.py +315 -0
- scitex/stats/effect_sizes/_eta_squared.py +302 -0
- scitex/stats/effect_sizes/_prob_superiority.py +296 -0
- scitex/stats/posthoc/__init__.py +19 -0
- scitex/stats/posthoc/_dunnett.py +463 -0
- scitex/stats/posthoc/_games_howell.py +383 -0
- scitex/stats/posthoc/_tukey_hsd.py +367 -0
- scitex/stats/power/__init__.py +19 -0
- scitex/stats/power/_power.py +433 -0
- scitex/stats/template.py +119 -0
- scitex/stats/utils/__init__.py +62 -0
- scitex/stats/utils/_effect_size.py +985 -0
- scitex/stats/utils/_formatters.py +270 -0
- scitex/stats/utils/_normalizers.py +927 -0
- scitex/stats/utils/_power.py +433 -0
- scitex/stats_v01/_EffectSizeCalculator.py +488 -0
- scitex/stats_v01/_StatisticalValidator.py +411 -0
- scitex/stats_v01/__init__.py +60 -0
- scitex/stats_v01/_additional_tests.py +415 -0
- scitex/{stats → stats_v01}/_p2stars.py +19 -5
- scitex/stats_v01/_two_sample_tests.py +141 -0
- scitex/stats_v01/desc/__init__.py +83 -0
- scitex/stats_v01/desc/_circular.py +540 -0
- scitex/stats_v01/desc/_describe.py +219 -0
- scitex/stats_v01/desc/_nan.py +518 -0
- scitex/{stats/desc/_nan.py → stats_v01/desc/_nan_v01-20250920_145731.py} +23 -12
- scitex/stats_v01/desc/_real.py +189 -0
- scitex/stats_v01/tests/__corr_test_optimized.py +221 -0
- scitex/stats_v01/tests/_corr_test_optimized.py +179 -0
- scitex/str/__init__.py +1 -3
- scitex/str/_clean_path.py +6 -2
- scitex/str/_latex_fallback.py +267 -160
- scitex/str/_parse.py +44 -36
- scitex/str/_printc.py +1 -3
- scitex/template/__init__.py +87 -0
- scitex/template/_create_project.py +267 -0
- scitex/template/create_pip_project.py +80 -0
- scitex/template/create_research.py +80 -0
- scitex/template/create_singularity.py +80 -0
- scitex/units.py +291 -0
- scitex/utils/_compress_hdf5.py +14 -3
- scitex/utils/_email.py +21 -2
- scitex/utils/_grid.py +6 -4
- scitex/utils/_notify.py +13 -10
- scitex/utils/_verify_scitex_format.py +589 -0
- scitex/utils/_verify_scitex_format_v01.py +370 -0
- scitex/utils/template.py +122 -0
- scitex/web/_search_pubmed.py +62 -16
- scitex-2.1.0.dist-info/LICENSE +21 -0
- scitex-2.1.0.dist-info/METADATA +677 -0
- scitex-2.1.0.dist-info/RECORD +919 -0
- {scitex-2.0.0.dist-info → scitex-2.1.0.dist-info}/WHEEL +1 -1
- scitex-2.1.0.dist-info/entry_points.txt +3 -0
- scitex/ai/__Classifiers.py +0 -101
- scitex/ai/classification/classification_reporter.py +0 -1137
- scitex/ai/classification/classifiers.py +0 -101
- scitex/ai/classification_reporter.py +0 -1161
- scitex/ai/genai/__init__.py +0 -277
- scitex/ai/genai/anthropic_provider.py +0 -320
- scitex/ai/genai/anthropic_refactored.py +0 -109
- scitex/ai/genai/auth_manager.py +0 -200
- scitex/ai/genai/base_provider.py +0 -291
- scitex/ai/genai/chat_history.py +0 -307
- scitex/ai/genai/cost_tracker.py +0 -276
- scitex/ai/genai/deepseek_provider.py +0 -251
- scitex/ai/genai/google_provider.py +0 -228
- scitex/ai/genai/groq_provider.py +0 -248
- scitex/ai/genai/image_processor.py +0 -250
- scitex/ai/genai/llama_provider.py +0 -214
- scitex/ai/genai/mock_provider.py +0 -127
- scitex/ai/genai/model_registry.py +0 -304
- scitex/ai/genai/openai_provider.py +0 -293
- scitex/ai/genai/perplexity_provider.py +0 -205
- scitex/ai/genai/provider_base.py +0 -302
- scitex/ai/genai/provider_factory.py +0 -370
- scitex/ai/genai/response_handler.py +0 -235
- scitex/ai/layer/_Pass.py +0 -21
- scitex/ai/layer/__init__.py +0 -10
- scitex/ai/layer/_switch.py +0 -8
- scitex/ai/metrics/_bACC.py +0 -51
- scitex/ai/plt/_learning_curve.py +0 -194
- scitex/ai/plt/_optuna_study.py +0 -111
- scitex/ai/plt/aucs/__init__.py +0 -2
- scitex/ai/plt/aucs/example.py +0 -60
- scitex/ai/plt/aucs/pre_rec_auc.py +0 -223
- scitex/ai/plt/aucs/roc_auc.py +0 -246
- scitex/ai/sampling/undersample.py +0 -29
- scitex/db/_SQLite3.py +0 -2136
- scitex/db/_SQLite3Mixins/_BlobMixin.py +0 -229
- scitex/gen/_close.py +0 -222
- scitex/gen/_start.py +0 -451
- scitex/general/__init__.py +0 -5
- scitex/io/_load_modules/_db.py +0 -24
- scitex/life/__init__.py +0 -10
- scitex/life/_monitor_rain.py +0 -49
- scitex/reproduce/_fix_seeds.py +0 -45
- scitex/res/__init__.py +0 -5
- scitex/scholar/_local_search.py +0 -454
- scitex/scholar/_paper.py +0 -244
- scitex/scholar/_pdf_downloader.py +0 -325
- scitex/scholar/_search.py +0 -393
- scitex/scholar/_vector_search.py +0 -370
- scitex/scholar/_web_sources.py +0 -457
- scitex/stats/desc/__init__.py +0 -40
- scitex-2.0.0.dist-info/METADATA +0 -307
- scitex-2.0.0.dist-info/RECORD +0 -572
- scitex-2.0.0.dist-info/licenses/LICENSE +0 -7
- /scitex/ai/{act → activation}/__init__.py +0 -0
- /scitex/ai/{act → activation}/_define.py +0 -0
- /scitex/ai/{early_stopping.py → training/_EarlyStopping.py} +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_ImportExportMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_IndexMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_RowMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_TableMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/__init__.py +0 -0
- /scitex/{stats → stats_v01}/_calc_partial_corr.py +0 -0
- /scitex/{stats → stats_v01}/_corr_test_multi.py +0 -0
- /scitex/{stats → stats_v01}/_corr_test_wrapper.py +0 -0
- /scitex/{stats → stats_v01}/_describe_wrapper.py +0 -0
- /scitex/{stats → stats_v01}/_multiple_corrections.py +0 -0
- /scitex/{stats → stats_v01}/_nan_stats.py +0 -0
- /scitex/{stats → stats_v01}/_p2stars_wrapper.py +0 -0
- /scitex/{stats → stats_v01}/_statistical_tests.py +0 -0
- /scitex/{stats/desc/_describe.py → stats_v01/desc/_describe_v01-20250920_145731.py} +0 -0
- /scitex/{stats/desc/_real.py → stats_v01/desc/_real_v01-20250920_145731.py} +0 -0
- /scitex/{stats → stats_v01}/multiple/__init__.py +0 -0
- /scitex/{stats → stats_v01}/multiple/_bonferroni_correction.py +0 -0
- /scitex/{stats → stats_v01}/multiple/_fdr_correction.py +0 -0
- /scitex/{stats → stats_v01}/multiple/_multicompair.py +0 -0
- /scitex/{stats → stats_v01}/tests/__corr_test.py +0 -0
- /scitex/{stats → stats_v01}/tests/__corr_test_multi.py +0 -0
- /scitex/{stats → stats_v01}/tests/__corr_test_single.py +0 -0
- /scitex/{stats → stats_v01}/tests/__init__.py +0 -0
- /scitex/{stats → stats_v01}/tests/_brunner_munzel_test.py +0 -0
- /scitex/{stats → stats_v01}/tests/_nocorrelation_test.py +0 -0
- /scitex/{stats → stats_v01}/tests/_smirnov_grubbs.py +0 -0
- {scitex-2.0.0.dist-info → scitex-2.1.0.dist-info}/top_level.txt +0 -0
scitex/io/_load_modules/_pdf.py
CHANGED
|
@@ -1,31 +1,886 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
|
-
#
|
|
4
|
-
# File:
|
|
3
|
+
# Timestamp: "2025-10-06 10:27:52 (ywatanabe)"
|
|
4
|
+
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/io/_load_modules/_pdf.py
|
|
5
|
+
# ----------------------------------------
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
import os
|
|
8
|
+
__FILE__ = __file__
|
|
9
|
+
__DIR__ = os.path.dirname(__FILE__)
|
|
10
|
+
# ----------------------------------------
|
|
11
|
+
|
|
12
|
+
"""
|
|
13
|
+
Enhanced PDF loading module with comprehensive extraction capabilities.
|
|
14
|
+
|
|
15
|
+
This module provides advanced PDF extraction for scientific papers, including:
|
|
16
|
+
- Text extraction with formatting preservation
|
|
17
|
+
- Table extraction as pandas DataFrames
|
|
18
|
+
- Image extraction with metadata
|
|
19
|
+
- Section-aware text parsing
|
|
20
|
+
- Multiple extraction modes for different use cases
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import hashlib
|
|
24
|
+
import re
|
|
25
|
+
import tempfile
|
|
26
|
+
from typing import Any, Dict, List
|
|
27
|
+
|
|
28
|
+
from scitex import logging
|
|
29
|
+
from scitex.dict import DotDict
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
# Try to import PDF libraries in order of preference
|
|
34
|
+
try:
|
|
35
|
+
import fitz # PyMuPDF - preferred for text and images
|
|
36
|
+
|
|
37
|
+
FITZ_AVAILABLE = True
|
|
38
|
+
except ImportError:
|
|
39
|
+
FITZ_AVAILABLE = False
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
import pdfplumber # Best for table extraction
|
|
43
|
+
|
|
44
|
+
PDFPLUMBER_AVAILABLE = True
|
|
45
|
+
except ImportError:
|
|
46
|
+
PDFPLUMBER_AVAILABLE = False
|
|
5
47
|
|
|
6
48
|
try:
|
|
7
|
-
import PyPDF2
|
|
49
|
+
import PyPDF2 # Fallback option
|
|
50
|
+
|
|
51
|
+
PYPDF2_AVAILABLE = True
|
|
8
52
|
except ImportError:
|
|
9
|
-
|
|
53
|
+
PYPDF2_AVAILABLE = False
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
import pandas as pd
|
|
57
|
+
|
|
58
|
+
PANDAS_AVAILABLE = True
|
|
59
|
+
except ImportError:
|
|
60
|
+
PANDAS_AVAILABLE = False
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _load_pdf(lpath: str, mode: str = "full", **kwargs) -> Any:
|
|
64
|
+
"""
|
|
65
|
+
Load PDF file with comprehensive extraction capabilities.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
lpath: Path to PDF file
|
|
69
|
+
mode: Extraction mode (default: 'full')
|
|
70
|
+
- 'full': Complete extraction including text, sections, metadata, pages, tables, and images
|
|
71
|
+
- 'scientific': Optimized for scientific papers (text + sections + tables + images + stats)
|
|
72
|
+
- 'text': Plain text extraction only
|
|
73
|
+
- 'sections': Section-aware text extraction
|
|
74
|
+
- 'tables': Extract tables as DataFrames
|
|
75
|
+
- 'images': Extract images with metadata
|
|
76
|
+
- 'metadata': PDF metadata only
|
|
77
|
+
- 'pages': Page-by-page extraction
|
|
78
|
+
**kwargs: Additional arguments
|
|
79
|
+
- backend: 'auto' (default), 'fitz', 'pdfplumber', or 'pypdf2'
|
|
80
|
+
- clean_text: Clean extracted text (default: True)
|
|
81
|
+
- extract_images: Extract images to files (default: False for 'full' mode, True for 'scientific')
|
|
82
|
+
- output_dir: Directory for extracted images/tables (default: temp dir)
|
|
83
|
+
- save_as_jpg: Convert all extracted images to JPG format (default: True)
|
|
84
|
+
- table_settings: Dict of pdfplumber table extraction settings
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
Extracted content based on mode:
|
|
88
|
+
- 'text': str
|
|
89
|
+
- 'sections': Dict[str, str]
|
|
90
|
+
- 'tables': Dict[int, List[pd.DataFrame]]
|
|
91
|
+
- 'images': List[Dict] with image metadata
|
|
92
|
+
- 'metadata': Dict with PDF metadata
|
|
93
|
+
- 'pages': List[Dict] with page content
|
|
94
|
+
- 'full': Dict with comprehensive extraction (text, sections, metadata, pages, tables, images, stats)
|
|
95
|
+
- 'scientific': Dict with scientific paper extraction (text, sections, metadata, tables, images, stats)
|
|
96
|
+
|
|
97
|
+
Examples:
|
|
98
|
+
>>> import scitex.io as stx
|
|
99
|
+
|
|
100
|
+
>>> # Full extraction (default) - everything included
|
|
101
|
+
>>> data = stx.load("paper.pdf")
|
|
102
|
+
>>> print(data['full_text']) # Complete text
|
|
103
|
+
>>> print(data['sections']) # Parsed sections
|
|
104
|
+
>>> print(data['tables']) # All tables as DataFrames
|
|
105
|
+
>>> print(data['metadata']) # PDF metadata
|
|
106
|
+
>>> print(data['pages']) # Page-by-page content
|
|
107
|
+
>>> print(data['stats']) # Statistics
|
|
108
|
+
|
|
109
|
+
>>> # Scientific mode (recommended for papers) - optimized for research
|
|
110
|
+
>>> paper = stx.load("paper.pdf", mode="scientific")
|
|
111
|
+
>>> print(paper['text']) # Full text
|
|
112
|
+
>>> print(paper['sections']) # Sections (Abstract, Methods, etc.)
|
|
113
|
+
>>> print(paper['tables']) # All tables as DataFrames
|
|
114
|
+
>>> print(paper['images']) # Image metadata
|
|
115
|
+
>>> print(paper['stats']) # Content statistics
|
|
116
|
+
|
|
117
|
+
>>> # Simple text extraction only
|
|
118
|
+
>>> text = stx.load("paper.pdf", mode="text")
|
|
119
|
+
|
|
120
|
+
>>> # Extract tables only
|
|
121
|
+
>>> tables = stx.load("paper.pdf", mode="tables")
|
|
122
|
+
"""
|
|
123
|
+
mode = kwargs.get("mode", mode)
|
|
124
|
+
backend = kwargs.get("backend", "auto")
|
|
125
|
+
clean_text = kwargs.get("clean_text", True)
|
|
126
|
+
extract_images = kwargs.get("extract_images", False)
|
|
127
|
+
output_dir = kwargs.get("output_dir", None)
|
|
128
|
+
table_settings = kwargs.get("table_settings", {})
|
|
129
|
+
|
|
130
|
+
# Validate file exists
|
|
131
|
+
if not os.path.exists(lpath):
|
|
132
|
+
raise FileNotFoundError(f"PDF file not found: {lpath}")
|
|
133
|
+
|
|
134
|
+
# Extension validation removed - handled by load() function
|
|
135
|
+
# This allows loading files without extensions when ext='pdf' is specified
|
|
136
|
+
|
|
137
|
+
# Select backend based on mode and availability
|
|
138
|
+
backend = _select_backend(mode, backend)
|
|
139
|
+
|
|
140
|
+
# Create output directory if needed
|
|
141
|
+
if output_dir is None and (
|
|
142
|
+
extract_images or mode in ["images", "scientific", "full"]
|
|
143
|
+
):
|
|
144
|
+
output_dir = tempfile.mkdtemp(prefix="pdf_extract_")
|
|
145
|
+
logger.debug(f"Using temporary directory: {output_dir}")
|
|
146
|
+
|
|
147
|
+
# Extract based on mode
|
|
148
|
+
if mode == "text":
|
|
149
|
+
return _extract_text(lpath, backend, clean_text)
|
|
150
|
+
elif mode == "sections":
|
|
151
|
+
return _extract_sections(lpath, backend, clean_text)
|
|
152
|
+
elif mode == "tables":
|
|
153
|
+
return _extract_tables(lpath, table_settings)
|
|
154
|
+
elif mode == "images":
|
|
155
|
+
save_as_jpg = kwargs.get("save_as_jpg", True)
|
|
156
|
+
return _extract_images(lpath, output_dir, save_as_jpg)
|
|
157
|
+
elif mode == "metadata":
|
|
158
|
+
return _extract_metadata(lpath, backend)
|
|
159
|
+
elif mode == "pages":
|
|
160
|
+
return _extract_pages(lpath, backend, clean_text)
|
|
161
|
+
elif mode == "scientific":
|
|
162
|
+
save_as_jpg = kwargs.get("save_as_jpg", True)
|
|
163
|
+
return _extract_scientific(
|
|
164
|
+
lpath, clean_text, output_dir, table_settings, save_as_jpg
|
|
165
|
+
)
|
|
166
|
+
elif mode == "full":
|
|
167
|
+
save_as_jpg = kwargs.get("save_as_jpg", True)
|
|
168
|
+
return _extract_full(
|
|
169
|
+
lpath,
|
|
170
|
+
backend,
|
|
171
|
+
clean_text,
|
|
172
|
+
extract_images,
|
|
173
|
+
output_dir,
|
|
174
|
+
table_settings,
|
|
175
|
+
save_as_jpg,
|
|
176
|
+
)
|
|
177
|
+
else:
|
|
178
|
+
raise ValueError(f"Unknown extraction mode: {mode}")
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _select_backend(mode: str, requested: str) -> str:
|
|
182
|
+
"""Select appropriate backend based on mode and availability."""
|
|
183
|
+
if requested != "auto":
|
|
184
|
+
return requested
|
|
185
|
+
|
|
186
|
+
# Mode-specific backend selection
|
|
187
|
+
if mode in ["tables"]:
|
|
188
|
+
if PDFPLUMBER_AVAILABLE:
|
|
189
|
+
return "pdfplumber"
|
|
190
|
+
else:
|
|
191
|
+
logger.warning(
|
|
192
|
+
"pdfplumber not available for table extraction. Install with: pip install pdfplumber"
|
|
193
|
+
)
|
|
194
|
+
return "fitz" if FITZ_AVAILABLE else "pypdf2"
|
|
195
|
+
|
|
196
|
+
elif mode in ["images", "scientific", "full"]:
|
|
197
|
+
if FITZ_AVAILABLE:
|
|
198
|
+
return "fitz"
|
|
199
|
+
else:
|
|
200
|
+
logger.warning(
|
|
201
|
+
"PyMuPDF (fitz) recommended for image extraction. Install with: pip install PyMuPDF"
|
|
202
|
+
)
|
|
203
|
+
return "pdfplumber" if PDFPLUMBER_AVAILABLE else "pypdf2"
|
|
204
|
+
|
|
205
|
+
else: # text, sections, metadata, pages
|
|
206
|
+
if FITZ_AVAILABLE:
|
|
207
|
+
return "fitz"
|
|
208
|
+
elif PDFPLUMBER_AVAILABLE:
|
|
209
|
+
return "pdfplumber"
|
|
210
|
+
elif PYPDF2_AVAILABLE:
|
|
211
|
+
return "pypdf2"
|
|
212
|
+
else:
|
|
213
|
+
raise ImportError(
|
|
214
|
+
"No PDF library available. Install one of:\n"
|
|
215
|
+
" pip install PyMuPDF # Recommended\n"
|
|
216
|
+
" pip install pdfplumber # Best for tables\n"
|
|
217
|
+
" pip install PyPDF2 # Basic fallback"
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _extract_text(lpath: str, backend: str, clean: bool) -> str:
|
|
222
|
+
"""Extract plain text from PDF."""
|
|
223
|
+
if backend == "fitz":
|
|
224
|
+
return _extract_text_fitz(lpath, clean)
|
|
225
|
+
elif backend == "pdfplumber":
|
|
226
|
+
return _extract_text_pdfplumber(lpath, clean)
|
|
227
|
+
else:
|
|
228
|
+
return _extract_text_pypdf2(lpath, clean)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _extract_text_fitz(lpath: str, clean: bool) -> str:
|
|
232
|
+
"""Extract text using PyMuPDF."""
|
|
233
|
+
if not FITZ_AVAILABLE:
|
|
234
|
+
raise ImportError("PyMuPDF (fitz) not available")
|
|
235
|
+
|
|
236
|
+
try:
|
|
237
|
+
doc = fitz.open(lpath)
|
|
238
|
+
text_parts = []
|
|
239
|
+
|
|
240
|
+
for page_num, page in enumerate(doc):
|
|
241
|
+
text = page.get_text()
|
|
242
|
+
if text.strip():
|
|
243
|
+
text_parts.append(text)
|
|
244
|
+
|
|
245
|
+
doc.close()
|
|
246
|
+
|
|
247
|
+
full_text = "\n".join(text_parts)
|
|
248
|
+
|
|
249
|
+
if clean:
|
|
250
|
+
full_text = _clean_pdf_text(full_text)
|
|
251
|
+
|
|
252
|
+
return full_text
|
|
253
|
+
|
|
254
|
+
except Exception as e:
|
|
255
|
+
logger.error(f"Error extracting text with fitz from {lpath}: {e}")
|
|
256
|
+
raise
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _extract_text_pdfplumber(lpath: str, clean: bool) -> str:
|
|
260
|
+
"""Extract text using pdfplumber."""
|
|
261
|
+
if not PDFPLUMBER_AVAILABLE:
|
|
262
|
+
raise ImportError("pdfplumber not available")
|
|
263
|
+
|
|
264
|
+
try:
|
|
265
|
+
import pdfplumber
|
|
266
|
+
|
|
267
|
+
text_parts = []
|
|
268
|
+
with pdfplumber.open(lpath) as pdf:
|
|
269
|
+
for page in pdf.pages:
|
|
270
|
+
text = page.extract_text()
|
|
271
|
+
if text:
|
|
272
|
+
text_parts.append(text)
|
|
273
|
+
|
|
274
|
+
full_text = "\n".join(text_parts)
|
|
275
|
+
|
|
276
|
+
if clean:
|
|
277
|
+
full_text = _clean_pdf_text(full_text)
|
|
278
|
+
|
|
279
|
+
return full_text
|
|
280
|
+
|
|
281
|
+
except Exception as e:
|
|
282
|
+
logger.error(
|
|
283
|
+
f"Error extracting text with pdfplumber from {lpath}: {e}"
|
|
284
|
+
)
|
|
285
|
+
raise
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _extract_text_pypdf2(lpath: str, clean: bool) -> str:
|
|
289
|
+
"""Extract text using PyPDF2."""
|
|
290
|
+
if not PYPDF2_AVAILABLE:
|
|
291
|
+
raise ImportError("PyPDF2 not available")
|
|
292
|
+
|
|
293
|
+
try:
|
|
294
|
+
reader = PyPDF2.PdfReader(lpath)
|
|
295
|
+
text_parts = []
|
|
296
|
+
|
|
297
|
+
for page_num in range(len(reader.pages)):
|
|
298
|
+
page = reader.pages[page_num]
|
|
299
|
+
text = page.extract_text()
|
|
300
|
+
if text.strip():
|
|
301
|
+
text_parts.append(text)
|
|
302
|
+
|
|
303
|
+
full_text = "\n".join(text_parts)
|
|
304
|
+
|
|
305
|
+
if clean:
|
|
306
|
+
full_text = _clean_pdf_text(full_text)
|
|
307
|
+
|
|
308
|
+
return full_text
|
|
309
|
+
|
|
310
|
+
except Exception as e:
|
|
311
|
+
logger.error(f"Error extracting text with PyPDF2 from {lpath}: {e}")
|
|
312
|
+
raise
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _extract_tables(
|
|
316
|
+
lpath: str, table_settings: Dict = None
|
|
317
|
+
) -> Dict[int, List["pd.DataFrame"]]:
|
|
318
|
+
"""
|
|
319
|
+
Extract tables from PDF as pandas DataFrames.
|
|
320
|
+
|
|
321
|
+
Returns:
|
|
322
|
+
Dict mapping page numbers to list of DataFrames
|
|
323
|
+
"""
|
|
324
|
+
if not PDFPLUMBER_AVAILABLE:
|
|
325
|
+
raise ImportError(
|
|
326
|
+
"pdfplumber required for table extraction. Install with:\n"
|
|
327
|
+
" pip install pdfplumber pandas"
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
if not PANDAS_AVAILABLE:
|
|
331
|
+
raise ImportError("pandas required for table extraction")
|
|
332
|
+
|
|
333
|
+
import pandas as pd
|
|
334
|
+
import pdfplumber
|
|
335
|
+
|
|
336
|
+
tables_dict = {}
|
|
337
|
+
table_settings = table_settings or {}
|
|
338
|
+
|
|
339
|
+
try:
|
|
340
|
+
with pdfplumber.open(lpath) as pdf:
|
|
341
|
+
for page_num, page in enumerate(pdf.pages):
|
|
342
|
+
# Extract tables from page
|
|
343
|
+
tables = page.extract_tables(**table_settings)
|
|
344
|
+
|
|
345
|
+
if tables:
|
|
346
|
+
# Convert to DataFrames
|
|
347
|
+
dfs = []
|
|
348
|
+
for table in tables:
|
|
349
|
+
if table and len(table) > 0:
|
|
350
|
+
# First row as header if it looks like headers
|
|
351
|
+
if len(table) > 1 and all(
|
|
352
|
+
isinstance(cell, str)
|
|
353
|
+
for cell in table[0]
|
|
354
|
+
if cell
|
|
355
|
+
):
|
|
356
|
+
df = pd.DataFrame(table[1:], columns=table[0])
|
|
357
|
+
else:
|
|
358
|
+
df = pd.DataFrame(table)
|
|
359
|
+
|
|
360
|
+
# Clean up DataFrame
|
|
361
|
+
df = (
|
|
362
|
+
df.replace("", None)
|
|
363
|
+
.dropna(how="all", axis=1)
|
|
364
|
+
.dropna(how="all", axis=0)
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
if not df.empty:
|
|
368
|
+
dfs.append(df)
|
|
369
|
+
|
|
370
|
+
if dfs:
|
|
371
|
+
tables_dict[page_num] = dfs
|
|
372
|
+
|
|
373
|
+
logger.info(f"Extracted tables from {len(tables_dict)} pages")
|
|
374
|
+
return tables_dict
|
|
375
|
+
|
|
376
|
+
except Exception as e:
|
|
377
|
+
logger.error(f"Error extracting tables: {e}")
|
|
378
|
+
raise
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _extract_images(
|
|
382
|
+
lpath: str, output_dir: str = None, save_as_jpg: bool = True
|
|
383
|
+
) -> List[Dict[str, Any]]:
|
|
384
|
+
"""
|
|
385
|
+
Extract images from PDF with metadata.
|
|
386
|
+
|
|
387
|
+
Args:
|
|
388
|
+
lpath: Path to PDF file
|
|
389
|
+
output_dir: Directory to save images (optional)
|
|
390
|
+
save_as_jpg: If True, convert all images to JPG format (default: True)
|
|
10
391
|
|
|
392
|
+
Returns:
|
|
393
|
+
List of dicts containing image metadata and paths
|
|
394
|
+
"""
|
|
395
|
+
if not FITZ_AVAILABLE:
|
|
396
|
+
raise ImportError(
|
|
397
|
+
"PyMuPDF (fitz) required for image extraction. Install with:\n"
|
|
398
|
+
" pip install PyMuPDF"
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
images_info = []
|
|
11
402
|
|
|
12
|
-
def _load_pdf(lpath, **kwargs):
|
|
13
|
-
"""Load PDF file and return extracted text."""
|
|
14
|
-
if PyPDF2 is None:
|
|
15
|
-
raise ImportError("PyPDF2 is required for PDF loading. Install with: pip install PyPDF2")
|
|
16
|
-
|
|
17
403
|
try:
|
|
18
|
-
|
|
19
|
-
|
|
404
|
+
doc = fitz.open(lpath)
|
|
405
|
+
|
|
406
|
+
for page_num, page in enumerate(doc):
|
|
407
|
+
image_list = page.get_images()
|
|
408
|
+
|
|
409
|
+
for img_index, img in enumerate(image_list):
|
|
410
|
+
xref = img[0]
|
|
411
|
+
|
|
412
|
+
# Extract image data
|
|
413
|
+
base_image = doc.extract_image(xref)
|
|
414
|
+
image_bytes = base_image["image"]
|
|
415
|
+
original_ext = base_image["ext"]
|
|
416
|
+
|
|
417
|
+
image_info = {
|
|
418
|
+
"page": page_num + 1,
|
|
419
|
+
"index": img_index,
|
|
420
|
+
"width": base_image["width"],
|
|
421
|
+
"height": base_image["height"],
|
|
422
|
+
"colorspace": base_image["colorspace"],
|
|
423
|
+
"bpc": base_image["bpc"], # bits per component
|
|
424
|
+
"original_ext": original_ext,
|
|
425
|
+
"size_bytes": len(image_bytes),
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
# Save image if output directory provided
|
|
429
|
+
if output_dir:
|
|
430
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
431
|
+
|
|
432
|
+
if save_as_jpg and original_ext not in ["jpg", "jpeg"]:
|
|
433
|
+
# Convert to JPG using PIL
|
|
434
|
+
try:
|
|
435
|
+
from PIL import Image
|
|
436
|
+
import io
|
|
437
|
+
|
|
438
|
+
# Open image from bytes
|
|
439
|
+
img_pil = Image.open(io.BytesIO(image_bytes))
|
|
440
|
+
|
|
441
|
+
# Convert RGBA to RGB if necessary
|
|
442
|
+
if img_pil.mode in ('RGBA', 'LA', 'P'):
|
|
443
|
+
# Create a white background
|
|
444
|
+
background = Image.new('RGB', img_pil.size, (255, 255, 255))
|
|
445
|
+
if img_pil.mode == 'P':
|
|
446
|
+
img_pil = img_pil.convert('RGBA')
|
|
447
|
+
background.paste(img_pil, mask=img_pil.split()[-1] if img_pil.mode == 'RGBA' else None)
|
|
448
|
+
img_pil = background
|
|
449
|
+
elif img_pil.mode != 'RGB':
|
|
450
|
+
img_pil = img_pil.convert('RGB')
|
|
451
|
+
|
|
452
|
+
# Save as JPG
|
|
453
|
+
filename = f"page_{page_num + 1}_img_{img_index}.jpg"
|
|
454
|
+
filepath = os.path.join(output_dir, filename)
|
|
455
|
+
img_pil.save(filepath, 'JPEG', quality=95)
|
|
456
|
+
|
|
457
|
+
image_info["ext"] = "jpg"
|
|
458
|
+
except ImportError:
|
|
459
|
+
logger.warning("PIL not available for image conversion. Install with: pip install Pillow")
|
|
460
|
+
# Fall back to original format
|
|
461
|
+
filename = f"page_{page_num + 1}_img_{img_index}.{original_ext}"
|
|
462
|
+
filepath = os.path.join(output_dir, filename)
|
|
463
|
+
with open(filepath, "wb") as img_file:
|
|
464
|
+
img_file.write(image_bytes)
|
|
465
|
+
image_info["ext"] = original_ext
|
|
466
|
+
else:
|
|
467
|
+
# Save with original format
|
|
468
|
+
ext = "jpg" if original_ext == "jpeg" else original_ext
|
|
469
|
+
filename = f"page_{page_num + 1}_img_{img_index}.{ext}"
|
|
470
|
+
filepath = os.path.join(output_dir, filename)
|
|
471
|
+
with open(filepath, "wb") as img_file:
|
|
472
|
+
img_file.write(image_bytes)
|
|
473
|
+
image_info["ext"] = ext
|
|
474
|
+
|
|
475
|
+
image_info["filepath"] = filepath
|
|
476
|
+
image_info["filename"] = filename
|
|
477
|
+
|
|
478
|
+
images_info.append(image_info)
|
|
479
|
+
|
|
480
|
+
doc.close()
|
|
481
|
+
|
|
482
|
+
logger.info(f"Extracted {len(images_info)} images from PDF")
|
|
483
|
+
return images_info
|
|
484
|
+
|
|
485
|
+
except Exception as e:
|
|
486
|
+
logger.error(f"Error extracting images: {e}")
|
|
487
|
+
raise
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _extract_sections(lpath: str, backend: str, clean: bool) -> Dict[str, str]:
|
|
491
|
+
"""Extract text organized by sections."""
|
|
492
|
+
# Get full text first
|
|
493
|
+
text = _extract_text(lpath, backend, clean=False)
|
|
494
|
+
|
|
495
|
+
# Parse into sections
|
|
496
|
+
sections = _parse_sections(text)
|
|
497
|
+
|
|
498
|
+
# Clean section text if requested
|
|
499
|
+
if clean:
|
|
500
|
+
for section, content in sections.items():
|
|
501
|
+
sections[section] = _clean_pdf_text(content)
|
|
502
|
+
|
|
503
|
+
return sections
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _parse_sections(text: str) -> Dict[str, str]:
|
|
507
|
+
"""
|
|
508
|
+
Parse text into sections based on IMRaD structure.
|
|
509
|
+
|
|
510
|
+
Follows the standard scientific paper structure:
|
|
511
|
+
- frontpage: Title, authors, affiliations, keywords
|
|
512
|
+
- abstract: Paper summary
|
|
513
|
+
- introduction: Background and motivation
|
|
514
|
+
- methods: Methodology (materials and methods, experimental design)
|
|
515
|
+
- results: Findings
|
|
516
|
+
- discussion: Interpretation and implications
|
|
517
|
+
- references: Citations
|
|
518
|
+
"""
|
|
519
|
+
sections = {}
|
|
520
|
+
current_section = "frontpage"
|
|
521
|
+
current_text = []
|
|
522
|
+
|
|
523
|
+
# Simplified section patterns - IMRaD + frontpage only
|
|
524
|
+
# Only match standalone section headers (exact matches)
|
|
525
|
+
section_patterns = [
|
|
526
|
+
r"^abstract\s*$",
|
|
527
|
+
r"^summary\s*$",
|
|
528
|
+
r"^introduction\s*$",
|
|
529
|
+
r"^background\s*$",
|
|
530
|
+
r"^methods?\s*$",
|
|
531
|
+
r"^materials?\s+and\s+methods?\s*$",
|
|
532
|
+
r"^methodology\s*$",
|
|
533
|
+
r"^results?\s*$",
|
|
534
|
+
r"^discussion\s*$",
|
|
535
|
+
r"^references?\s*$",
|
|
536
|
+
]
|
|
537
|
+
|
|
538
|
+
lines = text.split("\n")
|
|
539
|
+
|
|
540
|
+
for line in lines:
|
|
541
|
+
line_lower = line.lower().strip()
|
|
542
|
+
line_stripped = line.strip()
|
|
543
|
+
|
|
544
|
+
# Check if this line is a section header
|
|
545
|
+
is_header = False
|
|
546
|
+
for pattern in section_patterns:
|
|
547
|
+
if re.match(pattern, line_lower):
|
|
548
|
+
# Additional validation: header lines should be short (< 50 chars)
|
|
549
|
+
# and not contain numbers/punctuation (except spaces)
|
|
550
|
+
if len(line_stripped) < 50:
|
|
551
|
+
# Save previous section
|
|
552
|
+
if current_text:
|
|
553
|
+
sections[current_section] = "\n".join(current_text).strip()
|
|
554
|
+
|
|
555
|
+
# Start new section
|
|
556
|
+
current_section = line_lower.strip()
|
|
557
|
+
current_text = []
|
|
558
|
+
is_header = True
|
|
559
|
+
break
|
|
560
|
+
|
|
561
|
+
if not is_header:
|
|
562
|
+
current_text.append(line)
|
|
563
|
+
|
|
564
|
+
# Save last section
|
|
565
|
+
if current_text:
|
|
566
|
+
sections[current_section] = "\n".join(current_text).strip()
|
|
567
|
+
|
|
568
|
+
return sections
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def _extract_metadata(lpath: str, backend: str) -> Dict[str, Any]:
|
|
572
|
+
"""Extract PDF metadata."""
|
|
573
|
+
metadata = {
|
|
574
|
+
"file_path": lpath,
|
|
575
|
+
"file_name": os.path.basename(lpath),
|
|
576
|
+
"file_size": os.path.getsize(lpath),
|
|
577
|
+
"backend": backend,
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
if backend == "fitz" and FITZ_AVAILABLE:
|
|
581
|
+
try:
|
|
582
|
+
doc = fitz.open(lpath)
|
|
583
|
+
pdf_metadata = doc.metadata
|
|
584
|
+
|
|
585
|
+
metadata.update(
|
|
586
|
+
{
|
|
587
|
+
"title": pdf_metadata.get("title", ""),
|
|
588
|
+
"author": pdf_metadata.get("author", ""),
|
|
589
|
+
"subject": pdf_metadata.get("subject", ""),
|
|
590
|
+
"keywords": pdf_metadata.get("keywords", ""),
|
|
591
|
+
"creator": pdf_metadata.get("creator", ""),
|
|
592
|
+
"producer": pdf_metadata.get("producer", ""),
|
|
593
|
+
"creation_date": str(pdf_metadata.get("creationDate", "")),
|
|
594
|
+
"modification_date": str(pdf_metadata.get("modDate", "")),
|
|
595
|
+
"pages": len(doc),
|
|
596
|
+
"encrypted": doc.is_encrypted,
|
|
597
|
+
}
|
|
598
|
+
)
|
|
599
|
+
|
|
600
|
+
doc.close()
|
|
601
|
+
|
|
602
|
+
except Exception as e:
|
|
603
|
+
logger.error(f"Error extracting metadata with fitz: {e}")
|
|
604
|
+
|
|
605
|
+
elif backend == "pdfplumber" and PDFPLUMBER_AVAILABLE:
|
|
606
|
+
try:
|
|
607
|
+
import pdfplumber
|
|
608
|
+
|
|
609
|
+
with pdfplumber.open(lpath) as pdf:
|
|
610
|
+
metadata["pages"] = len(pdf.pages)
|
|
611
|
+
if hasattr(pdf, "metadata"):
|
|
612
|
+
metadata.update(pdf.metadata)
|
|
613
|
+
except Exception as e:
|
|
614
|
+
logger.error(f"Error extracting metadata with pdfplumber: {e}")
|
|
615
|
+
|
|
616
|
+
elif backend == "pypdf2" and PYPDF2_AVAILABLE:
|
|
617
|
+
try:
|
|
618
|
+
reader = PyPDF2.PdfReader(lpath)
|
|
619
|
+
|
|
620
|
+
if reader.metadata:
|
|
621
|
+
metadata.update(
|
|
622
|
+
{
|
|
623
|
+
"title": reader.metadata.get("/Title", ""),
|
|
624
|
+
"author": reader.metadata.get("/Author", ""),
|
|
625
|
+
"subject": reader.metadata.get("/Subject", ""),
|
|
626
|
+
"creator": reader.metadata.get("/Creator", ""),
|
|
627
|
+
"producer": reader.metadata.get("/Producer", ""),
|
|
628
|
+
"creation_date": str(
|
|
629
|
+
reader.metadata.get("/CreationDate", "")
|
|
630
|
+
),
|
|
631
|
+
"modification_date": str(
|
|
632
|
+
reader.metadata.get("/ModDate", "")
|
|
633
|
+
),
|
|
634
|
+
}
|
|
635
|
+
)
|
|
636
|
+
|
|
637
|
+
metadata["pages"] = len(reader.pages)
|
|
638
|
+
metadata["encrypted"] = reader.is_encrypted
|
|
639
|
+
|
|
640
|
+
except Exception as e:
|
|
641
|
+
logger.error(f"Error extracting metadata with PyPDF2: {e}")
|
|
20
642
|
|
|
643
|
+
# Generate file hash
|
|
644
|
+
metadata["md5_hash"] = _calculate_file_hash(lpath)
|
|
645
|
+
|
|
646
|
+
return metadata
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
def _extract_pages(
|
|
650
|
+
lpath: str, backend: str, clean: bool
|
|
651
|
+
) -> List[Dict[str, Any]]:
|
|
652
|
+
"""Extract content page by page."""
|
|
653
|
+
pages = []
|
|
654
|
+
|
|
655
|
+
if backend == "fitz" and FITZ_AVAILABLE:
|
|
656
|
+
doc = fitz.open(lpath)
|
|
657
|
+
|
|
658
|
+
for page_num, page in enumerate(doc):
|
|
659
|
+
text = page.get_text()
|
|
660
|
+
if clean:
|
|
661
|
+
text = _clean_pdf_text(text)
|
|
662
|
+
|
|
663
|
+
pages.append(
|
|
664
|
+
{
|
|
665
|
+
"page_number": page_num + 1,
|
|
666
|
+
"text": text,
|
|
667
|
+
"char_count": len(text),
|
|
668
|
+
"word_count": len(text.split()),
|
|
669
|
+
}
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
doc.close()
|
|
673
|
+
|
|
674
|
+
elif backend == "pdfplumber" and PDFPLUMBER_AVAILABLE:
|
|
675
|
+
import pdfplumber
|
|
676
|
+
|
|
677
|
+
with pdfplumber.open(lpath) as pdf:
|
|
678
|
+
for page_num, page in enumerate(pdf.pages):
|
|
679
|
+
text = page.extract_text() or ""
|
|
680
|
+
if clean:
|
|
681
|
+
text = _clean_pdf_text(text)
|
|
682
|
+
|
|
683
|
+
pages.append(
|
|
684
|
+
{
|
|
685
|
+
"page_number": page_num + 1,
|
|
686
|
+
"text": text,
|
|
687
|
+
"char_count": len(text),
|
|
688
|
+
"word_count": len(text.split()),
|
|
689
|
+
}
|
|
690
|
+
)
|
|
691
|
+
|
|
692
|
+
elif backend == "pypdf2" and PYPDF2_AVAILABLE:
|
|
21
693
|
reader = PyPDF2.PdfReader(lpath)
|
|
22
|
-
|
|
694
|
+
|
|
23
695
|
for page_num in range(len(reader.pages)):
|
|
24
696
|
page = reader.pages[page_num]
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
697
|
+
text = page.extract_text()
|
|
698
|
+
if clean:
|
|
699
|
+
text = _clean_pdf_text(text)
|
|
700
|
+
|
|
701
|
+
pages.append(
|
|
702
|
+
{
|
|
703
|
+
"page_number": page_num + 1,
|
|
704
|
+
"text": text,
|
|
705
|
+
"char_count": len(text),
|
|
706
|
+
"word_count": len(text.split()),
|
|
707
|
+
}
|
|
708
|
+
)
|
|
709
|
+
|
|
710
|
+
return pages
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def _extract_scientific(
    lpath: str, clean_text: bool, output_dir: str, table_settings: Dict, save_as_jpg: bool = True
) -> DotDict:
    """
    Optimized extraction for scientific papers.

    Pulls text, sections, metadata, tables and images from the PDF at
    *lpath* and bundles them, with summary statistics, into a DotDict.
    Missing optional backends degrade gracefully to empty results; a
    top-level failure is recorded under the ``error`` key.
    """
    result = {
        "pdf_path": lpath,
        "filename": os.path.basename(lpath),
        "extraction_mode": "scientific",
    }

    try:
        # Text-level components share one auto-selected backend.
        backend = _select_backend("text", "auto")
        result["text"] = _extract_text(lpath, backend, clean_text)
        result["sections"] = _extract_sections(lpath, backend, clean_text)
        result["metadata"] = _extract_metadata(lpath, backend)

        # Tables: best effort, requires pdfplumber + pandas.
        if PDFPLUMBER_AVAILABLE and PANDAS_AVAILABLE:
            try:
                result["tables"] = _extract_tables(lpath, table_settings)
            except Exception as exc:
                logger.warning(f"Could not extract tables: {exc}")
                result["tables"] = {}
        else:
            result["tables"] = {}
            logger.info("Table extraction requires pdfplumber and pandas")

        # Images: best effort, requires PyMuPDF.
        if FITZ_AVAILABLE:
            try:
                result["images"] = _extract_images(lpath, output_dir, save_as_jpg)
            except Exception as exc:
                logger.warning(f"Could not extract images: {exc}")
                result["images"] = []
        else:
            result["images"] = []
            logger.info("Image extraction requires PyMuPDF (fitz)")

        # Summary statistics over everything gathered above.
        body = result["text"]
        stats = {
            "total_chars": len(body),
            "total_words": len(body.split()),
            "total_pages": result["metadata"].get("pages", 0),
            "num_sections": len(result["sections"]),
            "num_tables": sum(len(tbls) for tbls in result["tables"].values()),
            "num_images": len(result["images"]),
        }
        result["stats"] = stats

        logger.info(
            f"Scientific extraction complete: "
            f"{stats['total_pages']} pages, "
            f"{stats['num_sections']} sections, "
            f"{stats['num_tables']} tables, "
            f"{stats['num_images']} images"
        )

    except Exception as exc:
        logger.error(f"Error in scientific extraction: {exc}")
        result["error"] = str(exc)

    return DotDict(result)
def _extract_full(
    lpath: str,
    backend: str,
    clean: bool,
    extract_images: bool,
    output_dir: str,
    table_settings: Dict,
    save_as_jpg: bool = True,
) -> DotDict:
    """Extract comprehensive data from PDF.

    Parameters
    ----------
    lpath : str
        Local path of the PDF file.
    backend : str
        Text-extraction backend identifier (e.g. "fitz", "pdfplumber").
    clean : bool
        Whether extracted text is post-processed with ``_clean_pdf_text``.
    extract_images : bool
        Whether to extract embedded images (requires PyMuPDF).
    output_dir : str
        Directory where extracted images are written.
    table_settings : Dict
        Settings forwarded to the pdfplumber table extractor.
    save_as_jpg : bool
        Save extracted images as JPEG (default True).

    Returns
    -------
    DotDict
        Extraction results; a top-level failure is recorded under the
        ``error`` key on a partial result.
    """
    result = {
        "pdf_path": lpath,
        "filename": os.path.basename(lpath),
        "backend": backend,
        "extraction_params": {
            "clean_text": clean,
            "extract_images": extract_images,
        },
    }

    try:
        result["full_text"] = _extract_text(lpath, backend, clean)
        result["sections"] = _extract_sections(lpath, backend, clean)
        result["metadata"] = _extract_metadata(lpath, backend)
        result["pages"] = _extract_pages(lpath, backend, clean)

        # Tables: best effort, requires pdfplumber + pandas. Always set the
        # key so downstream access is uniform (consistent with
        # _extract_scientific, which previously differed from this function).
        if PDFPLUMBER_AVAILABLE and PANDAS_AVAILABLE:
            try:
                result["tables"] = _extract_tables(lpath, table_settings)
            except Exception as e:
                logger.warning(f"Could not extract tables: {e}")
                result["tables"] = {}
        else:
            result["tables"] = {}
            logger.info("Table extraction requires pdfplumber and pandas")

        # Images: opt-in, requires PyMuPDF. Key is always populated.
        if extract_images and FITZ_AVAILABLE:
            try:
                result["images"] = _extract_images(lpath, output_dir, save_as_jpg)
            except Exception as e:
                logger.warning(f"Could not extract images: {e}")
                result["images"] = []
        else:
            result["images"] = []
            if extract_images:
                logger.info("Image extraction requires PyMuPDF (fitz)")

        # Summary statistics over the extracted components.
        result["stats"] = {
            "total_chars": len(result["full_text"]),
            "total_words": len(result["full_text"].split()),
            "total_pages": len(result["pages"]),
            "num_sections": len(result["sections"]),
            "num_tables": sum(
                len(tables) for tables in result.get("tables", {}).values()
            ),
            "num_images": len(result.get("images", [])),
            "avg_words_per_page": (
                len(result["full_text"].split()) / len(result["pages"])
                if result["pages"]
                else 0
            ),
        }

    except Exception as e:
        logger.error(f"Error in full extraction: {e}")
        result["error"] = str(e)

    return DotDict(result)
def _clean_pdf_text(text: str) -> str:
|
|
852
|
+
"""Clean extracted PDF text."""
|
|
853
|
+
# Remove excessive whitespace
|
|
854
|
+
text = re.sub(r"\s+", " ", text)
|
|
855
|
+
|
|
856
|
+
# Fix hyphenated words at line breaks
|
|
857
|
+
text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)
|
|
858
|
+
|
|
859
|
+
# Remove page numbers (common patterns)
|
|
860
|
+
text = re.sub(r"\n\s*\d+\s*\n", "\n", text)
|
|
861
|
+
text = re.sub(r"Page\s+\d+\s+of\s+\d+", "", text, flags=re.IGNORECASE)
|
|
862
|
+
|
|
863
|
+
# Clean up common PDF artifacts
|
|
864
|
+
text = text.replace("\x00", "") # Null bytes
|
|
865
|
+
text = re.sub(r"[\x01-\x1f\x7f-\x9f]", "", text) # Control characters
|
|
866
|
+
|
|
867
|
+
# Normalize quotes and dashes
|
|
868
|
+
text = text.replace('"', '"').replace('"', '"')
|
|
869
|
+
text = text.replace(""", "'").replace(""", "'")
|
|
870
|
+
text = text.replace("–", "-").replace("—", "-")
|
|
871
|
+
|
|
872
|
+
# Remove multiple consecutive newlines
|
|
873
|
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
874
|
+
|
|
875
|
+
return text.strip()
|
|
876
|
+
|
|
29
877
|
|
|
878
|
+
def _calculate_file_hash(lpath: str) -> str:
|
|
879
|
+
"""Calculate MD5 hash of file."""
|
|
880
|
+
hash_md5 = hashlib.md5()
|
|
881
|
+
with open(lpath, "rb") as f:
|
|
882
|
+
for chunk in iter(lambda: f.read(4096), b""):
|
|
883
|
+
hash_md5.update(chunk)
|
|
884
|
+
return hash_md5.hexdigest()
|
|
30
885
|
|
|
31
886
|
# EOF
|