scitex 2.0.0__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +53 -15
- scitex/__main__.py +72 -26
- scitex/__version__.py +1 -1
- scitex/_sh.py +145 -23
- scitex/ai/__init__.py +30 -16
- scitex/ai/_gen_ai/_Anthropic.py +5 -7
- scitex/ai/_gen_ai/_BaseGenAI.py +2 -2
- scitex/ai/_gen_ai/_DeepSeek.py +10 -2
- scitex/ai/_gen_ai/_Google.py +2 -2
- scitex/ai/_gen_ai/_Llama.py +2 -2
- scitex/ai/_gen_ai/_OpenAI.py +2 -2
- scitex/ai/_gen_ai/_PARAMS.py +51 -65
- scitex/ai/_gen_ai/_Perplexity.py +2 -2
- scitex/ai/_gen_ai/__init__.py +25 -14
- scitex/ai/_gen_ai/_format_output_func.py +4 -4
- scitex/ai/classification/{classifier_server.py → Classifier.py} +5 -5
- scitex/ai/classification/CrossValidationExperiment.py +374 -0
- scitex/ai/classification/__init__.py +43 -4
- scitex/ai/classification/reporters/_BaseClassificationReporter.py +281 -0
- scitex/ai/classification/reporters/_ClassificationReporter.py +773 -0
- scitex/ai/classification/reporters/_MultiClassificationReporter.py +406 -0
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +1834 -0
- scitex/ai/classification/reporters/__init__.py +11 -0
- scitex/ai/classification/reporters/reporter_utils/_Plotter.py +1028 -0
- scitex/ai/classification/reporters/reporter_utils/__init__.py +80 -0
- scitex/ai/classification/reporters/reporter_utils/aggregation.py +457 -0
- scitex/ai/classification/reporters/reporter_utils/data_models.py +313 -0
- scitex/ai/classification/reporters/reporter_utils/reporting.py +1056 -0
- scitex/ai/classification/reporters/reporter_utils/storage.py +221 -0
- scitex/ai/classification/reporters/reporter_utils/validation.py +395 -0
- scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +568 -0
- scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +688 -0
- scitex/ai/classification/timeseries/_TimeSeriesMetadata.py +139 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +1716 -0
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +1685 -0
- scitex/ai/classification/timeseries/_TimeSeriesStrategy.py +84 -0
- scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +610 -0
- scitex/ai/classification/timeseries/__init__.py +39 -0
- scitex/ai/classification/timeseries/_normalize_timestamp.py +436 -0
- scitex/ai/clustering/_umap.py +2 -2
- scitex/ai/feature_extraction/vit.py +1 -0
- scitex/ai/feature_selection/__init__.py +30 -0
- scitex/ai/feature_selection/feature_selection.py +364 -0
- scitex/ai/loss/multi_task_loss.py +1 -1
- scitex/ai/metrics/__init__.py +51 -4
- scitex/ai/metrics/_calc_bacc.py +61 -0
- scitex/ai/metrics/_calc_bacc_from_conf_mat.py +38 -0
- scitex/ai/metrics/_calc_clf_report.py +78 -0
- scitex/ai/metrics/_calc_conf_mat.py +93 -0
- scitex/ai/metrics/_calc_feature_importance.py +183 -0
- scitex/ai/metrics/_calc_mcc.py +61 -0
- scitex/ai/metrics/_calc_pre_rec_auc.py +116 -0
- scitex/ai/metrics/_calc_roc_auc.py +110 -0
- scitex/ai/metrics/_calc_seizure_prediction_metrics.py +490 -0
- scitex/ai/metrics/{silhoute_score_block.py → _calc_silhouette_score.py} +15 -8
- scitex/ai/metrics/_normalize_labels.py +83 -0
- scitex/ai/plt/__init__.py +47 -8
- scitex/ai/plt/{_conf_mat.py → _plot_conf_mat.py} +158 -87
- scitex/ai/plt/_plot_feature_importance.py +323 -0
- scitex/ai/plt/_plot_learning_curve.py +345 -0
- scitex/ai/plt/_plot_optuna_study.py +225 -0
- scitex/ai/plt/_plot_pre_rec_curve.py +290 -0
- scitex/ai/plt/_plot_roc_curve.py +255 -0
- scitex/ai/training/{learning_curve_logger.py → _LearningCurveLogger.py} +197 -213
- scitex/ai/training/__init__.py +2 -2
- scitex/ai/utils/grid_search.py +3 -3
- scitex/benchmark/__init__.py +52 -0
- scitex/benchmark/benchmark.py +400 -0
- scitex/benchmark/monitor.py +370 -0
- scitex/benchmark/profiler.py +297 -0
- scitex/browser/__init__.py +48 -0
- scitex/browser/automation/CookieHandler.py +216 -0
- scitex/browser/automation/__init__.py +7 -0
- scitex/browser/collaboration/__init__.py +55 -0
- scitex/browser/collaboration/auth_helpers.py +94 -0
- scitex/browser/collaboration/collaborative_agent.py +136 -0
- scitex/browser/collaboration/credential_manager.py +188 -0
- scitex/browser/collaboration/interactive_panel.py +400 -0
- scitex/browser/collaboration/persistent_browser.py +170 -0
- scitex/browser/collaboration/shared_session.py +383 -0
- scitex/browser/collaboration/standard_interactions.py +246 -0
- scitex/browser/collaboration/visual_feedback.py +181 -0
- scitex/browser/core/BrowserMixin.py +326 -0
- scitex/browser/core/ChromeProfileManager.py +446 -0
- scitex/browser/core/__init__.py +9 -0
- scitex/browser/debugging/__init__.py +18 -0
- scitex/browser/debugging/_browser_logger.py +657 -0
- scitex/browser/debugging/_highlight_element.py +143 -0
- scitex/browser/debugging/_show_grid.py +154 -0
- scitex/browser/interaction/__init__.py +24 -0
- scitex/browser/interaction/click_center.py +149 -0
- scitex/browser/interaction/click_with_fallbacks.py +206 -0
- scitex/browser/interaction/close_popups.py +498 -0
- scitex/browser/interaction/fill_with_fallbacks.py +209 -0
- scitex/browser/pdf/__init__.py +14 -0
- scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +200 -0
- scitex/browser/pdf/detect_chrome_pdf_viewer.py +198 -0
- scitex/browser/remote/CaptchaHandler.py +434 -0
- scitex/browser/remote/ZenRowsAPIClient.py +347 -0
- scitex/browser/remote/ZenRowsBrowserManager.py +570 -0
- scitex/browser/remote/__init__.py +11 -0
- scitex/browser/stealth/HumanBehavior.py +344 -0
- scitex/browser/stealth/StealthManager.py +1008 -0
- scitex/browser/stealth/__init__.py +9 -0
- scitex/browser/template.py +122 -0
- scitex/capture/__init__.py +110 -0
- scitex/capture/__main__.py +25 -0
- scitex/capture/capture.py +848 -0
- scitex/capture/cli.py +233 -0
- scitex/capture/gif.py +344 -0
- scitex/capture/mcp_server.py +961 -0
- scitex/capture/session.py +70 -0
- scitex/capture/utils.py +705 -0
- scitex/cli/__init__.py +17 -0
- scitex/cli/cloud.py +447 -0
- scitex/cli/main.py +42 -0
- scitex/cli/scholar.py +280 -0
- scitex/context/_suppress_output.py +5 -3
- scitex/db/__init__.py +30 -3
- scitex/db/__main__.py +75 -0
- scitex/db/_check_health.py +381 -0
- scitex/db/_delete_duplicates.py +25 -386
- scitex/db/_inspect.py +335 -114
- scitex/db/_inspect_optimized.py +301 -0
- scitex/db/{_PostgreSQL.py → _postgresql/_PostgreSQL.py} +3 -3
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BackupMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BatchMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_BlobMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_ConnectionMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_MaintenanceMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_QueryMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_SchemaMixin.py +1 -1
- scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_TransactionMixin.py +1 -1
- scitex/db/_postgresql/__init__.py +6 -0
- scitex/db/_sqlite3/_SQLite3.py +210 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin.py +581 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin_v01-need-_hash-col.py +517 -0
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_BatchMixin.py +1 -1
- scitex/db/_sqlite3/_SQLite3Mixins/_BlobMixin.py +281 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin.py +548 -0
- scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin_v01-indentation-issues.py +583 -0
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_ConnectionMixin.py +29 -13
- scitex/db/_sqlite3/_SQLite3Mixins/_GitMixin.py +583 -0
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_ImportExportMixin.py +1 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_IndexMixin.py +1 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_MaintenanceMixin.py +2 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_QueryMixin.py +37 -10
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_RowMixin.py +46 -6
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_TableMixin.py +56 -10
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/_TransactionMixin.py +1 -1
- scitex/db/{_SQLite3Mixins → _sqlite3/_SQLite3Mixins}/__init__.py +14 -2
- scitex/db/_sqlite3/__init__.py +7 -0
- scitex/db/_sqlite3/_delete_duplicates.py +274 -0
- scitex/decorators/__init__.py +2 -0
- scitex/decorators/_cache_disk.py +13 -5
- scitex/decorators/_cache_disk_async.py +49 -0
- scitex/decorators/_deprecated.py +175 -10
- scitex/decorators/_timeout.py +1 -1
- scitex/dev/_analyze_code_flow.py +2 -2
- scitex/dict/_DotDict.py +73 -15
- scitex/dict/_DotDict_v01-not-handling-recursive-instantiations.py +442 -0
- scitex/dict/_DotDict_v02-not-serializing-Path-object.py +446 -0
- scitex/dict/__init__.py +2 -0
- scitex/dict/_flatten.py +27 -0
- scitex/dsp/_crop.py +2 -2
- scitex/dsp/_demo_sig.py +2 -2
- scitex/dsp/_detect_ripples.py +2 -2
- scitex/dsp/_hilbert.py +2 -2
- scitex/dsp/_listen.py +6 -6
- scitex/dsp/_modulation_index.py +2 -2
- scitex/dsp/_pac.py +1 -1
- scitex/dsp/_psd.py +2 -2
- scitex/dsp/_resample.py +2 -1
- scitex/dsp/_time.py +3 -2
- scitex/dsp/_wavelet.py +3 -2
- scitex/dsp/add_noise.py +2 -2
- scitex/dsp/example.py +1 -0
- scitex/dsp/filt.py +10 -9
- scitex/dsp/template.py +3 -2
- scitex/dsp/utils/_differential_bandpass_filters.py +1 -1
- scitex/dsp/utils/pac.py +2 -2
- scitex/dt/_normalize_timestamp.py +432 -0
- scitex/errors.py +572 -0
- scitex/gen/_DimHandler.py +2 -2
- scitex/gen/__init__.py +37 -7
- scitex/gen/_deprecated_close.py +80 -0
- scitex/gen/_deprecated_start.py +26 -0
- scitex/gen/_detect_environment.py +152 -0
- scitex/gen/_detect_notebook_path.py +169 -0
- scitex/gen/_embed.py +6 -2
- scitex/gen/_get_notebook_path.py +257 -0
- scitex/gen/_less.py +1 -1
- scitex/gen/_list_packages.py +2 -2
- scitex/gen/_norm.py +44 -9
- scitex/gen/_norm_cache.py +269 -0
- scitex/gen/_src.py +3 -5
- scitex/gen/_title_case.py +3 -3
- scitex/io/__init__.py +28 -6
- scitex/io/_glob.py +13 -7
- scitex/io/_load.py +108 -21
- scitex/io/_load_cache.py +303 -0
- scitex/io/_load_configs.py +40 -15
- scitex/io/{_H5Explorer.py → _load_modules/_H5Explorer.py} +80 -17
- scitex/io/_load_modules/_ZarrExplorer.py +114 -0
- scitex/io/_load_modules/_bibtex.py +207 -0
- scitex/io/_load_modules/_hdf5.py +53 -178
- scitex/io/_load_modules/_json.py +5 -3
- scitex/io/_load_modules/_pdf.py +871 -16
- scitex/io/_load_modules/_sqlite3.py +15 -0
- scitex/io/_load_modules/_txt.py +41 -12
- scitex/io/_load_modules/_yaml.py +4 -3
- scitex/io/_load_modules/_zarr.py +126 -0
- scitex/io/_save.py +429 -171
- scitex/io/_save_modules/__init__.py +6 -0
- scitex/io/_save_modules/_bibtex.py +194 -0
- scitex/io/_save_modules/_csv.py +8 -4
- scitex/io/_save_modules/_excel.py +174 -15
- scitex/io/_save_modules/_hdf5.py +251 -226
- scitex/io/_save_modules/_image.py +1 -3
- scitex/io/_save_modules/_json.py +49 -4
- scitex/io/_save_modules/_listed_dfs_as_csv.py +1 -3
- scitex/io/_save_modules/_listed_scalars_as_csv.py +1 -3
- scitex/io/_save_modules/_tex.py +277 -0
- scitex/io/_save_modules/_yaml.py +42 -3
- scitex/io/_save_modules/_zarr.py +160 -0
- scitex/io/utils/__init__.py +20 -0
- scitex/io/utils/h5_to_zarr.py +616 -0
- scitex/linalg/_geometric_median.py +6 -2
- scitex/{gen/_tee.py → logging/_Tee.py} +43 -84
- scitex/logging/__init__.py +122 -0
- scitex/logging/_config.py +158 -0
- scitex/logging/_context.py +103 -0
- scitex/logging/_formatters.py +128 -0
- scitex/logging/_handlers.py +64 -0
- scitex/logging/_levels.py +35 -0
- scitex/logging/_logger.py +163 -0
- scitex/logging/_print_capture.py +95 -0
- scitex/ml/__init__.py +69 -0
- scitex/{ai/genai/anthropic.py → ml/_gen_ai/_Anthropic.py} +13 -19
- scitex/{ai/genai/base_genai.py → ml/_gen_ai/_BaseGenAI.py} +5 -5
- scitex/{ai/genai/deepseek.py → ml/_gen_ai/_DeepSeek.py} +11 -16
- scitex/{ai/genai/google.py → ml/_gen_ai/_Google.py} +7 -15
- scitex/{ai/genai/groq.py → ml/_gen_ai/_Groq.py} +1 -8
- scitex/{ai/genai/llama.py → ml/_gen_ai/_Llama.py} +3 -16
- scitex/{ai/genai/openai.py → ml/_gen_ai/_OpenAI.py} +3 -3
- scitex/{ai/genai/params.py → ml/_gen_ai/_PARAMS.py} +51 -65
- scitex/{ai/genai/perplexity.py → ml/_gen_ai/_Perplexity.py} +3 -14
- scitex/ml/_gen_ai/__init__.py +43 -0
- scitex/{ai/genai/calc_cost.py → ml/_gen_ai/_calc_cost.py} +1 -1
- scitex/{ai/genai/format_output_func.py → ml/_gen_ai/_format_output_func.py} +4 -4
- scitex/{ai/genai/genai_factory.py → ml/_gen_ai/_genai_factory.py} +8 -8
- scitex/ml/activation/__init__.py +8 -0
- scitex/ml/activation/_define.py +11 -0
- scitex/{ai/classifier_server.py → ml/classification/Classifier.py} +5 -5
- scitex/ml/classification/CrossValidationExperiment.py +374 -0
- scitex/ml/classification/__init__.py +46 -0
- scitex/ml/classification/reporters/_BaseClassificationReporter.py +281 -0
- scitex/ml/classification/reporters/_ClassificationReporter.py +773 -0
- scitex/ml/classification/reporters/_MultiClassificationReporter.py +406 -0
- scitex/ml/classification/reporters/_SingleClassificationReporter.py +1834 -0
- scitex/ml/classification/reporters/__init__.py +11 -0
- scitex/ml/classification/reporters/reporter_utils/_Plotter.py +1028 -0
- scitex/ml/classification/reporters/reporter_utils/__init__.py +80 -0
- scitex/ml/classification/reporters/reporter_utils/aggregation.py +457 -0
- scitex/ml/classification/reporters/reporter_utils/data_models.py +313 -0
- scitex/ml/classification/reporters/reporter_utils/reporting.py +1056 -0
- scitex/ml/classification/reporters/reporter_utils/storage.py +221 -0
- scitex/ml/classification/reporters/reporter_utils/validation.py +395 -0
- scitex/ml/classification/timeseries/_TimeSeriesBlockingSplit.py +568 -0
- scitex/ml/classification/timeseries/_TimeSeriesCalendarSplit.py +688 -0
- scitex/ml/classification/timeseries/_TimeSeriesMetadata.py +139 -0
- scitex/ml/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +1716 -0
- scitex/ml/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +1685 -0
- scitex/ml/classification/timeseries/_TimeSeriesStrategy.py +84 -0
- scitex/ml/classification/timeseries/_TimeSeriesStratifiedSplit.py +610 -0
- scitex/ml/classification/timeseries/__init__.py +39 -0
- scitex/ml/classification/timeseries/_normalize_timestamp.py +436 -0
- scitex/ml/clustering/__init__.py +11 -0
- scitex/ml/clustering/_pca.py +115 -0
- scitex/ml/clustering/_umap.py +376 -0
- scitex/ml/feature_extraction/__init__.py +56 -0
- scitex/ml/feature_extraction/vit.py +149 -0
- scitex/ml/feature_selection/__init__.py +30 -0
- scitex/ml/feature_selection/feature_selection.py +364 -0
- scitex/ml/loss/_L1L2Losses.py +34 -0
- scitex/ml/loss/__init__.py +12 -0
- scitex/ml/loss/multi_task_loss.py +47 -0
- scitex/ml/metrics/__init__.py +56 -0
- scitex/ml/metrics/_calc_bacc.py +61 -0
- scitex/ml/metrics/_calc_bacc_from_conf_mat.py +38 -0
- scitex/ml/metrics/_calc_clf_report.py +78 -0
- scitex/ml/metrics/_calc_conf_mat.py +93 -0
- scitex/ml/metrics/_calc_feature_importance.py +183 -0
- scitex/ml/metrics/_calc_mcc.py +61 -0
- scitex/ml/metrics/_calc_pre_rec_auc.py +116 -0
- scitex/ml/metrics/_calc_roc_auc.py +110 -0
- scitex/ml/metrics/_calc_seizure_prediction_metrics.py +490 -0
- scitex/ml/metrics/_calc_silhouette_score.py +503 -0
- scitex/ml/metrics/_normalize_labels.py +83 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/__init__.py +0 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/__init__.py +3 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger.py +207 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger2020.py +238 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger913A.py +215 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/ranger/rangerqh.py +184 -0
- scitex/ml/optim/Ranger_Deep_Learning_Optimizer/setup.py +24 -0
- scitex/ml/optim/__init__.py +13 -0
- scitex/ml/optim/_get_set.py +31 -0
- scitex/ml/optim/_optimizers.py +71 -0
- scitex/ml/plt/__init__.py +60 -0
- scitex/ml/plt/_plot_conf_mat.py +663 -0
- scitex/ml/plt/_plot_feature_importance.py +323 -0
- scitex/ml/plt/_plot_learning_curve.py +345 -0
- scitex/ml/plt/_plot_optuna_study.py +225 -0
- scitex/ml/plt/_plot_pre_rec_curve.py +290 -0
- scitex/ml/plt/_plot_roc_curve.py +255 -0
- scitex/ml/sk/__init__.py +11 -0
- scitex/ml/sk/_clf.py +58 -0
- scitex/ml/sk/_to_sktime.py +100 -0
- scitex/ml/sklearn/__init__.py +26 -0
- scitex/ml/sklearn/clf.py +58 -0
- scitex/ml/sklearn/to_sktime.py +100 -0
- scitex/{ai/training/early_stopping.py → ml/training/_EarlyStopping.py} +1 -2
- scitex/{ai → ml/training}/_LearningCurveLogger.py +198 -242
- scitex/ml/training/__init__.py +7 -0
- scitex/ml/utils/__init__.py +22 -0
- scitex/ml/utils/_check_params.py +50 -0
- scitex/ml/utils/_default_dataset.py +46 -0
- scitex/ml/utils/_format_samples_for_sktime.py +26 -0
- scitex/ml/utils/_label_encoder.py +134 -0
- scitex/ml/utils/_merge_labels.py +22 -0
- scitex/ml/utils/_sliding_window_data_augmentation.py +11 -0
- scitex/ml/utils/_under_sample.py +51 -0
- scitex/ml/utils/_verify_n_gpus.py +16 -0
- scitex/ml/utils/grid_search.py +148 -0
- scitex/nn/_BNet.py +15 -9
- scitex/nn/_Filters.py +2 -2
- scitex/nn/_ModulationIndex.py +2 -2
- scitex/nn/_PAC.py +1 -1
- scitex/nn/_Spectrogram.py +12 -3
- scitex/nn/__init__.py +9 -10
- scitex/path/__init__.py +18 -0
- scitex/path/_clean.py +4 -0
- scitex/path/_find.py +9 -4
- scitex/path/_symlink.py +348 -0
- scitex/path/_version.py +4 -3
- scitex/pd/__init__.py +2 -0
- scitex/pd/_get_unique.py +99 -0
- scitex/plt/__init__.py +114 -5
- scitex/plt/_subplots/_AxesWrapper.py +1 -3
- scitex/plt/_subplots/_AxisWrapper.py +7 -3
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +47 -13
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +160 -2
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +26 -4
- scitex/plt/_subplots/_AxisWrapperMixins/_UnitAwareMixin.py +322 -0
- scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +1 -0
- scitex/plt/_subplots/_FigWrapper.py +62 -6
- scitex/plt/_subplots/_export_as_csv.py +43 -27
- scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +5 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_annotate.py +81 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_bar.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_barh.py +20 -5
- scitex/plt/_subplots/_export_as_csv_formatters/_format_boxplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_contour.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_errorbar.py +35 -18
- scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill_between.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hist.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +15 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_box.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_conf_mat.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_ecdf.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_fillv.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_heatmap.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_image.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_joyplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_line.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_ci.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_std.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_median_iqr.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_raster.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_rectangle.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter.py +35 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter_hist.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_shaded_line.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_violin.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_scatter.py +6 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_barplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_boxplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_heatmap.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_histplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_kdeplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_scatterplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_stripplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_swarmplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_violinplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_text.py +60 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violin.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violinplot.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters/test_formatters.py +1 -3
- scitex/plt/_subplots/_export_as_csv_formatters.py +56 -59
- scitex/plt/ax/_style/_hide_spines.py +1 -3
- scitex/plt/ax/_style/_rotate_labels.py +180 -76
- scitex/plt/ax/_style/_rotate_labels_v01.py +248 -0
- scitex/plt/ax/_style/_set_meta.py +11 -4
- scitex/plt/ax/_style/_set_supxyt.py +3 -3
- scitex/plt/ax/_style/_set_xyt.py +3 -3
- scitex/plt/ax/_style/_share_axes.py +2 -2
- scitex/plt/color/__init__.py +4 -4
- scitex/plt/color/{_get_colors_from_cmap.py → _get_colors_from_conf_matap.py} +7 -7
- scitex/plt/utils/_configure_mpl.py +99 -86
- scitex/plt/utils/_histogram_utils.py +1 -3
- scitex/plt/utils/_is_valid_axis.py +1 -3
- scitex/plt/utils/_scitex_config.py +1 -0
- scitex/repro/__init__.py +75 -0
- scitex/{reproduce → repro}/_gen_ID.py +1 -1
- scitex/{reproduce → repro}/_gen_timestamp.py +1 -1
- scitex/repro_rng/_RandomStateManager.py +590 -0
- scitex/repro_rng/_RandomStateManager_v01-no-verbose-options.py +414 -0
- scitex/repro_rng/__init__.py +39 -0
- scitex/reproduce/__init__.py +25 -13
- scitex/reproduce/_hash_array.py +22 -0
- scitex/resource/_get_processor_usages.py +4 -4
- scitex/resource/_get_specs.py +2 -2
- scitex/resource/_log_processor_usages.py +2 -2
- scitex/rng/_RandomStateManager.py +590 -0
- scitex/rng/_RandomStateManager_v01-no-verbose-options.py +414 -0
- scitex/rng/__init__.py +39 -0
- scitex/scholar/__init__.py +309 -19
- scitex/scholar/__main__.py +319 -0
- scitex/scholar/auth/ScholarAuthManager.py +308 -0
- scitex/scholar/auth/__init__.py +12 -0
- scitex/scholar/auth/core/AuthenticationGateway.py +473 -0
- scitex/scholar/auth/core/BrowserAuthenticator.py +386 -0
- scitex/scholar/auth/core/StrategyResolver.py +309 -0
- scitex/scholar/auth/core/__init__.py +16 -0
- scitex/scholar/auth/gateway/_OpenURLLinkFinder.py +120 -0
- scitex/scholar/auth/gateway/_OpenURLResolver.py +209 -0
- scitex/scholar/auth/gateway/__init__.py +38 -0
- scitex/scholar/auth/gateway/_resolve_functions.py +101 -0
- scitex/scholar/auth/providers/BaseAuthenticator.py +166 -0
- scitex/scholar/auth/providers/EZProxyAuthenticator.py +484 -0
- scitex/scholar/auth/providers/OpenAthensAuthenticator.py +619 -0
- scitex/scholar/auth/providers/ShibbolethAuthenticator.py +686 -0
- scitex/scholar/auth/providers/__init__.py +18 -0
- scitex/scholar/auth/session/AuthCacheManager.py +189 -0
- scitex/scholar/auth/session/SessionManager.py +159 -0
- scitex/scholar/auth/session/__init__.py +11 -0
- scitex/scholar/auth/sso/BaseSSOAutomator.py +373 -0
- scitex/scholar/auth/sso/OpenAthensSSOAutomator.py +378 -0
- scitex/scholar/auth/sso/SSOAutomator.py +180 -0
- scitex/scholar/auth/sso/UniversityOfMelbourneSSOAutomator.py +380 -0
- scitex/scholar/auth/sso/__init__.py +15 -0
- scitex/scholar/browser/ScholarBrowserManager.py +705 -0
- scitex/scholar/browser/__init__.py +38 -0
- scitex/scholar/browser/utils/__init__.py +13 -0
- scitex/scholar/browser/utils/click_and_wait.py +205 -0
- scitex/scholar/browser/utils/close_unwanted_pages.py +140 -0
- scitex/scholar/browser/utils/wait_redirects.py +732 -0
- scitex/scholar/config/PublisherRules.py +132 -0
- scitex/scholar/config/ScholarConfig.py +126 -0
- scitex/scholar/config/__init__.py +17 -0
- scitex/scholar/core/Paper.py +627 -0
- scitex/scholar/core/Papers.py +722 -0
- scitex/scholar/core/Scholar.py +1975 -0
- scitex/scholar/core/__init__.py +9 -0
- scitex/scholar/impact_factor/ImpactFactorEngine.py +204 -0
- scitex/scholar/impact_factor/__init__.py +20 -0
- scitex/scholar/impact_factor/estimation/ImpactFactorEstimationEngine.py +0 -0
- scitex/scholar/impact_factor/estimation/__init__.py +40 -0
- scitex/scholar/impact_factor/estimation/build_database.py +0 -0
- scitex/scholar/impact_factor/estimation/core/__init__.py +28 -0
- scitex/scholar/impact_factor/estimation/core/cache_manager.py +523 -0
- scitex/scholar/impact_factor/estimation/core/calculator.py +355 -0
- scitex/scholar/impact_factor/estimation/core/journal_matcher.py +428 -0
- scitex/scholar/integration/__init__.py +59 -0
- scitex/scholar/integration/base.py +502 -0
- scitex/scholar/integration/mendeley/__init__.py +22 -0
- scitex/scholar/integration/mendeley/exporter.py +166 -0
- scitex/scholar/integration/mendeley/importer.py +236 -0
- scitex/scholar/integration/mendeley/linker.py +79 -0
- scitex/scholar/integration/mendeley/mapper.py +212 -0
- scitex/scholar/integration/zotero/__init__.py +27 -0
- scitex/scholar/integration/zotero/__main__.py +264 -0
- scitex/scholar/integration/zotero/exporter.py +351 -0
- scitex/scholar/integration/zotero/importer.py +372 -0
- scitex/scholar/integration/zotero/linker.py +415 -0
- scitex/scholar/integration/zotero/mapper.py +286 -0
- scitex/scholar/metadata_engines/ScholarEngine.py +588 -0
- scitex/scholar/metadata_engines/__init__.py +21 -0
- scitex/scholar/metadata_engines/individual/ArXivEngine.py +397 -0
- scitex/scholar/metadata_engines/individual/CrossRefEngine.py +274 -0
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +263 -0
- scitex/scholar/metadata_engines/individual/OpenAlexEngine.py +350 -0
- scitex/scholar/metadata_engines/individual/PubMedEngine.py +329 -0
- scitex/scholar/metadata_engines/individual/SemanticScholarEngine.py +438 -0
- scitex/scholar/metadata_engines/individual/URLDOIEngine.py +410 -0
- scitex/scholar/metadata_engines/individual/_BaseDOIEngine.py +487 -0
- scitex/scholar/metadata_engines/individual/__init__.py +7 -0
- scitex/scholar/metadata_engines/utils/_PubMedConverter.py +469 -0
- scitex/scholar/metadata_engines/utils/_URLDOIExtractor.py +283 -0
- scitex/scholar/metadata_engines/utils/__init__.py +30 -0
- scitex/scholar/metadata_engines/utils/_metadata2bibtex.py +103 -0
- scitex/scholar/metadata_engines/utils/_standardize_metadata.py +376 -0
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +579 -0
- scitex/scholar/pdf_download/__init__.py +5 -0
- scitex/scholar/pdf_download/strategies/__init__.py +38 -0
- scitex/scholar/pdf_download/strategies/chrome_pdf_viewer.py +376 -0
- scitex/scholar/pdf_download/strategies/direct_download.py +131 -0
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +167 -0
- scitex/scholar/pdf_download/strategies/manual_download_utils.py +996 -0
- scitex/scholar/pdf_download/strategies/response_body.py +207 -0
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +364 -0
- scitex/scholar/pipelines/ScholarPipelineParallel.py +478 -0
- scitex/scholar/pipelines/ScholarPipelineSingle.py +767 -0
- scitex/scholar/pipelines/__init__.py +49 -0
- scitex/scholar/storage/BibTeXHandler.py +1018 -0
- scitex/scholar/storage/PaperIO.py +468 -0
- scitex/scholar/storage/ScholarLibrary.py +182 -0
- scitex/scholar/storage/_DeduplicationManager.py +548 -0
- scitex/scholar/storage/_LibraryCacheManager.py +724 -0
- scitex/scholar/storage/_LibraryManager.py +1835 -0
- scitex/scholar/storage/__init__.py +28 -0
- scitex/scholar/url_finder/ScholarURLFinder.py +379 -0
- scitex/scholar/url_finder/__init__.py +7 -0
- scitex/scholar/url_finder/strategies/__init__.py +33 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_direct_links.py +261 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_dropdown.py +67 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_href.py +204 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_navigation.py +256 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_publisher_patterns.py +165 -0
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_zotero_translators.py +163 -0
- scitex/scholar/url_finder/strategies/find_supplementary_urls_by_href.py +70 -0
- scitex/scholar/utils/__init__.py +22 -0
- scitex/scholar/utils/bibtex/__init__.py +9 -0
- scitex/scholar/utils/bibtex/_parse_bibtex.py +71 -0
- scitex/scholar/utils/cleanup/__init__.py +8 -0
- scitex/scholar/utils/cleanup/_cleanup_scholar_processes.py +96 -0
- scitex/scholar/utils/cleanup/cleanup_old_extractions.py +117 -0
- scitex/scholar/utils/text/_TextNormalizer.py +407 -0
- scitex/scholar/utils/text/__init__.py +9 -0
- scitex/scholar/zotero/__init__.py +38 -0
- scitex/session/__init__.py +51 -0
- scitex/session/_lifecycle.py +736 -0
- scitex/session/_manager.py +102 -0
- scitex/session/template.py +122 -0
- scitex/stats/__init__.py +30 -26
- scitex/stats/correct/__init__.py +21 -0
- scitex/stats/correct/_correct_bonferroni.py +551 -0
- scitex/stats/correct/_correct_fdr.py +634 -0
- scitex/stats/correct/_correct_holm.py +548 -0
- scitex/stats/correct/_correct_sidak.py +499 -0
- scitex/stats/descriptive/__init__.py +85 -0
- scitex/stats/descriptive/_circular.py +540 -0
- scitex/stats/descriptive/_describe.py +219 -0
- scitex/stats/descriptive/_nan.py +518 -0
- scitex/stats/descriptive/_real.py +189 -0
- scitex/stats/effect_sizes/__init__.py +41 -0
- scitex/stats/effect_sizes/_cliffs_delta.py +325 -0
- scitex/stats/effect_sizes/_cohens_d.py +342 -0
- scitex/stats/effect_sizes/_epsilon_squared.py +315 -0
- scitex/stats/effect_sizes/_eta_squared.py +302 -0
- scitex/stats/effect_sizes/_prob_superiority.py +296 -0
- scitex/stats/posthoc/__init__.py +19 -0
- scitex/stats/posthoc/_dunnett.py +463 -0
- scitex/stats/posthoc/_games_howell.py +383 -0
- scitex/stats/posthoc/_tukey_hsd.py +367 -0
- scitex/stats/power/__init__.py +19 -0
- scitex/stats/power/_power.py +433 -0
- scitex/stats/template.py +119 -0
- scitex/stats/utils/__init__.py +62 -0
- scitex/stats/utils/_effect_size.py +985 -0
- scitex/stats/utils/_formatters.py +270 -0
- scitex/stats/utils/_normalizers.py +927 -0
- scitex/stats/utils/_power.py +433 -0
- scitex/stats_v01/_EffectSizeCalculator.py +488 -0
- scitex/stats_v01/_StatisticalValidator.py +411 -0
- scitex/stats_v01/__init__.py +60 -0
- scitex/stats_v01/_additional_tests.py +415 -0
- scitex/{stats → stats_v01}/_p2stars.py +19 -5
- scitex/stats_v01/_two_sample_tests.py +141 -0
- scitex/stats_v01/desc/__init__.py +83 -0
- scitex/stats_v01/desc/_circular.py +540 -0
- scitex/stats_v01/desc/_describe.py +219 -0
- scitex/stats_v01/desc/_nan.py +518 -0
- scitex/{stats/desc/_nan.py → stats_v01/desc/_nan_v01-20250920_145731.py} +23 -12
- scitex/stats_v01/desc/_real.py +189 -0
- scitex/stats_v01/tests/__corr_test_optimized.py +221 -0
- scitex/stats_v01/tests/_corr_test_optimized.py +179 -0
- scitex/str/__init__.py +1 -3
- scitex/str/_clean_path.py +6 -2
- scitex/str/_latex_fallback.py +267 -160
- scitex/str/_parse.py +44 -36
- scitex/str/_printc.py +1 -3
- scitex/template/__init__.py +87 -0
- scitex/template/_create_project.py +267 -0
- scitex/template/create_pip_project.py +80 -0
- scitex/template/create_research.py +80 -0
- scitex/template/create_singularity.py +80 -0
- scitex/units.py +291 -0
- scitex/utils/_compress_hdf5.py +14 -3
- scitex/utils/_email.py +21 -2
- scitex/utils/_grid.py +6 -4
- scitex/utils/_notify.py +13 -10
- scitex/utils/_verify_scitex_format.py +589 -0
- scitex/utils/_verify_scitex_format_v01.py +370 -0
- scitex/utils/template.py +122 -0
- scitex/web/_search_pubmed.py +62 -16
- scitex-2.1.0.dist-info/LICENSE +21 -0
- scitex-2.1.0.dist-info/METADATA +677 -0
- scitex-2.1.0.dist-info/RECORD +919 -0
- {scitex-2.0.0.dist-info → scitex-2.1.0.dist-info}/WHEEL +1 -1
- scitex-2.1.0.dist-info/entry_points.txt +3 -0
- scitex/ai/__Classifiers.py +0 -101
- scitex/ai/classification/classification_reporter.py +0 -1137
- scitex/ai/classification/classifiers.py +0 -101
- scitex/ai/classification_reporter.py +0 -1161
- scitex/ai/genai/__init__.py +0 -277
- scitex/ai/genai/anthropic_provider.py +0 -320
- scitex/ai/genai/anthropic_refactored.py +0 -109
- scitex/ai/genai/auth_manager.py +0 -200
- scitex/ai/genai/base_provider.py +0 -291
- scitex/ai/genai/chat_history.py +0 -307
- scitex/ai/genai/cost_tracker.py +0 -276
- scitex/ai/genai/deepseek_provider.py +0 -251
- scitex/ai/genai/google_provider.py +0 -228
- scitex/ai/genai/groq_provider.py +0 -248
- scitex/ai/genai/image_processor.py +0 -250
- scitex/ai/genai/llama_provider.py +0 -214
- scitex/ai/genai/mock_provider.py +0 -127
- scitex/ai/genai/model_registry.py +0 -304
- scitex/ai/genai/openai_provider.py +0 -293
- scitex/ai/genai/perplexity_provider.py +0 -205
- scitex/ai/genai/provider_base.py +0 -302
- scitex/ai/genai/provider_factory.py +0 -370
- scitex/ai/genai/response_handler.py +0 -235
- scitex/ai/layer/_Pass.py +0 -21
- scitex/ai/layer/__init__.py +0 -10
- scitex/ai/layer/_switch.py +0 -8
- scitex/ai/metrics/_bACC.py +0 -51
- scitex/ai/plt/_learning_curve.py +0 -194
- scitex/ai/plt/_optuna_study.py +0 -111
- scitex/ai/plt/aucs/__init__.py +0 -2
- scitex/ai/plt/aucs/example.py +0 -60
- scitex/ai/plt/aucs/pre_rec_auc.py +0 -223
- scitex/ai/plt/aucs/roc_auc.py +0 -246
- scitex/ai/sampling/undersample.py +0 -29
- scitex/db/_SQLite3.py +0 -2136
- scitex/db/_SQLite3Mixins/_BlobMixin.py +0 -229
- scitex/gen/_close.py +0 -222
- scitex/gen/_start.py +0 -451
- scitex/general/__init__.py +0 -5
- scitex/io/_load_modules/_db.py +0 -24
- scitex/life/__init__.py +0 -10
- scitex/life/_monitor_rain.py +0 -49
- scitex/reproduce/_fix_seeds.py +0 -45
- scitex/res/__init__.py +0 -5
- scitex/scholar/_local_search.py +0 -454
- scitex/scholar/_paper.py +0 -244
- scitex/scholar/_pdf_downloader.py +0 -325
- scitex/scholar/_search.py +0 -393
- scitex/scholar/_vector_search.py +0 -370
- scitex/scholar/_web_sources.py +0 -457
- scitex/stats/desc/__init__.py +0 -40
- scitex-2.0.0.dist-info/METADATA +0 -307
- scitex-2.0.0.dist-info/RECORD +0 -572
- scitex-2.0.0.dist-info/licenses/LICENSE +0 -7
- /scitex/ai/{act → activation}/__init__.py +0 -0
- /scitex/ai/{act → activation}/_define.py +0 -0
- /scitex/ai/{early_stopping.py → training/_EarlyStopping.py} +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_ImportExportMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_IndexMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_RowMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/_TableMixin.py +0 -0
- /scitex/db/{_PostgreSQLMixins → _postgresql/_PostgreSQLMixins}/__init__.py +0 -0
- /scitex/{stats → stats_v01}/_calc_partial_corr.py +0 -0
- /scitex/{stats → stats_v01}/_corr_test_multi.py +0 -0
- /scitex/{stats → stats_v01}/_corr_test_wrapper.py +0 -0
- /scitex/{stats → stats_v01}/_describe_wrapper.py +0 -0
- /scitex/{stats → stats_v01}/_multiple_corrections.py +0 -0
- /scitex/{stats → stats_v01}/_nan_stats.py +0 -0
- /scitex/{stats → stats_v01}/_p2stars_wrapper.py +0 -0
- /scitex/{stats → stats_v01}/_statistical_tests.py +0 -0
- /scitex/{stats/desc/_describe.py → stats_v01/desc/_describe_v01-20250920_145731.py} +0 -0
- /scitex/{stats/desc/_real.py → stats_v01/desc/_real_v01-20250920_145731.py} +0 -0
- /scitex/{stats → stats_v01}/multiple/__init__.py +0 -0
- /scitex/{stats → stats_v01}/multiple/_bonferroni_correction.py +0 -0
- /scitex/{stats → stats_v01}/multiple/_fdr_correction.py +0 -0
- /scitex/{stats → stats_v01}/multiple/_multicompair.py +0 -0
- /scitex/{stats → stats_v01}/tests/__corr_test.py +0 -0
- /scitex/{stats → stats_v01}/tests/__corr_test_multi.py +0 -0
- /scitex/{stats → stats_v01}/tests/__corr_test_single.py +0 -0
- /scitex/{stats → stats_v01}/tests/__init__.py +0 -0
- /scitex/{stats → stats_v01}/tests/_brunner_munzel_test.py +0 -0
- /scitex/{stats → stats_v01}/tests/_nocorrelation_test.py +0 -0
- /scitex/{stats → stats_v01}/tests/_smirnov_grubbs.py +0 -0
- {scitex-2.0.0.dist-info → scitex-2.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Deduplication manager for handling duplicate papers in the library."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Set
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
from scitex import logging
|
|
12
|
+
from scitex.scholar.config import ScholarConfig
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)


class DeduplicationManager:
    """Find, merge and archive duplicate papers in the MASTER library.

    Every sub-directory of ``master_dir`` that contains a ``metadata.json``
    file is treated as one paper.  Two papers are considered duplicates when
    they share a fingerprint (normalized DOI, or normalized
    title + first author + year) or when they share a normalized title.
    """

    # Name of the per-paper metadata file inside each MASTER entry.
    _METADATA_FILENAME = "metadata.json"

    # Words ignored when comparing titles.
    _STOP_WORDS = frozenset(
        {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for'}
    )

    # Resolver URL / "doi:" label that may precede the bare DOI.  Applied to
    # an already lower-cased string, so it is effectively case-insensitive.
    _DOI_PREFIX_RE = re.compile(
        r"^(?:(?:https?://)?(?:dx\.)?doi\.org/|doi:\s*)"
    )

    def __init__(self, config: Optional["ScholarConfig"] = None):
        """Create a manager bound to the library described by *config*.

        Args:
            config: Scholar configuration; a default ``ScholarConfig()`` is
                built when omitted.
        """
        self.name = self.__class__.__name__
        self.config = config or ScholarConfig()
        self.library_dir = self.config.path_manager.library_dir
        self.master_dir = self.config.path_manager.get_library_master_dir()

    # ------------------------------------------------------------------
    # Metadata loading helpers
    # ------------------------------------------------------------------

    def _load_metadata(self, paper_dir: Path) -> Optional[Dict]:
        """Return the parsed ``metadata.json`` of *paper_dir*, or None.

        None is returned (and a debug message logged) when the file cannot
        be read, is not valid JSON, or does not contain a JSON object.
        """
        metadata_file = paper_dir / self._METADATA_FILENAME
        try:
            with open(metadata_file, encoding="utf-8") as f:
                metadata = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.debug(f"Error reading {metadata_file}: {e}")
            return None

        if not isinstance(metadata, dict):
            logger.debug(f"Error reading {metadata_file}: not a JSON object")
            return None
        return metadata

    def _iter_master_papers(self):
        """Yield ``(paper_dir, metadata)`` for every readable MASTER entry.

        Entries are visited in sorted order so that results do not depend on
        the order in which the filesystem lists the directory.
        """
        if not self.master_dir.exists():
            return

        for paper_dir in sorted(self.master_dir.iterdir()):
            if not paper_dir.is_dir():
                continue
            if not (paper_dir / self._METADATA_FILENAME).exists():
                continue
            metadata = self._load_metadata(paper_dir)
            if metadata is not None:
                yield paper_dir, metadata

    @staticmethod
    def _to_number(value) -> float:
        """Coerce *value* to a finite float; anything unusable becomes 0.0."""
        import math

        if value is None:
            return 0.0
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        return number if math.isfinite(number) else 0.0

    # ------------------------------------------------------------------
    # Duplicate detection
    # ------------------------------------------------------------------

    def find_duplicate_papers(self) -> Dict[str, List[Path]]:
        """Find all duplicate papers in MASTER library.

        Papers are first grouped by fingerprint.  Groups whose papers share
        a normalized title are then folded together (preferring a DOI based
        fingerprint as the surviving key), so every paper directory appears
        in at most one returned group.

        Returns:
            Dictionary mapping paper fingerprint to list of duplicate paths
        """
        logger.info("Scanning MASTER library for duplicates...")

        paper_groups: Dict[str, List[Path]] = {}  # fingerprint -> paths
        fingerprints_by_title: Dict[str, List[str]] = {}  # title -> fingerprints

        if not self.master_dir.exists():
            return paper_groups

        for paper_dir, metadata in self._iter_master_papers():
            fingerprint = self._generate_paper_fingerprint(metadata)
            if not fingerprint:
                continue
            paper_groups.setdefault(fingerprint, []).append(paper_dir)

            # Also index by normalized title for cross-DOI duplicate detection
            title = metadata.get("title")
            title_norm = self._normalize_title(title) if title else ""
            if title_norm:
                known = fingerprints_by_title.setdefault(title_norm, [])
                if fingerprint not in known:
                    known.append(fingerprint)

        # Fingerprints absorbed into another group point at their new key, so
        # a later title referring to an absorbed fingerprint still finds it.
        merged_into: Dict[str, str] = {}

        def resolve(fp: str) -> str:
            while fp in merged_into:
                fp = merged_into[fp]
            return fp

        for fingerprints in fingerprints_by_title.values():
            roots: List[str] = []
            for fp in fingerprints:
                root = resolve(fp)
                if root not in roots:
                    roots.append(root)

            if len(roots) < 2:
                continue

            # Use the fingerprint with DOI if available, otherwise first one
            main_fp = next(
                (fp for fp in roots if fp.startswith("DOI:")), roots[0]
            )
            main_paths = paper_groups[main_fp]

            for fp in roots:
                if fp == main_fp:
                    continue
                # Remove the absorbed group so its directories are not
                # reported (and later processed) a second time.
                for path in paper_groups.pop(fp):
                    if path not in main_paths:
                        main_paths.append(path)
                merged_into[fp] = main_fp

        # Filter to only groups with duplicates
        duplicates = {
            fp: paths for fp, paths in paper_groups.items() if len(paths) > 1
        }

        if duplicates:
            total_dups = sum(len(paths) - 1 for paths in duplicates.values())
            logger.warning(f"Found {len(duplicates)} groups with {total_dups} duplicate papers")
        else:
            logger.info("No duplicates found")

        return duplicates

    def _generate_paper_fingerprint(self, metadata: Dict) -> Optional[str]:
        """Generate a fingerprint for paper comparison.

        Uses the normalized DOI (``"DOI:<doi>"``) if available, otherwise
        ``"META:<title>:<first author>:<year>"``.

        Returns:
            The fingerprint, or None when the metadata has neither a usable
            DOI nor a usable title.
        """
        # Prefer DOI as unique identifier.  A value that normalizes to
        # nothing (e.g. a bare "doi:") is ignored; otherwise all such papers
        # would share the fingerprint "DOI:".
        doi = metadata.get("doi")
        if doi:
            doi_norm = self._normalize_doi(doi)
            if doi_norm:
                return f"DOI:{doi_norm}"

        # Fallback to title+author+year
        title = metadata.get("title")
        if not title:
            return None

        title_norm = self._normalize_title(title)
        if not title_norm:
            # Punctuation-only / stop-word-only titles carry no identity.
            return None

        # First author may be a plain string or a dict with a "name" entry.
        authors = metadata.get("authors") or []
        if isinstance(authors, str):
            authors = [authors]
        first_author = ""
        if isinstance(authors, (list, tuple)) and authors:
            first = authors[0]
            if isinstance(first, dict):
                first = first.get("name", "")
            if isinstance(first, str):
                first_author = self._normalize_author(first)

        # A null year must compare equal to a missing year.
        year = metadata.get("year")
        year = "" if year is None else str(year)

        return f"META:{title_norm}:{first_author}:{year}"

    def _normalize_doi(self, doi: str) -> str:
        """Normalize DOI for comparison.

        Lower-cases the value and strips surrounding whitespace plus one
        leading ``doi:`` label or ``doi.org`` / ``dx.doi.org`` resolver URL
        (with or without ``http(s)://``).
        """
        if not doi:
            return ""
        doi = str(doi).strip().lower()
        doi = self._DOI_PREFIX_RE.sub("", doi, count=1)
        return doi.strip()

    def _normalize_title(self, title: str) -> str:
        """Normalize title for comparison.

        Lower-cases, drops every character that is neither a word character
        nor whitespace, collapses whitespace and removes common stop words.
        """
        if not title:
            return ""
        if not isinstance(title, str):
            title = str(title)
        # Remove special characters and normalize whitespace
        title = re.sub(r'[^\w\s]', '', title.lower())
        # Remove common words
        words = [w for w in title.split() if w not in self._STOP_WORDS]
        return ' '.join(words)

    def _normalize_author(self, author: str) -> str:
        """Normalize author name for comparison (lower-cased last name).

        ``"Last, First"`` yields the part before the comma; any other form
        yields the final whitespace-separated token.
        """
        if not author or not isinstance(author, str):
            return ""
        author = author.strip()
        if ',' in author:
            # Last, First format
            return author.split(',')[0].strip().lower()
        # First Last format
        parts = author.split()
        return parts[-1].lower() if parts else ""

    # ------------------------------------------------------------------
    # Merging
    # ------------------------------------------------------------------

    def merge_duplicate_papers(
        self,
        paper_dirs: List[Path],
        strategy: str = "best_metadata"
    ) -> Tuple[Optional[Path], List[Path]]:
        """Merge duplicate papers into one canonical entry.

        The directory with the highest metadata score is kept.  Its
        ``metadata.json`` is backed up next to it and then overwritten with
        the merged metadata, and PDFs, screenshots and logs missing from it
        are copied over from the other directories.  The other directories
        themselves are left in place.

        Args:
            paper_dirs: List of duplicate paper directories
            strategy: Merge strategy.  Only 'best_metadata' is implemented;
                any other value logs a warning and behaves the same way.

        Returns:
            Tuple of (kept_dir, removed_dirs).  With fewer than two input
            directories nothing is merged and the result is
            ``(paper_dirs[0] or None, [])``.
        """
        if len(paper_dirs) < 2:
            return (paper_dirs[0] if paper_dirs else None), []

        if strategy != "best_metadata":
            logger.warning(
                f"Merge strategy '{strategy}' is not implemented; using 'best_metadata'"
            )

        # Score each paper to determine which to keep
        scored_papers = []
        for paper_dir in paper_dirs:
            metadata = self._load_metadata(paper_dir)
            if metadata is None:
                logger.debug(f"Error scoring {paper_dir}: metadata unavailable")
                scored_papers.append((0, paper_dir, {}))
                continue
            score = self._score_paper_metadata(metadata, paper_dir)
            scored_papers.append((score, paper_dir, metadata))

        # Highest score first; the sort is stable, so ties keep input order.
        scored_papers.sort(key=lambda item: item[0], reverse=True)

        best_score, keep_dir, _ = scored_papers[0]
        remove_dirs = [paper_dir for _, paper_dir, _ in scored_papers[1:]]

        logger.info(f"Keeping {keep_dir.name} (score: {best_score})")
        logger.info(f"Will merge/remove: {[d.name for d in remove_dirs]}")

        # Merge metadata from all duplicates
        merged_metadata = self._merge_metadata(scored_papers)

        # Serialize before touching the file so a serialization error cannot
        # leave a truncated metadata.json behind.
        payload = json.dumps(merged_metadata, indent=2)

        metadata_file = keep_dir / self._METADATA_FILENAME
        if metadata_file.exists():
            # Backup original
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            shutil.copy2(metadata_file, keep_dir / f"metadata.backup.{stamp}.json")

        # Write merged metadata
        with open(metadata_file, 'w', encoding="utf-8") as f:
            f.write(payload)

        # Merge any PDFs or other files
        self._merge_files(keep_dir, remove_dirs)

        return keep_dir, remove_dirs

    def _score_paper_metadata(self, metadata: Dict, paper_dir: Path) -> int:
        """Score paper metadata quality for deduplication priority.

        Higher score = better metadata = should be kept.  Missing, null or
        non-numeric values simply contribute nothing.
        """
        import math

        score = 0

        # DOI is most important
        if metadata.get("doi"):
            score += 1000

        # Citation count (log scale to avoid extreme dominance), capped at 500
        citation_count = self._to_number(metadata.get("citation_count"))
        if citation_count > 0:
            score += min(int(math.log10(citation_count + 1) * 100), 500)

        # Impact factor, capped at 200
        impact_factor = self._to_number(metadata.get("impact_factor"))
        if impact_factor > 0:
            score += min(int(impact_factor * 10), 200)

        # Abstract
        if metadata.get("abstract"):
            score += 50

        # PDF exists
        if any(paper_dir.glob("*.pdf")):
            score += 100

        # Complete author list ("authors": null must not raise)
        authors = metadata.get("authors")
        if isinstance(authors, (list, tuple)) and len(authors) > 1:
            score += 20

        # Journal name
        if metadata.get("journal"):
            score += 30

        # URL
        if metadata.get("url"):
            score += 10

        # PDF URL
        if metadata.get("pdf_url"):
            score += 20

        # Publisher
        if metadata.get("publisher"):
            score += 10

        return score

    def _merge_metadata(self, scored_papers: List[Tuple[int, Path, Dict]]) -> Dict:
        """Merge metadata from multiple papers, keeping best values.

        Args:
            scored_papers: ``(score, paper_dir, metadata)`` tuples, best
                paper first.

        Returns:
            A copy of the best paper's metadata, extended with fields the
            other papers provide, the highest citation count and impact
            factor, and a ``_deduplication`` record describing the merge.
        """
        if not scored_papers:
            return {}

        # Start with best paper's metadata
        merged = scored_papers[0][2].copy()

        # Track sources for transparency
        merged["_deduplication"] = {
            "merged_from": [str(p[1].name) for p in scored_papers],
            "merge_timestamp": datetime.now().isoformat(),
            "scores": {str(p[1].name): p[0] for p in scored_papers}
        }

        for _, _, metadata in scored_papers[1:]:
            # Add missing fields
            for key, value in metadata.items():
                if key not in merged and value:
                    merged[key] = value

            # Take highest citation count
            new_cc = self._to_number(metadata.get("citation_count"))
            old_cc = self._to_number(merged.get("citation_count"))
            if new_cc > old_cc:
                merged["citation_count"] = metadata["citation_count"]
                merged["citation_count_source"] = metadata.get("citation_count_source", "merged")

            # Take highest impact factor
            new_if = self._to_number(metadata.get("impact_factor"))
            old_if = self._to_number(merged.get("impact_factor"))
            if new_if > old_if:
                merged["impact_factor"] = metadata["impact_factor"]
                merged["impact_factor_source"] = metadata.get("impact_factor_source", "merged")

            # Take DOI if the key exists but is empty
            if not merged.get("doi") and metadata.get("doi"):
                merged["doi"] = metadata["doi"]
                merged["doi_source"] = metadata.get("doi_source", "merged")

            # Take abstract if the key exists but is empty
            if not merged.get("abstract") and metadata.get("abstract"):
                merged["abstract"] = metadata["abstract"]
                merged["abstract_source"] = metadata.get("abstract_source", "merged")

        return merged

    def _copy_if_missing(self, source: Path, target_dir: Path) -> bool:
        """Copy *source* into *target_dir* unless that name already exists.

        Directories are copied recursively.  Returns True when a copy was
        made.
        """
        target = target_dir / source.name
        if target.exists():
            return False
        if source.is_dir():
            shutil.copytree(source, target)
        else:
            shutil.copy2(source, target)
        return True

    def _merge_files(self, keep_dir: Path, remove_dirs: List[Path]):
        """Merge files from duplicate directories.

        Copies PDFs and the contents of ``screenshots/`` and ``logs/`` from
        every directory in *remove_dirs* into *keep_dir*.  Existing files in
        *keep_dir* are never overwritten.
        """
        for remove_dir in remove_dirs:
            # Copy PDFs if not already present
            for pdf_file in remove_dir.glob("*.pdf"):
                if not (keep_dir / pdf_file.name).exists():
                    logger.info(f"Copying PDF: {pdf_file.name}")
                    self._copy_if_missing(pdf_file, keep_dir)

            # Merge screenshots and logs directories
            for subdir_name in ("screenshots", "logs"):
                source_subdir = remove_dir / subdir_name
                if not source_subdir.is_dir():
                    continue
                target_subdir = keep_dir / subdir_name
                target_subdir.mkdir(exist_ok=True)
                for entry in source_subdir.glob("*"):
                    self._copy_if_missing(entry, target_subdir)

    # ------------------------------------------------------------------
    # Library-wide deduplication
    # ------------------------------------------------------------------

    def deduplicate_library(self, dry_run: bool = True) -> Dict[str, int]:
        """Deduplicate entire MASTER library.

        In a real run each duplicate group is merged into its best entry,
        project symlinks are repointed to that entry, and the other
        directories are moved to ``<master_dir>/.deduplicated/<timestamp>/``
        rather than deleted.  Finally, broken symlinks in project
        directories are removed.

        Args:
            dry_run: If True, only report what would be done

        Returns:
            Statistics about deduplication
        """
        stats = {
            "groups_found": 0,
            "duplicates_found": 0,
            "duplicates_merged": 0,
            "dirs_removed": 0,
            "broken_symlinks_removed": 0,
            "errors": 0
        }

        # Find all duplicates
        duplicates = self.find_duplicate_papers()
        stats["groups_found"] = len(duplicates)
        stats["duplicates_found"] = sum(len(paths) - 1 for paths in duplicates.values())

        if not duplicates:
            logger.info("No duplicates to process")
            return stats

        if dry_run:
            logger.info("DRY RUN - no changes will be made")
            for fingerprint, paper_dirs in duplicates.items():
                logger.info(f"\nDuplicate group: {fingerprint}")
                for paper_dir in paper_dirs:
                    if not (paper_dir / self._METADATA_FILENAME).exists():
                        continue
                    metadata = self._load_metadata(paper_dir)
                    if metadata is None:
                        continue
                    cc = metadata.get("citation_count", 0)
                    doi = metadata.get("doi", "No DOI")
                    logger.info(f"  - {paper_dir.name}: CC={cc}, DOI={doi}")
        else:
            # One archive directory per run keeps everything removed by this
            # invocation together.
            dedup_dir = self.master_dir / ".deduplicated" / datetime.now().strftime("%Y%m%d_%H%M%S")

            for fingerprint, paper_dirs in duplicates.items():
                try:
                    logger.info(f"\nProcessing duplicate group: {fingerprint}")
                    keep_dir, remove_dirs = self.merge_duplicate_papers(paper_dirs)

                    # Repoint project symlinks BEFORE archiving.  If a move
                    # fails afterwards the links are already valid, so the
                    # broken-symlink cleanup below will not delete them.
                    self._update_project_symlinks(fingerprint, keep_dir, remove_dirs)

                    for remove_dir in remove_dirs:
                        # Move to .deduplicated directory instead of deleting
                        dedup_dir.mkdir(parents=True, exist_ok=True)

                        target = dedup_dir / remove_dir.name
                        logger.info(f"Moving {remove_dir.name} to {target}")
                        shutil.move(str(remove_dir), str(target))
                        stats["dirs_removed"] += 1

                    stats["duplicates_merged"] += len(remove_dirs)

                except Exception as e:
                    # Best effort: one failing group must not stop the run.
                    logger.error(f"Error processing group {fingerprint}: {e}")
                    stats["errors"] += 1

        # Clean up broken symlinks after deduplication
        if not dry_run:
            broken_count = self._cleanup_broken_symlinks()
            stats["broken_symlinks_removed"] = broken_count
            if broken_count > 0:
                logger.info(f"Removed {broken_count} broken symlinks")

        logger.info(f"\nDeduplication complete: {stats}")
        return stats

    def _iter_project_dirs(self):
        """Yield every directory directly under ``library_dir`` except MASTER."""
        if not self.library_dir.exists():
            return
        for project_dir in sorted(self.library_dir.iterdir()):
            if not project_dir.is_dir() or project_dir.name == "MASTER":
                continue
            yield project_dir

    def _cleanup_broken_symlinks(self) -> int:
        """Remove broken symlinks from all project directories.

        Only entries directly inside each project directory are checked.

        Returns:
            Number of broken symlinks removed
        """
        removed_count = 0

        for project_dir in self._iter_project_dirs():
            # Snapshot the listing: entries are unlinked while we walk it.
            for item in list(project_dir.iterdir()):
                if not item.is_symlink():
                    continue
                try:
                    item.resolve(strict=True)
                except (OSError, RuntimeError):
                    # Symlink is broken
                    logger.info(f"Removing broken symlink: {project_dir.name}/{item.name}")
                    item.unlink()
                    removed_count += 1

        return removed_count

    def _update_project_symlinks(self, fingerprint: str, keep_dir: Path, remove_dirs: List[Path]):
        """Update project symlinks after deduplication.

        Every symlink directly inside a project directory whose resolved
        target is named like one of *remove_dirs* is replaced by a relative
        link to ``../MASTER/<keep_dir.name>``.

        Args:
            fingerprint: Fingerprint of the group (currently unused).
            keep_dir: Directory that survives the merge.
            remove_dirs: Directories that were (or will be) archived.
        """
        removed_ids = {d.name for d in remove_dirs}
        if not removed_ids:
            return

        for project_dir in self._iter_project_dirs():
            # Snapshot the listing: links are replaced while we walk it.
            for symlink in list(project_dir.iterdir()):
                if not symlink.is_symlink():
                    continue
                try:
                    target = symlink.resolve()
                except (OSError, RuntimeError) as e:
                    # e.g. a symlink loop; leave it for the cleanup pass.
                    logger.debug(f"Cannot resolve symlink {symlink}: {e}")
                    continue

                if target.name in removed_ids:
                    # Update to point to kept directory
                    logger.info(f"Updating symlink: {symlink} -> {keep_dir}")
                    symlink.unlink()
                    symlink.symlink_to(Path("..") / "MASTER" / keep_dir.name)

    def check_for_existing_paper(self, metadata: Dict) -> Optional[Path]:
        """Check if a paper already exists in MASTER library.

        Only the fingerprint (DOI, or title + first author + year) is
        compared; title-only matching is not applied here.

        Args:
            metadata: Paper metadata to check

        Returns:
            Path to existing paper directory if found, None otherwise
        """
        if not self.master_dir.exists():
            return None

        # Generate fingerprint for the paper
        fingerprint = self._generate_paper_fingerprint(metadata)
        if not fingerprint:
            return None

        # Check all papers in MASTER
        for paper_dir, existing_metadata in self._iter_master_papers():
            if self._generate_paper_fingerprint(existing_metadata) == fingerprint:
                return paper_dir

        return None
|