scitex 2.5.0__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in the public registry.
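As an illustration only (not the registry's own tooling), a per-file "+added -removed" summary like the listing below can be approximated locally by diffing the text members of two wheel archives with the Python standard library. The local wheel filenames in the usage comment are assumptions.

```python
# Minimal sketch: compute a per-file "+added -removed" summary from two wheels.
# Uses only the standard library; this is not the tool that produced the listing above.
import difflib
import zipfile


def wheel_texts(wheel_path):
    """Return {member name: decoded text} for text-like files inside a wheel."""
    texts = {}
    with zipfile.ZipFile(wheel_path) as zf:
        for name in zf.namelist():
            if name.endswith((".py", ".md", ".yaml", ".json", ".txt")):
                texts[name] = zf.read(name).decode("utf-8", errors="replace")
    return texts


def summarize(old_wheel, new_wheel):
    """Print 'path +added -removed' for every member that differs between the wheels."""
    old, new = wheel_texts(old_wheel), wheel_texts(new_wheel)
    for name in sorted(set(old) | set(new)):
        diff = difflib.unified_diff(
            old.get(name, "").splitlines(),
            new.get(name, "").splitlines(),
            lineterm="",
        )
        added = removed = 0
        for line in diff:
            if line.startswith("+") and not line.startswith("+++"):
                added += 1
            elif line.startswith("-") and not line.startswith("---"):
                removed += 1
        if added or removed:
            print(f"{name} +{added} -{removed}")


# Example invocation (hypothetical local filenames):
# summarize("scitex-2.5.0-py3-none-any.whl", "scitex-2.7.0-py3-none-any.whl")
```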
- scitex/__init__.py +19 -8
- scitex/__main__.py +2 -1
- scitex/__version__.py +1 -1
- scitex/_optional_deps.py +13 -20
- scitex/ai/__init__.py +5 -0
- scitex/ai/_gen_ai/_Anthropic.py +3 -1
- scitex/ai/_gen_ai/_BaseGenAI.py +3 -2
- scitex/ai/_gen_ai/_DeepSeek.py +1 -1
- scitex/ai/_gen_ai/_Google.py +3 -2
- scitex/ai/_gen_ai/_Llama.py +4 -2
- scitex/ai/_gen_ai/_OpenAI.py +3 -1
- scitex/ai/_gen_ai/_PARAMS.py +1 -0
- scitex/ai/_gen_ai/_Perplexity.py +3 -1
- scitex/ai/_gen_ai/__init__.py +1 -0
- scitex/ai/_gen_ai/_format_output_func.py +3 -1
- scitex/ai/classification/CrossValidationExperiment.py +8 -14
- scitex/ai/classification/examples/timeseries_cv_demo.py +128 -112
- scitex/ai/classification/reporters/_BaseClassificationReporter.py +2 -0
- scitex/ai/classification/reporters/_ClassificationReporter.py +30 -45
- scitex/ai/classification/reporters/_MultiClassificationReporter.py +8 -11
- scitex/ai/classification/reporters/_SingleClassificationReporter.py +126 -182
- scitex/ai/classification/reporters/__init__.py +1 -1
- scitex/ai/classification/reporters/reporter_utils/_Plotter.py +213 -119
- scitex/ai/classification/reporters/reporter_utils/__init__.py +28 -36
- scitex/ai/classification/reporters/reporter_utils/aggregation.py +125 -143
- scitex/ai/classification/reporters/reporter_utils/data_models.py +128 -120
- scitex/ai/classification/reporters/reporter_utils/reporting.py +507 -340
- scitex/ai/classification/reporters/reporter_utils/storage.py +4 -1
- scitex/ai/classification/reporters/reporter_utils/validation.py +141 -154
- scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +204 -129
- scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +215 -171
- scitex/ai/classification/timeseries/_TimeSeriesMetadata.py +17 -17
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +67 -143
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +67 -143
- scitex/ai/classification/timeseries/_TimeSeriesStrategy.py +12 -13
- scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +231 -144
- scitex/ai/classification/timeseries/__init__.py +2 -4
- scitex/ai/classification/timeseries/_normalize_timestamp.py +3 -0
- scitex/ai/clustering/_pca.py +0 -1
- scitex/ai/clustering/_umap.py +1 -2
- scitex/ai/feature_extraction/__init__.py +10 -8
- scitex/ai/feature_extraction/vit.py +0 -1
- scitex/ai/feature_selection/feature_selection.py +3 -8
- scitex/ai/metrics/_calc_conf_mat.py +2 -0
- scitex/ai/metrics/_calc_feature_importance.py +3 -7
- scitex/ai/metrics/_calc_pre_rec_auc.py +5 -5
- scitex/ai/metrics/_calc_roc_auc.py +4 -2
- scitex/ai/metrics/_calc_seizure_prediction_metrics.py +35 -20
- scitex/ai/metrics/_calc_silhouette_score.py +1 -3
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger.py +0 -3
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger2020.py +0 -3
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger913A.py +0 -3
- scitex/ai/optim/_optimizers.py +1 -1
- scitex/ai/plt/__init__.py +6 -1
- scitex/ai/plt/_plot_feature_importance.py +1 -3
- scitex/ai/plt/_plot_learning_curve.py +9 -24
- scitex/ai/plt/_plot_optuna_study.py +4 -3
- scitex/ai/plt/_plot_pre_rec_curve.py +9 -15
- scitex/ai/plt/_plot_roc_curve.py +6 -8
- scitex/ai/plt/_stx_conf_mat.py +121 -122
- scitex/ai/sampling/undersample.py +3 -2
- scitex/ai/sklearn/__init__.py +2 -2
- scitex/ai/training/_LearningCurveLogger.py +23 -10
- scitex/ai/utils/_check_params.py +0 -1
- scitex/benchmark/__init__.py +15 -25
- scitex/benchmark/benchmark.py +124 -117
- scitex/benchmark/monitor.py +117 -107
- scitex/benchmark/profiler.py +61 -58
- scitex/bridge/__init__.py +110 -0
- scitex/bridge/_helpers.py +149 -0
- scitex/bridge/_plt_vis.py +529 -0
- scitex/bridge/_protocol.py +283 -0
- scitex/bridge/_stats_plt.py +261 -0
- scitex/bridge/_stats_vis.py +265 -0
- scitex/browser/__init__.py +0 -2
- scitex/browser/auth/__init__.py +0 -0
- scitex/browser/auth/google.py +16 -11
- scitex/browser/automation/CookieHandler.py +2 -3
- scitex/browser/collaboration/__init__.py +3 -0
- scitex/browser/collaboration/auth_helpers.py +3 -1
- scitex/browser/collaboration/collaborative_agent.py +2 -0
- scitex/browser/collaboration/interactive_panel.py +2 -2
- scitex/browser/collaboration/shared_session.py +20 -11
- scitex/browser/collaboration/standard_interactions.py +1 -0
- scitex/browser/core/BrowserMixin.py +12 -30
- scitex/browser/core/ChromeProfileManager.py +9 -24
- scitex/browser/debugging/_browser_logger.py +15 -25
- scitex/browser/debugging/_failure_capture.py +9 -2
- scitex/browser/debugging/_highlight_element.py +15 -6
- scitex/browser/debugging/_show_grid.py +5 -6
- scitex/browser/debugging/_sync_session.py +4 -3
- scitex/browser/debugging/_test_monitor.py +14 -5
- scitex/browser/debugging/_visual_cursor.py +46 -35
- scitex/browser/interaction/click_center.py +4 -3
- scitex/browser/interaction/click_with_fallbacks.py +7 -10
- scitex/browser/interaction/close_popups.py +79 -66
- scitex/browser/interaction/fill_with_fallbacks.py +8 -8
- scitex/browser/pdf/__init__.py +3 -1
- scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +11 -10
- scitex/browser/pdf/detect_chrome_pdf_viewer.py +3 -6
- scitex/browser/remote/CaptchaHandler.py +109 -96
- scitex/browser/remote/ZenRowsAPIClient.py +91 -97
- scitex/browser/remote/ZenRowsBrowserManager.py +138 -112
- scitex/browser/stealth/HumanBehavior.py +4 -9
- scitex/browser/stealth/StealthManager.py +11 -26
- scitex/capture/__init__.py +17 -17
- scitex/capture/__main__.py +2 -3
- scitex/capture/capture.py +23 -51
- scitex/capture/cli.py +14 -39
- scitex/capture/gif.py +5 -9
- scitex/capture/mcp_server.py +7 -20
- scitex/capture/session.py +4 -3
- scitex/capture/utils.py +18 -53
- scitex/cli/__init__.py +1 -1
- scitex/cli/cloud.py +158 -116
- scitex/cli/config.py +224 -0
- scitex/cli/main.py +41 -40
- scitex/cli/scholar.py +60 -27
- scitex/cli/security.py +14 -20
- scitex/cli/web.py +87 -90
- scitex/cli/writer.py +51 -45
- scitex/cloud/__init__.py +14 -11
- scitex/cloud/_matplotlib_hook.py +6 -6
- scitex/config/README.md +313 -0
- scitex/config/{PriorityConfig.py → _PriorityConfig.py} +114 -17
- scitex/config/_ScitexConfig.py +319 -0
- scitex/config/__init__.py +41 -9
- scitex/config/_paths.py +325 -0
- scitex/config/default.yaml +81 -0
- scitex/context/_suppress_output.py +2 -3
- scitex/db/_BaseMixins/_BaseBackupMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseBatchMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseBlobMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseImportExportMixin.py +1 -3
- scitex/db/_BaseMixins/_BaseIndexMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseMaintenanceMixin.py +1 -3
- scitex/db/_BaseMixins/_BaseQueryMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseRowMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseTableMixin.py +3 -1
- scitex/db/_BaseMixins/_BaseTransactionMixin.py +1 -3
- scitex/db/_BaseMixins/__init__.py +1 -1
- scitex/db/__init__.py +9 -1
- scitex/db/__main__.py +8 -21
- scitex/db/_check_health.py +15 -31
- scitex/db/_delete_duplicates.py +7 -4
- scitex/db/_inspect.py +22 -38
- scitex/db/_inspect_optimized.py +89 -85
- scitex/db/_postgresql/_PostgreSQL.py +0 -1
- scitex/db/_postgresql/_PostgreSQLMixins/_BlobMixin.py +3 -1
- scitex/db/_postgresql/_PostgreSQLMixins/_ConnectionMixin.py +1 -3
- scitex/db/_postgresql/_PostgreSQLMixins/_ImportExportMixin.py +1 -3
- scitex/db/_postgresql/_PostgreSQLMixins/_MaintenanceMixin.py +1 -4
- scitex/db/_postgresql/_PostgreSQLMixins/_QueryMixin.py +3 -3
- scitex/db/_postgresql/_PostgreSQLMixins/_RowMixin.py +3 -1
- scitex/db/_postgresql/_PostgreSQLMixins/_TransactionMixin.py +1 -3
- scitex/db/_postgresql/__init__.py +1 -1
- scitex/db/_sqlite3/_SQLite3.py +2 -4
- scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin.py +11 -12
- scitex/db/_sqlite3/_SQLite3Mixins/_ArrayMixin_v01-need-_hash-col.py +19 -14
- scitex/db/_sqlite3/_SQLite3Mixins/_BatchMixin.py +3 -1
- scitex/db/_sqlite3/_SQLite3Mixins/_BlobMixin.py +7 -7
- scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin.py +118 -111
- scitex/db/_sqlite3/_SQLite3Mixins/_ConnectionMixin.py +8 -10
- scitex/db/_sqlite3/_SQLite3Mixins/_GitMixin.py +17 -45
- scitex/db/_sqlite3/_SQLite3Mixins/_ImportExportMixin.py +1 -3
- scitex/db/_sqlite3/_SQLite3Mixins/_IndexMixin.py +3 -1
- scitex/db/_sqlite3/_SQLite3Mixins/_QueryMixin.py +3 -4
- scitex/db/_sqlite3/_SQLite3Mixins/_RowMixin.py +9 -9
- scitex/db/_sqlite3/_SQLite3Mixins/_TableMixin.py +18 -11
- scitex/db/_sqlite3/_SQLite3Mixins/__init__.py +1 -0
- scitex/db/_sqlite3/__init__.py +1 -1
- scitex/db/_sqlite3/_delete_duplicates.py +13 -11
- scitex/decorators/__init__.py +29 -4
- scitex/decorators/_auto_order.py +43 -43
- scitex/decorators/_batch_fn.py +12 -6
- scitex/decorators/_cache_disk.py +8 -9
- scitex/decorators/_cache_disk_async.py +8 -7
- scitex/decorators/_combined.py +19 -13
- scitex/decorators/_converters.py +16 -3
- scitex/decorators/_deprecated.py +32 -22
- scitex/decorators/_numpy_fn.py +18 -4
- scitex/decorators/_pandas_fn.py +17 -5
- scitex/decorators/_signal_fn.py +17 -3
- scitex/decorators/_torch_fn.py +32 -15
- scitex/decorators/_xarray_fn.py +23 -9
- scitex/dev/_analyze_code_flow.py +0 -2
- scitex/dict/_DotDict.py +15 -19
- scitex/dict/_flatten.py +1 -0
- scitex/dict/_listed_dict.py +1 -0
- scitex/dict/_pop_keys.py +1 -0
- scitex/dict/_replace.py +1 -0
- scitex/dict/_safe_merge.py +1 -0
- scitex/dict/_to_str.py +2 -3
- scitex/dsp/__init__.py +13 -4
- scitex/dsp/_crop.py +3 -1
- scitex/dsp/_detect_ripples.py +3 -1
- scitex/dsp/_modulation_index.py +3 -1
- scitex/dsp/_time.py +3 -1
- scitex/dsp/_wavelet.py +0 -1
- scitex/dsp/example.py +0 -5
- scitex/dsp/filt.py +4 -0
- scitex/dsp/utils/__init__.py +4 -1
- scitex/dsp/utils/pac.py +3 -3
- scitex/dt/_normalize_timestamp.py +4 -1
- scitex/errors.py +3 -6
- scitex/etc/__init__.py +1 -1
- scitex/gen/_DimHandler.py +6 -6
- scitex/gen/__init__.py +5 -1
- scitex/gen/_deprecated_close.py +1 -0
- scitex/gen/_deprecated_start.py +5 -3
- scitex/gen/_detect_environment.py +44 -41
- scitex/gen/_detect_notebook_path.py +51 -47
- scitex/gen/_embed.py +1 -1
- scitex/gen/_get_notebook_path.py +81 -62
- scitex/gen/_inspect_module.py +0 -1
- scitex/gen/_norm.py +16 -7
- scitex/gen/_norm_cache.py +78 -65
- scitex/gen/_print_config.py +0 -3
- scitex/gen/_src.py +2 -3
- scitex/gen/_title_case.py +3 -2
- scitex/gen/_to_even.py +8 -8
- scitex/gen/_transpose.py +3 -3
- scitex/gen/misc.py +0 -3
- scitex/gists/_SigMacro_processFigure_S.py +2 -2
- scitex/gists/_SigMacro_toBlue.py +2 -2
- scitex/gists/__init__.py +4 -1
- scitex/git/_branch.py +19 -11
- scitex/git/_clone.py +23 -15
- scitex/git/_commit.py +10 -12
- scitex/git/_init.py +15 -38
- scitex/git/_remote.py +9 -3
- scitex/git/_result.py +3 -0
- scitex/git/_retry.py +2 -5
- scitex/git/_types.py +4 -0
- scitex/git/_validation.py +8 -8
- scitex/git/_workflow.py +4 -4
- scitex/io/__init__.py +2 -1
- scitex/io/_glob.py +2 -2
- scitex/io/_json2md.py +3 -3
- scitex/io/_load.py +6 -8
- scitex/io/_load_cache.py +71 -71
- scitex/io/_load_configs.py +2 -3
- scitex/io/_load_modules/_H5Explorer.py +6 -12
- scitex/io/_load_modules/_ZarrExplorer.py +3 -3
- scitex/io/_load_modules/_bibtex.py +62 -63
- scitex/io/_load_modules/_canvas.py +4 -9
- scitex/io/_load_modules/_catboost.py +7 -2
- scitex/io/_load_modules/_hdf5.py +2 -0
- scitex/io/_load_modules/_image.py +5 -1
- scitex/io/_load_modules/_matlab.py +3 -1
- scitex/io/_load_modules/_optuna.py +0 -1
- scitex/io/_load_modules/_pdf.py +38 -29
- scitex/io/_load_modules/_sqlite3.py +1 -0
- scitex/io/_load_modules/_txt.py +2 -0
- scitex/io/_load_modules/_xml.py +9 -9
- scitex/io/_load_modules/_zarr.py +12 -10
- scitex/io/_metadata.py +76 -37
- scitex/io/_qr_utils.py +18 -13
- scitex/io/_save.py +220 -63
- scitex/io/_save_modules/__init__.py +7 -2
- scitex/io/_save_modules/_bibtex.py +66 -61
- scitex/io/_save_modules/_canvas.py +5 -6
- scitex/io/_save_modules/_catboost.py +2 -2
- scitex/io/_save_modules/_csv.py +4 -4
- scitex/io/_save_modules/_excel.py +5 -9
- scitex/io/_save_modules/_hdf5.py +9 -21
- scitex/io/_save_modules/_html.py +5 -5
- scitex/io/_save_modules/_image.py +105 -8
- scitex/io/_save_modules/_joblib.py +2 -2
- scitex/io/_save_modules/_json.py +51 -6
- scitex/io/_save_modules/_listed_dfs_as_csv.py +2 -1
- scitex/io/_save_modules/_listed_scalars_as_csv.py +2 -1
- scitex/io/_save_modules/_matlab.py +2 -2
- scitex/io/_save_modules/_numpy.py +6 -8
- scitex/io/_save_modules/_pickle.py +4 -4
- scitex/io/_save_modules/_plotly.py +3 -3
- scitex/io/_save_modules/_tex.py +23 -25
- scitex/io/_save_modules/_text.py +2 -2
- scitex/io/_save_modules/_yaml.py +9 -9
- scitex/io/_save_modules/_zarr.py +15 -15
- scitex/io/utils/__init__.py +2 -1
- scitex/io/utils/h5_to_zarr.py +173 -155
- scitex/linalg/__init__.py +1 -1
- scitex/linalg/_geometric_median.py +4 -3
- scitex/logging/_Tee.py +5 -7
- scitex/logging/__init__.py +18 -19
- scitex/logging/_config.py +4 -1
- scitex/logging/_context.py +6 -5
- scitex/logging/_formatters.py +2 -3
- scitex/logging/_handlers.py +19 -20
- scitex/logging/_levels.py +9 -17
- scitex/logging/_logger.py +74 -15
- scitex/logging/_print_capture.py +17 -17
- scitex/nn/_BNet.py +1 -3
- scitex/nn/_Filters.py +6 -2
- scitex/nn/_ModulationIndex.py +3 -1
- scitex/nn/_PAC.py +3 -2
- scitex/nn/_PSD.py +0 -1
- scitex/nn/__init__.py +16 -3
- scitex/path/_clean.py +10 -8
- scitex/path/_find.py +1 -1
- scitex/path/_get_spath.py +1 -2
- scitex/path/_mk_spath.py +1 -1
- scitex/path/_symlink.py +5 -10
- scitex/pd/__init__.py +4 -1
- scitex/pd/_force_df.py +24 -24
- scitex/pd/_get_unique.py +1 -0
- scitex/pd/_merge_columns.py +1 -1
- scitex/pd/_round.py +11 -7
- scitex/pd/_to_xy.py +0 -1
- scitex/plt/REQUESTS.md +191 -0
- scitex/plt/__init__.py +185 -87
- scitex/plt/_subplots/_AxesWrapper.py +22 -6
- scitex/plt/_subplots/_AxisWrapper.py +100 -39
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +74 -52
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +183 -73
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +61 -45
- scitex/plt/_subplots/_AxisWrapperMixins/_TrackingMixin.py +26 -14
- scitex/plt/_subplots/_AxisWrapperMixins/_UnitAwareMixin.py +80 -73
- scitex/plt/_subplots/_FigWrapper.py +93 -60
- scitex/plt/_subplots/_SubplotsWrapper.py +135 -68
- scitex/plt/_subplots/__init__.py +10 -0
- scitex/plt/_subplots/_export_as_csv.py +89 -47
- scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +1 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_annotate.py +6 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_bar.py +88 -38
- scitex/plt/_subplots/_export_as_csv_formatters/_format_barh.py +25 -31
- scitex/plt/_subplots/_export_as_csv_formatters/_format_boxplot.py +53 -23
- scitex/plt/_subplots/_export_as_csv_formatters/_format_contour.py +38 -25
- scitex/plt/_subplots/_export_as_csv_formatters/_format_contourf.py +17 -9
- scitex/plt/_subplots/_export_as_csv_formatters/_format_errorbar.py +70 -124
- scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +12 -10
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill.py +31 -17
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill_between.py +33 -21
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hexbin.py +14 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hist.py +43 -29
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hist2d.py +14 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow.py +27 -11
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +7 -5
- scitex/plt/_subplots/_export_as_csv_formatters/_format_matshow.py +9 -7
- scitex/plt/_subplots/_export_as_csv_formatters/_format_pie.py +15 -6
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +85 -46
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_box.py +52 -27
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_imshow.py +1 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +16 -17
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter.py +7 -5
- scitex/plt/_subplots/_export_as_csv_formatters/_format_quiver.py +10 -8
- scitex/plt/_subplots/_export_as_csv_formatters/_format_scatter.py +17 -6
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_barplot.py +43 -26
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_boxplot.py +68 -47
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_heatmap.py +52 -64
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_histplot.py +55 -50
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +9 -11
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_kdeplot.py +63 -29
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +4 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +6 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_scatterplot.py +44 -40
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_stripplot.py +46 -39
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_swarmplot.py +46 -39
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_violinplot.py +75 -94
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stem.py +12 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_step.py +12 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_streamplot.py +10 -8
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_conf_mat.py +17 -15
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_ecdf.py +10 -9
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_fillv.py +35 -31
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_heatmap.py +18 -18
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_image.py +24 -18
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_joyplot.py +9 -7
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_line.py +34 -23
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_mean_ci.py +15 -13
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_mean_std.py +12 -10
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_median_iqr.py +15 -13
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_raster.py +11 -9
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_rectangle.py +84 -56
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_scatter_hist.py +35 -32
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_shaded_line.py +46 -30
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_violin.py +51 -51
- scitex/plt/_subplots/_export_as_csv_formatters/_format_text.py +32 -31
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violin.py +34 -31
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violinplot.py +44 -37
- scitex/plt/_subplots/_export_as_csv_formatters/verify_formatters.py +91 -74
- scitex/plt/_tpl.py +6 -5
- scitex/plt/ax/_plot/__init__.py +24 -0
- scitex/plt/ax/_plot/_add_fitted_line.py +12 -11
- scitex/plt/ax/_plot/_plot_circular_hist.py +3 -1
- scitex/plt/ax/_plot/_plot_statistical_shaded_line.py +25 -19
- scitex/plt/ax/_plot/_stx_conf_mat.py +6 -3
- scitex/plt/ax/_plot/_stx_ecdf.py +5 -3
- scitex/plt/ax/_plot/_stx_fillv.py +4 -2
- scitex/plt/ax/_plot/_stx_heatmap.py +7 -4
- scitex/plt/ax/_plot/_stx_image.py +7 -5
- scitex/plt/ax/_plot/_stx_joyplot.py +32 -10
- scitex/plt/ax/_plot/_stx_raster.py +26 -11
- scitex/plt/ax/_plot/_stx_rectangle.py +2 -2
- scitex/plt/ax/_plot/_stx_shaded_line.py +15 -11
- scitex/plt/ax/_plot/_stx_violin.py +3 -1
- scitex/plt/ax/_style/_add_marginal_ax.py +6 -4
- scitex/plt/ax/_style/_auto_scale_axis.py +14 -10
- scitex/plt/ax/_style/_extend.py +3 -1
- scitex/plt/ax/_style/_force_aspect.py +5 -3
- scitex/plt/ax/_style/_format_units.py +2 -2
- scitex/plt/ax/_style/_hide_spines.py +5 -1
- scitex/plt/ax/_style/_map_ticks.py +5 -3
- scitex/plt/ax/_style/_rotate_labels.py +5 -4
- scitex/plt/ax/_style/_rotate_labels_v01.py +73 -63
- scitex/plt/ax/_style/_set_log_scale.py +120 -85
- scitex/plt/ax/_style/_set_meta.py +99 -76
- scitex/plt/ax/_style/_set_supxyt.py +33 -16
- scitex/plt/ax/_style/_set_xyt.py +27 -18
- scitex/plt/ax/_style/_share_axes.py +15 -5
- scitex/plt/ax/_style/_show_spines.py +58 -57
- scitex/plt/ax/_style/_style_barplot.py +1 -1
- scitex/plt/ax/_style/_style_boxplot.py +25 -14
- scitex/plt/ax/_style/_style_errorbar.py +0 -0
- scitex/plt/ax/_style/_style_scatter.py +1 -1
- scitex/plt/ax/_style/_style_suptitles.py +3 -3
- scitex/plt/ax/_style/_style_violinplot.py +8 -2
- scitex/plt/color/__init__.py +34 -2
- scitex/plt/color/_add_hue_col.py +1 -0
- scitex/plt/color/_colors.py +0 -1
- scitex/plt/color/_get_colors_from_conf_matap.py +3 -1
- scitex/plt/color/_vizualize_colors.py +0 -1
- scitex/plt/docs/FIGURE_ARCHITECTURE.md +155 -97
- scitex/plt/gallery/README.md +75 -0
- scitex/plt/gallery/__init__.py +29 -0
- scitex/plt/gallery/_generate.py +153 -0
- scitex/plt/gallery/_plots.py +594 -0
- scitex/plt/gallery/_registry.py +153 -0
- scitex/plt/styles/__init__.py +9 -9
- scitex/plt/styles/_plot_defaults.py +62 -61
- scitex/plt/styles/_plot_postprocess.py +126 -77
- scitex/plt/styles/_style_loader.py +0 -0
- scitex/plt/styles/presets.py +43 -18
- scitex/plt/templates/research-master/scitex/vis/gallery/area/fill_between.json +110 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/area/fill_betweenx.json +88 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/area/stx_fill_between.json +103 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/area/stx_fillv.json +106 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/bar.json +92 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/barh.json +92 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/boxplot.json +92 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_bar.json +84 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_barh.json +84 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_box.json +83 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_boxplot.json +93 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_violin.json +91 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_violinplot.json +91 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/violinplot.json +91 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/contour/contour.json +97 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/contour/contourf.json +98 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/contour/stx_contour.json +84 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/hist.json +101 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/hist2d.json +96 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/stx_ecdf.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/stx_joyplot.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/stx_kde.json +93 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/imshow.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/matshow.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_conf_mat.json +83 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_heatmap.json +92 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_image.json +121 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_imshow.json +84 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/line/plot.json +110 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/line/step.json +92 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/line/stx_line.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/line/stx_shaded_line.json +96 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/hexbin.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/scatter.json +95 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/stem.json +92 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/stx_scatter.json +84 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/special/pie.json +94 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/special/stx_raster.json +109 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/special/stx_rectangle.json +108 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/errorbar.json +93 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_errorbar.json +84 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_mean_ci.json +96 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_mean_std.json +96 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_median_iqr.json +96 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/vector/quiver.json +99 -0
- scitex/plt/templates/research-master/scitex/vis/gallery/vector/streamplot.json +100 -0
- scitex/plt/utils/__init__.py +29 -2
- scitex/plt/utils/_close.py +8 -3
- scitex/plt/utils/_collect_figure_metadata.py +3031 -265
- scitex/plt/utils/_colorbar.py +15 -17
- scitex/plt/utils/_configure_mpl.py +22 -14
- scitex/plt/utils/_crop.py +60 -27
- scitex/plt/utils/_csv_column_naming.py +123 -72
- scitex/plt/utils/_dimension_viewer.py +7 -19
- scitex/plt/utils/_figure_from_axes_mm.py +70 -16
- scitex/plt/utils/_figure_mm.py +3 -2
- scitex/plt/utils/_get_actual_font.py +5 -4
- scitex/plt/utils/_histogram_utils.py +52 -48
- scitex/plt/utils/_is_valid_axis.py +19 -13
- scitex/plt/utils/_mk_colorbar.py +3 -3
- scitex/plt/utils/_scientific_captions.py +202 -139
- scitex/plt/utils/_scitex_config.py +98 -98
- scitex/plt/utils/_units.py +0 -0
- scitex/plt/utils/metadata/__init__.py +36 -0
- scitex/plt/utils/metadata/_artist_extraction.py +119 -0
- scitex/plt/utils/metadata/_axes_metadata.py +93 -0
- scitex/plt/utils/metadata/_collection_artists.py +292 -0
- scitex/plt/utils/metadata/_core.py +208 -0
- scitex/plt/utils/metadata/_csv_column_extraction.py +186 -0
- scitex/plt/utils/metadata/_csv_hash.py +115 -0
- scitex/plt/utils/metadata/_csv_verification.py +95 -0
- scitex/plt/utils/metadata/_data_linkage.py +263 -0
- scitex/plt/utils/metadata/_dimensions.py +239 -0
- scitex/plt/utils/metadata/_figure_metadata.py +58 -0
- scitex/plt/utils/metadata/_image_text_artists.py +168 -0
- scitex/plt/utils/metadata/_label_parsing.py +82 -0
- scitex/plt/utils/metadata/_legend_extraction.py +120 -0
- scitex/plt/utils/metadata/_line_artists.py +367 -0
- scitex/plt/utils/metadata/_line_semantic_handling.py +173 -0
- scitex/plt/utils/metadata/_patch_artists.py +211 -0
- scitex/plt/utils/metadata/_plot_content.py +26 -0
- scitex/plt/utils/metadata/_plot_type_detection.py +184 -0
- scitex/plt/utils/metadata/_precision.py +134 -0
- scitex/plt/utils/metadata/_precision_config.py +68 -0
- scitex/plt/utils/metadata/_precision_sections.py +211 -0
- scitex/plt/utils/metadata/_recipe_extraction.py +267 -0
- scitex/plt/utils/metadata/_style_parsing.py +174 -0
- scitex/repro/_RandomStateManager.py +33 -38
- scitex/repro/__init__.py +16 -7
- scitex/repro/_gen_ID.py +7 -9
- scitex/repro/_gen_timestamp.py +7 -6
- scitex/repro/_hash_array.py +8 -12
- scitex/reproduce/__init__.py +1 -1
- scitex/resource/_get_processor_usages.py +3 -1
- scitex/resource/_log_processor_usages.py +3 -1
- scitex/rng/__init__.py +1 -1
- scitex/schema/README.md +178 -0
- scitex/schema/__init__.py +144 -0
- scitex/schema/_canvas.py +444 -0
- scitex/schema/_stats.py +762 -0
- scitex/schema/_validation.py +590 -0
- scitex/scholar/.legacy/Scholar.py +5 -12
- scitex/scholar/.legacy/_Scholar.py +66 -99
- scitex/scholar/.legacy/_ScholarAPI.py +75 -66
- scitex/scholar/.legacy/_tmp/search_engine/_BaseSearchEngine.py +3 -3
- scitex/scholar/.legacy/_tmp/search_engine/_UnifiedSearcher.py +4 -9
- scitex/scholar/.legacy/_tmp/search_engine/__init__.py +14 -21
- scitex/scholar/.legacy/_tmp/search_engine/local/_LocalSearchEngine.py +40 -37
- scitex/scholar/.legacy/_tmp/search_engine/local/_VectorSearchEngine.py +31 -28
- scitex/scholar/.legacy/_tmp/search_engine/web/_ArxivSearchEngine.py +74 -65
- scitex/scholar/.legacy/_tmp/search_engine/web/_CrossRefSearchEngine.py +122 -116
- scitex/scholar/.legacy/_tmp/search_engine/web/_GoogleScholarSearchEngine.py +65 -59
- scitex/scholar/.legacy/_tmp/search_engine/web/_PubMedSearchEngine.py +121 -107
- scitex/scholar/.legacy/_tmp/search_engine/web/_SemanticScholarSearchEngine.py +5 -12
- scitex/scholar/.legacy/database/_DatabaseEntry.py +49 -45
- scitex/scholar/.legacy/database/_DatabaseIndex.py +131 -94
- scitex/scholar/.legacy/database/_LibraryManager.py +65 -63
- scitex/scholar/.legacy/database/_PaperDatabase.py +138 -124
- scitex/scholar/.legacy/database/_ScholarDatabaseIntegration.py +14 -36
- scitex/scholar/.legacy/database/_StorageIntegratedDB.py +192 -156
- scitex/scholar/.legacy/database/_ZoteroCompatibleDB.py +300 -237
- scitex/scholar/.legacy/database/__init__.py +2 -1
- scitex/scholar/.legacy/database/manage.py +92 -84
- scitex/scholar/.legacy/lookup/_LookupIndex.py +157 -101
- scitex/scholar/.legacy/lookup/__init__.py +2 -1
- scitex/scholar/.legacy/metadata/doi/batch/_MetadataHandlerForBatchDOIResolution.py +4 -9
- scitex/scholar/.legacy/metadata/doi/batch/_ProgressManagerForBatchDOIResolution.py +10 -23
- scitex/scholar/.legacy/metadata/doi/batch/_SourceStatsManagerForBatchDOIResolution.py +4 -9
- scitex/scholar/.legacy/metadata/doi/batch/__init__.py +3 -1
- scitex/scholar/.legacy/metadata/doi/resolvers/_BatchDOIResolver.py +10 -25
- scitex/scholar/.legacy/metadata/doi/resolvers/_BibTeXDOIResolver.py +19 -49
- scitex/scholar/.legacy/metadata/doi/resolvers/_DOIResolver.py +1 -0
- scitex/scholar/.legacy/metadata/doi/resolvers/_SingleDOIResolver.py +8 -20
- scitex/scholar/.legacy/metadata/doi/sources/.combined-SemanticScholarSource/_SemanticScholarSource.py +37 -35
- scitex/scholar/.legacy/metadata/doi/sources/.combined-SemanticScholarSource/_SemanticScholarSourceEnhanced.py +49 -37
- scitex/scholar/.legacy/metadata/doi/sources/_ArXivSource.py +11 -30
- scitex/scholar/.legacy/metadata/doi/sources/_BaseDOISource.py +19 -47
- scitex/scholar/.legacy/metadata/doi/sources/_CrossRefLocalSource.py +1 -0
- scitex/scholar/.legacy/metadata/doi/sources/_CrossRefSource.py +12 -33
- scitex/scholar/.legacy/metadata/doi/sources/_OpenAlexSource.py +8 -20
- scitex/scholar/.legacy/metadata/doi/sources/_PubMedSource.py +10 -27
- scitex/scholar/.legacy/metadata/doi/sources/_SemanticScholarSource.py +11 -29
- scitex/scholar/.legacy/metadata/doi/sources/_SourceManager.py +8 -21
- scitex/scholar/.legacy/metadata/doi/sources/_SourceResolutionStrategy.py +24 -55
- scitex/scholar/.legacy/metadata/doi/sources/_SourceRotationManager.py +8 -21
- scitex/scholar/.legacy/metadata/doi/sources/_URLDOISource.py +9 -16
- scitex/scholar/.legacy/metadata/doi/sources/_UnifiedSource.py +8 -22
- scitex/scholar/.legacy/metadata/doi/sources/__init__.py +1 -0
- scitex/scholar/.legacy/metadata/doi/utils/_PubMedConverter.py +4 -8
- scitex/scholar/.legacy/metadata/doi/utils/_RateLimitHandler.py +17 -43
- scitex/scholar/.legacy/metadata/doi/utils/_TextNormalizer.py +8 -18
- scitex/scholar/.legacy/metadata/doi/utils/_URLDOIExtractor.py +4 -8
- scitex/scholar/.legacy/metadata/doi/utils/__init__.py +1 -0
- scitex/scholar/.legacy/metadata/doi/utils/_to_complete_metadata_structure.py +1 -0
- scitex/scholar/.legacy/metadata/enrichment/_LibraryEnricher.py +2 -3
- scitex/scholar/.legacy/metadata/enrichment/enrichers/_ImpactFactorEnricher.py +6 -12
- scitex/scholar/.legacy/metadata/enrichment/enrichers/_SmartEnricher.py +5 -10
- scitex/scholar/.legacy/metadata/enrichment/sources/_UnifiedMetadataSource.py +4 -5
- scitex/scholar/.legacy/metadata/query_to_full_meta_json.py +8 -12
- scitex/scholar/.legacy/metadata/urls/_URLMetadataHandler.py +3 -3
- scitex/scholar/.legacy/metadata/urls/_ZoteroTranslatorRunner.py +15 -21
- scitex/scholar/.legacy/metadata/urls/__init__.py +3 -3
- scitex/scholar/.legacy/metadata/urls/_finder.py +4 -6
- scitex/scholar/.legacy/metadata/urls/_handler.py +7 -15
- scitex/scholar/.legacy/metadata/urls/_resolver.py +6 -12
- scitex/scholar/.legacy/search/_Embedder.py +74 -69
- scitex/scholar/.legacy/search/_SemanticSearch.py +91 -90
- scitex/scholar/.legacy/search/_SemanticSearchEngine.py +104 -109
- scitex/scholar/.legacy/search/_UnifiedSearcher.py +530 -471
- scitex/scholar/.legacy/search/_VectorDatabase.py +111 -92
- scitex/scholar/.legacy/search/__init__.py +1 -0
- scitex/scholar/.legacy/storage/_EnhancedStorageManager.py +182 -154
- scitex/scholar/.legacy/storage/__init__.py +2 -1
- scitex/scholar/__init__.py +0 -2
- scitex/scholar/__main__.py +1 -3
- scitex/scholar/auth/ScholarAuthManager.py +13 -36
- scitex/scholar/auth/core/AuthenticationGateway.py +15 -29
- scitex/scholar/auth/core/BrowserAuthenticator.py +22 -57
- scitex/scholar/auth/core/StrategyResolver.py +10 -27
- scitex/scholar/auth/core/__init__.py +5 -1
- scitex/scholar/auth/gateway/_OpenURLLinkFinder.py +11 -21
- scitex/scholar/auth/gateway/_OpenURLResolver.py +10 -18
- scitex/scholar/auth/gateway/_resolve_functions.py +3 -3
- scitex/scholar/auth/providers/BaseAuthenticator.py +1 -0
- scitex/scholar/auth/providers/EZProxyAuthenticator.py +7 -14
- scitex/scholar/auth/providers/OpenAthensAuthenticator.py +29 -57
- scitex/scholar/auth/providers/ShibbolethAuthenticator.py +87 -73
- scitex/scholar/auth/session/AuthCacheManager.py +12 -22
- scitex/scholar/auth/session/SessionManager.py +4 -6
- scitex/scholar/auth/sso/BaseSSOAutomator.py +13 -19
- scitex/scholar/auth/sso/OpenAthensSSOAutomator.py +16 -45
- scitex/scholar/auth/sso/SSOAutomator.py +8 -15
- scitex/scholar/auth/sso/UniversityOfMelbourneSSOAutomator.py +13 -23
- scitex/scholar/browser/ScholarBrowserManager.py +31 -56
- scitex/scholar/browser/__init__.py +1 -0
- scitex/scholar/browser/utils/click_and_wait.py +3 -4
- scitex/scholar/browser/utils/close_unwanted_pages.py +4 -7
- scitex/scholar/browser/utils/wait_redirects.py +15 -40
- scitex/scholar/citation_graph/__init__.py +0 -0
- scitex/scholar/citation_graph/builder.py +3 -7
- scitex/scholar/citation_graph/database.py +4 -11
- scitex/scholar/citation_graph/example.py +5 -10
- scitex/scholar/citation_graph/models.py +0 -0
- scitex/scholar/cli/_url_utils.py +1 -1
- scitex/scholar/cli/chrome.py +5 -3
- scitex/scholar/cli/download_pdf.py +13 -14
- scitex/scholar/cli/handlers/bibtex_handler.py +4 -12
- scitex/scholar/cli/handlers/doi_handler.py +1 -3
- scitex/scholar/cli/handlers/project_handler.py +6 -20
- scitex/scholar/cli/open_browser.py +41 -39
- scitex/scholar/cli/open_browser_auto.py +31 -39
- scitex/scholar/cli/open_browser_monitored.py +27 -24
- scitex/scholar/config/ScholarConfig.py +5 -8
- scitex/scholar/config/__init__.py +1 -0
- scitex/scholar/config/core/_CascadeConfig.py +3 -3
- scitex/scholar/config/core/_PathManager.py +16 -28
- scitex/scholar/core/Paper.py +79 -78
- scitex/scholar/core/Papers.py +16 -27
- scitex/scholar/core/Scholar.py +98 -229
- scitex/scholar/core/journal_normalizer.py +52 -49
- scitex/scholar/core/oa_cache.py +27 -23
- scitex/scholar/core/open_access.py +17 -8
- scitex/scholar/docs/template.py +4 -3
- scitex/scholar/docs/to_claude/examples/example-python-project-scitex/scripts/mnist/clf_svm.py +0 -0
- scitex/scholar/docs/to_claude/examples/example-python-project-scitex/scripts/mnist/download.py +0 -0
- scitex/scholar/docs/to_claude/examples/example-python-project-scitex/scripts/mnist/plot_conf_mat.py +0 -0
- scitex/scholar/docs/to_claude/examples/example-python-project-scitex/scripts/mnist/plot_digits.py +0 -0
- scitex/scholar/docs/to_claude/examples/example-python-project-scitex/scripts/mnist/plot_umap_space.py +0 -0
- scitex/scholar/examples/00_config.py +10 -9
- scitex/scholar/examples/01_auth.py +3 -0
- scitex/scholar/examples/02_browser.py +14 -10
- scitex/scholar/examples/03_01-engine.py +3 -0
- scitex/scholar/examples/03_02-engine-for-bibtex.py +4 -3
- scitex/scholar/examples/04_01-url.py +9 -9
- scitex/scholar/examples/04_02-url-for-bibtex.py +7 -3
- scitex/scholar/examples/04_02-url-for-dois.py +87 -97
- scitex/scholar/examples/05_download_pdf.py +10 -4
- scitex/scholar/examples/06_find_and_download.py +6 -6
- scitex/scholar/examples/06_parse_bibtex.py +17 -17
- scitex/scholar/examples/07_storage_integration.py +6 -9
- scitex/scholar/examples/99_fullpipeline-for-bibtex.py +14 -15
- scitex/scholar/examples/99_fullpipeline-for-one-entry.py +31 -23
- scitex/scholar/examples/99_maintenance.py +3 -0
- scitex/scholar/examples/dev.py +2 -3
- scitex/scholar/examples/zotero_integration.py +11 -18
- scitex/scholar/impact_factor/ImpactFactorEngine.py +7 -9
- scitex/scholar/impact_factor/estimation/__init__.py +4 -4
- scitex/scholar/impact_factor/estimation/core/__init__.py +3 -7
- scitex/scholar/impact_factor/estimation/core/cache_manager.py +223 -211
- scitex/scholar/impact_factor/estimation/core/calculator.py +165 -131
- scitex/scholar/impact_factor/estimation/core/journal_matcher.py +217 -172
- scitex/scholar/impact_factor/jcr/ImpactFactorJCREngine.py +6 -14
- scitex/scholar/impact_factor/jcr/build_database.py +4 -3
- scitex/scholar/integration/base.py +9 -17
- scitex/scholar/integration/mendeley/exporter.py +2 -4
- scitex/scholar/integration/mendeley/importer.py +3 -3
- scitex/scholar/integration/mendeley/linker.py +3 -3
- scitex/scholar/integration/mendeley/mapper.py +9 -6
- scitex/scholar/integration/zotero/__main__.py +26 -43
- scitex/scholar/integration/zotero/exporter.py +15 -11
- scitex/scholar/integration/zotero/importer.py +12 -10
- scitex/scholar/integration/zotero/linker.py +8 -12
- scitex/scholar/integration/zotero/mapper.py +17 -12
- scitex/scholar/metadata_engines/.combined-SemanticScholarSource/_SemanticScholarSource.py +37 -35
- scitex/scholar/metadata_engines/.combined-SemanticScholarSource/_SemanticScholarSourceEnhanced.py +47 -35
- scitex/scholar/metadata_engines/ScholarEngine.py +21 -43
- scitex/scholar/metadata_engines/__init__.py +1 -0
- scitex/scholar/metadata_engines/individual/ArXivEngine.py +15 -37
- scitex/scholar/metadata_engines/individual/CrossRefEngine.py +15 -42
- scitex/scholar/metadata_engines/individual/CrossRefLocalEngine.py +24 -45
- scitex/scholar/metadata_engines/individual/OpenAlexEngine.py +11 -21
- scitex/scholar/metadata_engines/individual/PubMedEngine.py +10 -27
- scitex/scholar/metadata_engines/individual/SemanticScholarEngine.py +28 -35
- scitex/scholar/metadata_engines/individual/URLDOIEngine.py +11 -22
- scitex/scholar/metadata_engines/individual/_BaseDOIEngine.py +20 -49
- scitex/scholar/metadata_engines/utils/_PubMedConverter.py +4 -8
- scitex/scholar/metadata_engines/utils/_URLDOIExtractor.py +5 -10
- scitex/scholar/metadata_engines/utils/__init__.py +2 -0
- scitex/scholar/metadata_engines/utils/_metadata2bibtex.py +3 -0
- scitex/scholar/metadata_engines/utils/_standardize_metadata.py +2 -3
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +25 -37
- scitex/scholar/pdf_download/strategies/chrome_pdf_viewer.py +11 -19
- scitex/scholar/pdf_download/strategies/direct_download.py +5 -9
- scitex/scholar/pdf_download/strategies/manual_download_fallback.py +3 -3
- scitex/scholar/pdf_download/strategies/manual_download_utils.py +6 -13
- scitex/scholar/pdf_download/strategies/open_access_download.py +49 -31
- scitex/scholar/pdf_download/strategies/response_body.py +8 -19
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +9 -18
- scitex/scholar/pipelines/ScholarPipelineMetadataParallel.py +25 -26
- scitex/scholar/pipelines/ScholarPipelineMetadataSingle.py +62 -23
- scitex/scholar/pipelines/ScholarPipelineParallel.py +13 -30
- scitex/scholar/pipelines/ScholarPipelineSearchParallel.py +299 -220
- scitex/scholar/pipelines/ScholarPipelineSearchSingle.py +202 -165
- scitex/scholar/pipelines/ScholarPipelineSingle.py +25 -51
- scitex/scholar/pipelines/SearchQueryParser.py +55 -55
- scitex/scholar/search_engines/ScholarSearchEngine.py +31 -27
- scitex/scholar/search_engines/_BaseSearchEngine.py +20 -23
- scitex/scholar/search_engines/individual/ArXivSearchEngine.py +53 -35
- scitex/scholar/search_engines/individual/CrossRefSearchEngine.py +47 -40
- scitex/scholar/search_engines/individual/OpenAlexSearchEngine.py +55 -50
- scitex/scholar/search_engines/individual/PubMedSearchEngine.py +8 -10
- scitex/scholar/search_engines/individual/SemanticScholarSearchEngine.py +55 -49
- scitex/scholar/storage/BibTeXHandler.py +150 -95
- scitex/scholar/storage/PaperIO.py +3 -6
- scitex/scholar/storage/ScholarLibrary.py +70 -49
- scitex/scholar/storage/_DeduplicationManager.py +52 -25
- scitex/scholar/storage/_LibraryCacheManager.py +19 -46
- scitex/scholar/storage/_LibraryManager.py +65 -175
- scitex/scholar/url_finder/ScholarURLFinder.py +9 -25
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_direct_links.py +1 -1
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_href.py +6 -10
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_navigation.py +4 -6
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_publisher_patterns.py +8 -15
- scitex/scholar/url_finder/strategies/find_pdf_urls_by_zotero_translators.py +3 -3
- scitex/scholar/url_finder/strategies/find_supplementary_urls_by_href.py +3 -3
- scitex/scholar/url_finder/translators/core/patterns.py +6 -4
- scitex/scholar/url_finder/translators/core/registry.py +6 -9
- scitex/scholar/url_finder/translators/individual/BOFiP_Impots.py +60 -52
- scitex/scholar/url_finder/translators/individual/Baidu_Scholar.py +54 -62
- scitex/scholar/url_finder/translators/individual/Bangkok_Post.py +38 -44
- scitex/scholar/url_finder/translators/individual/Baruch_Foundation.py +43 -47
- scitex/scholar/url_finder/translators/individual/Beobachter.py +46 -50
- scitex/scholar/url_finder/translators/individual/Bezneng_Gajit.py +37 -41
- scitex/scholar/url_finder/translators/individual/BibLaTeX.py +59 -52
- scitex/scholar/url_finder/translators/individual/BibTeX.py +83 -79
- scitex/scholar/url_finder/translators/individual/Biblio_com.py +48 -51
- scitex/scholar/url_finder/translators/individual/Bibliontology_RDF.py +58 -56
- scitex/scholar/url_finder/translators/individual/Camara_Brasileira_do_Livro_ISBN.py +102 -99
- scitex/scholar/url_finder/translators/individual/CanLII.py +49 -43
- scitex/scholar/url_finder/translators/individual/Canada_com.py +36 -40
- scitex/scholar/url_finder/translators/individual/Canadian_Letters_and_Images.py +43 -43
- scitex/scholar/url_finder/translators/individual/Canadiana_ca.py +77 -66
- scitex/scholar/url_finder/translators/individual/Cascadilla_Proceedings_Project.py +68 -62
- scitex/scholar/url_finder/translators/individual/Central_and_Eastern_European_Online_Library_Journals.py +60 -60
- scitex/scholar/url_finder/translators/individual/Champlain_Society_Collection.py +63 -61
- scitex/scholar/url_finder/translators/individual/Chicago_Journal_of_Theoretical_Computer_Science.py +74 -58
- scitex/scholar/url_finder/translators/individual/Christian_Science_Monitor.py +32 -38
- scitex/scholar/url_finder/translators/individual/Columbia_University_Press.py +51 -47
- scitex/scholar/url_finder/translators/individual/Common_Place.py +66 -57
- scitex/scholar/url_finder/translators/individual/Cornell_LII.py +66 -62
- scitex/scholar/url_finder/translators/individual/Cornell_University_Press.py +38 -45
- scitex/scholar/url_finder/translators/individual/CourtListener.py +52 -56
- scitex/scholar/url_finder/translators/individual/DAI_Zenon.py +53 -54
- scitex/scholar/url_finder/translators/individual/access_medicine.py +27 -33
- scitex/scholar/url_finder/translators/individual/acm.py +1 -1
- scitex/scholar/url_finder/translators/individual/acm_digital_library.py +93 -63
- scitex/scholar/url_finder/translators/individual/airiti.py +3 -1
- scitex/scholar/url_finder/translators/individual/aosic.py +3 -1
- scitex/scholar/url_finder/translators/individual/archive_ouverte_aosic.py +3 -1
- scitex/scholar/url_finder/translators/individual/archive_ouverte_en_sciences_de_l_information_et_de_la_communication___aosic_.py +6 -2
- scitex/scholar/url_finder/translators/individual/artforum.py +35 -27
- scitex/scholar/url_finder/translators/individual/arxiv.py +1 -1
- scitex/scholar/url_finder/translators/individual/arxiv_org.py +8 -4
- scitex/scholar/url_finder/translators/individual/atlanta_journal_constitution.py +22 -18
- scitex/scholar/url_finder/translators/individual/atypon_journals.py +19 -11
- scitex/scholar/url_finder/translators/individual/austlii_and_nzlii.py +48 -44
- scitex/scholar/url_finder/translators/individual/australian_dictionary_of_biography.py +21 -17
- scitex/scholar/url_finder/translators/individual/bailii.py +22 -19
- scitex/scholar/url_finder/translators/individual/bbc.py +46 -42
- scitex/scholar/url_finder/translators/individual/bbc_genome.py +37 -25
- scitex/scholar/url_finder/translators/individual/biblioteca_nacional_de_maestros.py +24 -20
- scitex/scholar/url_finder/translators/individual/bibliotheque_archives_nationale_quebec_pistard.py +42 -43
- scitex/scholar/url_finder/translators/individual/bibliotheque_archives_nationales_quebec.py +87 -81
- scitex/scholar/url_finder/translators/individual/bibliotheque_nationale_france.py +39 -37
- scitex/scholar/url_finder/translators/individual/bibsys.py +32 -28
- scitex/scholar/url_finder/translators/individual/bioconductor.py +58 -52
- scitex/scholar/url_finder/translators/individual/biomed_central.py +23 -15
- scitex/scholar/url_finder/translators/individual/biorxiv.py +26 -13
- scitex/scholar/url_finder/translators/individual/blogger.py +39 -43
- scitex/scholar/url_finder/translators/individual/bloomberg.py +48 -52
- scitex/scholar/url_finder/translators/individual/bloomsbury_food_library.py +37 -37
- scitex/scholar/url_finder/translators/individual/bluesky.py +30 -28
- scitex/scholar/url_finder/translators/individual/bnf_isbn.py +1 -1
- scitex/scholar/url_finder/translators/individual/bocc.py +66 -60
- scitex/scholar/url_finder/translators/individual/boe.py +52 -52
- scitex/scholar/url_finder/translators/individual/brill.py +3 -1
- scitex/scholar/url_finder/translators/individual/business_standard.py +36 -38
- scitex/scholar/url_finder/translators/individual/cabi_cab_abstracts.py +39 -41
- scitex/scholar/url_finder/translators/individual/cambridge.py +3 -1
- scitex/scholar/url_finder/translators/individual/cambridge_core.py +30 -24
- scitex/scholar/url_finder/translators/individual/caod.py +50 -46
- scitex/scholar/url_finder/translators/individual/cbc.py +91 -67
- scitex/scholar/url_finder/translators/individual/ccfr_bnf.py +49 -53
- scitex/scholar/url_finder/translators/individual/cia_world_factbook.py +43 -33
- scitex/scholar/url_finder/translators/individual/crossref_rest.py +208 -174
- scitex/scholar/url_finder/translators/individual/current_affairs.py +29 -35
- scitex/scholar/url_finder/translators/individual/dabi.py +70 -66
- scitex/scholar/url_finder/translators/individual/dagens_nyheter.py +3 -1
- scitex/scholar/url_finder/translators/individual/dagstuhl.py +10 -15
- scitex/scholar/url_finder/translators/individual/dar_almandumah.py +13 -9
- scitex/scholar/url_finder/translators/individual/dart_europe.py +19 -22
- scitex/scholar/url_finder/translators/individual/data_gov.py +2 -2
- scitex/scholar/url_finder/translators/individual/databrary.py +27 -28
- scitex/scholar/url_finder/translators/individual/datacite_json.py +152 -137
- scitex/scholar/url_finder/translators/individual/dataverse.py +68 -64
- scitex/scholar/url_finder/translators/individual/daum_news.py +38 -38
- scitex/scholar/url_finder/translators/individual/dblp.py +4 -8
- scitex/scholar/url_finder/translators/individual/dblp_computer_science_bibliography.py +8 -3
- scitex/scholar/url_finder/translators/individual/dbpia.py +5 -3
- scitex/scholar/url_finder/translators/individual/defense_technical_information_center.py +30 -28
- scitex/scholar/url_finder/translators/individual/delpher.py +102 -79
- scitex/scholar/url_finder/translators/individual/demographic_research.py +35 -31
- scitex/scholar/url_finder/translators/individual/denik_cz.py +58 -54
- scitex/scholar/url_finder/translators/individual/depatisnet.py +7 -10
- scitex/scholar/url_finder/translators/individual/der_freitag.py +81 -66
- scitex/scholar/url_finder/translators/individual/der_spiegel.py +56 -54
- scitex/scholar/url_finder/translators/individual/digibib_net.py +3 -1
- scitex/scholar/url_finder/translators/individual/digizeitschriften.py +3 -1
- scitex/scholar/url_finder/translators/individual/dpla.py +13 -14
- scitex/scholar/url_finder/translators/individual/dspace.py +2 -2
- scitex/scholar/url_finder/translators/individual/ebrary.py +3 -1
- scitex/scholar/url_finder/translators/individual/ebscohost.py +3 -1
- scitex/scholar/url_finder/translators/individual/electronic_colloquium_on_computational_complexity.py +3 -1
- scitex/scholar/url_finder/translators/individual/elife.py +3 -1
- scitex/scholar/url_finder/translators/individual/elsevier_health_journals.py +3 -1
- scitex/scholar/url_finder/translators/individual/emerald.py +3 -1
- scitex/scholar/url_finder/translators/individual/emerald_insight.py +3 -1
- scitex/scholar/url_finder/translators/individual/epicurious.py +3 -1
- scitex/scholar/url_finder/translators/individual/eurogamerusgamer.py +3 -1
- scitex/scholar/url_finder/translators/individual/fachportal_padagogik.py +3 -1
- scitex/scholar/url_finder/translators/individual/frontiers.py +1 -1
- scitex/scholar/url_finder/translators/individual/gale_databases.py +3 -1
- scitex/scholar/url_finder/translators/individual/gms_german_medical_science.py +6 -2
- scitex/scholar/url_finder/translators/individual/ieee_computer_society.py +6 -2
- scitex/scholar/url_finder/translators/individual/ieee_xplore.py +41 -35
- scitex/scholar/url_finder/translators/individual/inter_research_science_center.py +6 -2
- scitex/scholar/url_finder/translators/individual/jisc_historical_texts.py +3 -1
- scitex/scholar/url_finder/translators/individual/jstor.py +14 -12
- scitex/scholar/url_finder/translators/individual/korean_national_library.py +3 -1
- scitex/scholar/url_finder/translators/individual/la_times.py +3 -1
- scitex/scholar/url_finder/translators/individual/landesbibliographie_baden_wurttemberg.py +3 -1
- scitex/scholar/url_finder/translators/individual/legislative_insight.py +3 -1
- scitex/scholar/url_finder/translators/individual/libraries_tasmania.py +3 -1
- scitex/scholar/url_finder/translators/individual/library_catalog__koha_.py +3 -1
- scitex/scholar/url_finder/translators/individual/lingbuzz.py +2 -2
- scitex/scholar/url_finder/translators/individual/max_planck_institute_for_the_history_of_science_virtual_laboratory_library.py +3 -1
- scitex/scholar/url_finder/translators/individual/mdpi.py +12 -6
- scitex/scholar/url_finder/translators/individual/microbiology_society_journals.py +3 -1
- scitex/scholar/url_finder/translators/individual/midas_journals.py +3 -1
- scitex/scholar/url_finder/translators/individual/nagoya_university_opac.py +3 -1
- scitex/scholar/url_finder/translators/individual/nature_publishing_group.py +32 -19
- scitex/scholar/url_finder/translators/individual/ntsb_accident_reports.py +3 -1
- scitex/scholar/url_finder/translators/individual/openedition_journals.py +8 -4
- scitex/scholar/url_finder/translators/individual/orcid.py +16 -15
- scitex/scholar/url_finder/translators/individual/oxford.py +25 -19
- scitex/scholar/url_finder/translators/individual/oxford_dictionaries_premium.py +3 -1
- scitex/scholar/url_finder/translators/individual/ozon_ru.py +3 -1
- scitex/scholar/url_finder/translators/individual/plos.py +9 -12
- scitex/scholar/url_finder/translators/individual/polygon.py +3 -1
- scitex/scholar/url_finder/translators/individual/primo.py +3 -1
- scitex/scholar/url_finder/translators/individual/project_muse.py +3 -1
- scitex/scholar/url_finder/translators/individual/pubfactory_journals.py +3 -1
- scitex/scholar/url_finder/translators/individual/pubmed.py +71 -65
- scitex/scholar/url_finder/translators/individual/pubmed_central.py +8 -6
- scitex/scholar/url_finder/translators/individual/rechtspraak_nl.py +3 -1
- scitex/scholar/url_finder/translators/individual/sage_journals.py +25 -17
- scitex/scholar/url_finder/translators/individual/sciencedirect.py +36 -17
- scitex/scholar/url_finder/translators/individual/semantics_visual_library.py +3 -1
- scitex/scholar/url_finder/translators/individual/silverchair.py +70 -52
- scitex/scholar/url_finder/translators/individual/sora.py +3 -1
- scitex/scholar/url_finder/translators/individual/springer.py +15 -11
- scitex/scholar/url_finder/translators/individual/ssrn.py +3 -3
- scitex/scholar/url_finder/translators/individual/stanford_encyclopedia_of_philosophy.py +3 -1
- scitex/scholar/url_finder/translators/individual/superlib.py +3 -1
- scitex/scholar/url_finder/translators/individual/treesearch.py +3 -1
- scitex/scholar/url_finder/translators/individual/university_of_chicago_press_books.py +3 -1
- scitex/scholar/url_finder/translators/individual/vlex.py +3 -1
- scitex/scholar/url_finder/translators/individual/web_of_science.py +3 -1
- scitex/scholar/url_finder/translators/individual/web_of_science_nextgen.py +3 -1
- scitex/scholar/url_finder/translators/individual/wiley.py +31 -25
- scitex/scholar/url_finder/translators/individual/wilson_center_digital_archive.py +3 -1
- scitex/scholar/utils/bibtex/_parse_bibtex.py +3 -3
- scitex/scholar/utils/cleanup/_cleanup_scholar_processes.py +5 -9
- scitex/scholar/utils/text/_TextNormalizer.py +249 -176
- scitex/scholar/utils/validation/DOIValidator.py +31 -28
- scitex/scholar/utils/validation/__init__.py +0 -0
- scitex/scholar/utils/validation/validate_library_dois.py +61 -57
- scitex/scholar/zotero/__init__.py +1 -1
- scitex/security/cli.py +7 -20
- scitex/security/github.py +45 -32
- scitex/session/__init__.py +8 -9
- scitex/session/_decorator.py +49 -42
- scitex/session/_lifecycle.py +39 -39
- scitex/session/_manager.py +24 -20
- scitex/sh/__init__.py +4 -3
- scitex/sh/_execute.py +10 -7
- scitex/sh/_security.py +3 -3
- scitex/sh/_types.py +2 -3
- scitex/stats/__init__.py +57 -6
- scitex/stats/_schema.py +42 -569
- scitex/stats/auto/__init__.py +188 -0
- scitex/stats/auto/_context.py +331 -0
- scitex/stats/auto/_formatting.py +679 -0
- scitex/stats/auto/_rules.py +901 -0
- scitex/stats/auto/_selector.py +554 -0
- scitex/stats/auto/_styles.py +721 -0
- scitex/stats/correct/__init__.py +4 -4
- scitex/stats/correct/_correct_bonferroni.py +43 -34
- scitex/stats/correct/_correct_fdr.py +14 -40
- scitex/stats/correct/_correct_fdr_.py +39 -46
- scitex/stats/correct/_correct_holm.py +14 -32
- scitex/stats/correct/_correct_sidak.py +36 -21
- scitex/stats/descriptive/_circular.py +20 -21
- scitex/stats/descriptive/_describe.py +19 -5
- scitex/stats/descriptive/_nan.py +5 -7
- scitex/stats/descriptive/_real.py +4 -3
- scitex/stats/effect_sizes/__init__.py +10 -11
- scitex/stats/effect_sizes/_cliffs_delta.py +35 -32
- scitex/stats/effect_sizes/_cohens_d.py +30 -31
- scitex/stats/effect_sizes/_epsilon_squared.py +19 -22
- scitex/stats/effect_sizes/_eta_squared.py +23 -27
- scitex/stats/effect_sizes/_prob_superiority.py +18 -21
- scitex/stats/posthoc/__init__.py +3 -3
- scitex/stats/posthoc/_dunnett.py +75 -55
- scitex/stats/posthoc/_games_howell.py +61 -43
- scitex/stats/posthoc/_tukey_hsd.py +42 -34
- scitex/stats/power/__init__.py +2 -2
- scitex/stats/power/_power.py +56 -56
- scitex/stats/tests/__init__.py +1 -1
- scitex/stats/tests/correlation/__init__.py +1 -1
- scitex/stats/tests/correlation/_test_pearson.py +28 -38
- scitex/stats/utils/__init__.py +14 -17
- scitex/stats/utils/_effect_size.py +85 -78
- scitex/stats/utils/_formatters.py +49 -43
- scitex/stats/utils/_normalizers.py +7 -14
- scitex/stats/utils/_power.py +56 -56
- scitex/str/__init__.py +1 -0
- scitex/str/_clean_path.py +3 -3
- scitex/str/_factor_out_digits.py +86 -58
- scitex/str/_format_plot_text.py +180 -111
- scitex/str/_latex.py +19 -19
- scitex/str/_latex_fallback.py +9 -10
- scitex/str/_parse.py +3 -6
- scitex/str/_print_debug.py +13 -13
- scitex/str/_printc.py +2 -0
- scitex/str/_search.py +3 -3
- scitex/template/.legacy/_clone_project.py +9 -13
- scitex/template/__init__.py +10 -2
- scitex/template/_clone_project.py +7 -2
- scitex/template/_copy.py +1 -0
- scitex/template/_customize.py +3 -6
- scitex/template/_git_strategy.py +2 -3
- scitex/template/_rename.py +1 -0
- scitex/template/clone_pip_project.py +6 -7
- scitex/template/clone_research.py +7 -10
- scitex/template/clone_singularity.py +6 -7
- scitex/template/clone_writer_directory.py +6 -7
- scitex/tex/_preview.py +26 -11
- scitex/tex/_to_vec.py +10 -7
- scitex/torch/__init__.py +11 -1
- scitex/types/_ArrayLike.py +2 -0
- scitex/types/_is_listed_X.py +3 -3
- scitex/units.py +110 -77
- scitex/utils/_compress_hdf5.py +3 -3
- scitex/utils/_email.py +8 -4
- scitex/utils/_notify.py +14 -8
- scitex/utils/_search.py +6 -6
- scitex/utils/_verify_scitex_format.py +17 -42
- scitex/utils/_verify_scitex_format_v01.py +12 -34
- scitex/utils/template.py +4 -3
- scitex/vis/__init__.py +0 -0
- scitex/vis/backend/__init__.py +3 -3
- scitex/vis/backend/{export.py → _export.py} +1 -1
- scitex/vis/backend/{parser.py → _parser.py} +1 -3
- scitex/vis/backend/{render.py → _render.py} +1 -1
- scitex/vis/canvas.py +15 -3
- scitex/vis/editor/__init__.py +0 -0
- scitex/vis/editor/_dearpygui_editor.py +450 -304
- scitex/vis/editor/_defaults.py +114 -123
- scitex/vis/editor/_edit.py +38 -26
- scitex/vis/editor/_flask_editor.py +8 -8
- scitex/vis/editor/_mpl_editor.py +63 -48
- scitex/vis/editor/_qt_editor.py +210 -159
- scitex/vis/editor/_tkinter_editor.py +146 -89
- scitex/vis/editor/flask_editor/__init__.py +10 -10
- scitex/vis/editor/flask_editor/_bbox.py +529 -0
- scitex/vis/editor/flask_editor/{core.py → _core.py} +45 -29
- scitex/vis/editor/flask_editor/_plotter.py +567 -0
- scitex/vis/editor/flask_editor/_renderer.py +393 -0
- scitex/vis/editor/flask_editor/{utils.py → _utils.py} +13 -14
- scitex/vis/editor/flask_editor/templates/__init__.py +5 -5
- scitex/vis/editor/flask_editor/templates/{html.py → _html.py} +234 -16
- scitex/vis/editor/flask_editor/templates/_scripts.py +1261 -0
- scitex/vis/editor/flask_editor/templates/{styles.py → _styles.py} +192 -2
- scitex/vis/io/__init__.py +5 -5
- scitex/vis/io/{canvas.py → _canvas.py} +8 -4
- scitex/vis/io/{data.py → _data.py} +13 -9
- scitex/vis/io/{directory.py → _directory.py} +7 -4
- scitex/vis/io/{export.py → _export.py} +15 -12
- scitex/vis/io/{load.py → _load.py} +1 -1
- scitex/vis/io/{panel.py → _panel.py} +21 -13
- scitex/vis/io/{save.py → _save.py} +0 -0
- scitex/vis/model/__init__.py +7 -7
- scitex/vis/model/{annotations.py → _annotations.py} +2 -4
- scitex/vis/model/{axes.py → _axes.py} +1 -1
- scitex/vis/model/{figure.py → _figure.py} +0 -0
- scitex/vis/model/{guides.py → _guides.py} +1 -1
- scitex/vis/model/{plot.py → _plot.py} +2 -4
- scitex/vis/model/{plot_types.py → _plot_types.py} +0 -0
- scitex/vis/model/{styles.py → _styles.py} +0 -0
- scitex/vis/utils/__init__.py +2 -2
- scitex/vis/utils/{defaults.py → _defaults.py} +1 -2
- scitex/vis/utils/{validate.py → _validate.py} +3 -9
- scitex/web/__init__.py +7 -1
- scitex/web/_scraping.py +54 -38
- scitex/web/_search_pubmed.py +30 -14
- scitex/writer/.legacy/Writer_v01-refactored.py +4 -4
- scitex/writer/.legacy/_compile.py +18 -28
- scitex/writer/Writer.py +8 -21
- scitex/writer/__init__.py +11 -11
- scitex/writer/_clone_writer_project.py +2 -6
- scitex/writer/_compile/__init__.py +1 -0
- scitex/writer/_compile/_parser.py +1 -0
- scitex/writer/_compile/_runner.py +35 -38
- scitex/writer/_compile/_validator.py +1 -0
- scitex/writer/_compile/manuscript.py +1 -0
- scitex/writer/_compile/revision.py +1 -0
- scitex/writer/_compile/supplementary.py +1 -0
- scitex/writer/_compile_async.py +5 -12
- scitex/writer/_project/__init__.py +1 -0
- scitex/writer/_project/_create.py +10 -25
- scitex/writer/_project/_trees.py +4 -9
- scitex/writer/_project/_validate.py +2 -3
- scitex/writer/_validate_tree_structures.py +7 -18
- scitex/writer/dataclasses/__init__.py +8 -10
- scitex/writer/dataclasses/config/_CONSTANTS.py +2 -3
- scitex/writer/dataclasses/config/_WriterConfig.py +4 -9
- scitex/writer/dataclasses/contents/_ManuscriptContents.py +14 -25
- scitex/writer/dataclasses/contents/_RevisionContents.py +21 -16
- scitex/writer/dataclasses/contents/_SupplementaryContents.py +21 -24
- scitex/writer/dataclasses/core/_Document.py +2 -3
- scitex/writer/dataclasses/core/_DocumentSection.py +8 -23
- scitex/writer/dataclasses/results/_CompilationResult.py +2 -3
- scitex/writer/dataclasses/results/_LaTeXIssue.py +3 -6
- scitex/writer/dataclasses/results/_SaveSectionsResponse.py +20 -9
- scitex/writer/dataclasses/results/_SectionReadResponse.py +24 -10
- scitex/writer/dataclasses/tree/_ConfigTree.py +7 -4
- scitex/writer/dataclasses/tree/_ManuscriptTree.py +10 -13
- scitex/writer/dataclasses/tree/_RevisionTree.py +16 -17
- scitex/writer/dataclasses/tree/_ScriptsTree.py +10 -5
- scitex/writer/dataclasses/tree/_SharedTree.py +10 -13
- scitex/writer/dataclasses/tree/_SupplementaryTree.py +15 -14
- scitex/writer/utils/.legacy_git_retry.py +3 -8
- scitex/writer/utils/_parse_latex_logs.py +2 -3
- scitex/writer/utils/_parse_script_args.py +20 -23
- scitex/writer/utils/_watch.py +5 -5
- {scitex-2.5.0.dist-info → scitex-2.7.0.dist-info}/METADATA +4 -10
- {scitex-2.5.0.dist-info → scitex-2.7.0.dist-info}/RECORD +1071 -975
- scitex/db/_sqlite3/_SQLite3Mixins/_ColumnMixin_v01-indentation-issues.py +0 -583
- scitex/plt/_subplots/_export_as_csv_formatters.py +0 -112
- scitex/vis/editor/flask_editor/bbox.py +0 -216
- scitex/vis/editor/flask_editor/plotter.py +0 -130
- scitex/vis/editor/flask_editor/renderer.py +0 -184
- scitex/vis/editor/flask_editor/templates/scripts.py +0 -614
- {scitex-2.5.0.dist-info → scitex-2.7.0.dist-info}/WHEEL +0 -0
- {scitex-2.5.0.dist-info → scitex-2.7.0.dist-info}/entry_points.txt +0 -0
- {scitex-2.5.0.dist-info → scitex-2.7.0.dist-info}/licenses/LICENSE +0 -0
Diff of ./src/scitex/scholar/_search_unified.py:
@@ -4,6 +4,7 @@
# File: ./src/scitex/scholar/_search_unified.py
# ----------------------------------------
Module header: a blank line is added after `import os`; the banner comment, the `__FILE__` / `__DIR__` assignments, and the separator comment are unchanged context.
@@ -38,19 +39,20 @@ logger = logging.getLogger(__name__)
`SearchEngine` base class: whitespace-only cleanup. The blank lines inside `__init__`, the `search_async` stub, and `_rate_limit_async` lose stray indentation, and a blank line is added after the local `import time`. The rate-limiting state (`self.rate_limit = 0.1` seconds between requests, `self._last_request = 0`) and the elapsed-time check are unchanged.
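The hunk above cuts off at the `elapsed < self.rate_limit` check, so the sleep-and-update tail is not shown. Below is a minimal self-contained sketch of that rate-limiting pattern; the `asyncio.sleep` call and the `_last_request` update are assumptions based on the surrounding code, not lines taken from the diff.

```python
import asyncio
import time


class RateLimitedClient:
    """Minimal sketch of the rate-limiting pattern used by the SearchEngine subclasses."""

    def __init__(self, rate_limit: float = 0.1):
        self.rate_limit = rate_limit   # seconds between requests
        self._last_request = 0.0

    async def _rate_limit_async(self) -> None:
        # Sleep just long enough to keep requests at least `rate_limit` seconds apart.
        elapsed = time.time() - self._last_request
        if elapsed < self.rate_limit:
            await asyncio.sleep(self.rate_limit - elapsed)
        self._last_request = time.time()


async def main() -> None:
    client = RateLimitedClient(rate_limit=0.5)
    for i in range(3):
        await client._rate_limit_async()
        print(f"request {i} at {time.time():.2f}")


if __name__ == "__main__":
    asyncio.run(main())
```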
@@ -60,75 +62,76 @@ class SearchEngine:
`SemanticScholarEngine.__init__` and `search_async`: string literals are normalized to double quotes and blank lines are cleaned up. On the new side the request is built as

    params = {
        "query": query,
        "limit": min(limit, 100),
        "fields": "title,authors,abstract,year,citationCount,journal,paperId,venue,fieldsOfStudy,isOpenAccess,url,tldr,doi,externalIds",
    }

A `CorpusId:` query is still routed through `_fetch_paper_by_id_async`, the `x-api-key` header is added when an API key is set, and `year_min` / `year_max` kwargs still populate `params["year"]`. The `session.get(f"{self.base_url}/paper/search", params=params, headers=headers)` call is collapsed onto one line; the HTTP 429 warning and the `SearchError(..., reason="Rate limit reached. Please wait 1-2 seconds between searches or get a free API key.",)` are re-wrapped with trailing commas; other non-200 responses are logged via `logger.debug(f"Semantic Scholar API returned {response.status}: {error_msg}")` and return an empty list so other sources can still be tried.
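For context, a standalone sketch of the same Semantic Scholar request the engine issues, assuming aiohttp is installed; the trimmed `fields` list and the helper name are illustrative, not the engine's exact values.

```python
import asyncio
from typing import Any, Dict, List, Optional

import aiohttp

API_URL = "https://api.semanticscholar.org/graph/v1/paper/search"


async def search_semantic_scholar(
    query: str, limit: int = 20, api_key: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Query the paper-search endpoint and return the raw result dicts."""
    params = {
        "query": query,
        "limit": str(min(limit, 100)),
        "fields": "title,authors,year,citationCount,externalIds",
    }
    headers = {"x-api-key": api_key} if api_key else {}
    async with aiohttp.ClientSession() as session:
        async with session.get(API_URL, params=params, headers=headers) as resp:
            if resp.status != 200:
                return []
            data = await resp.json()
            return data.get("data", [])


if __name__ == "__main__":
    print(asyncio.run(search_semantic_scholar("hippocampal ripples", limit=3)))
```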
@@ -136,23 +139,23 @@ class SemanticScholarEngine(SearchEngine):
`SemanticScholarEngine._fetch_paper_by_id_async` (request setup): double-quoted strings and blank-line cleanup only. The `x-api-key` header, the `f"{self.base_url}/paper/{paper_id}"` URL, and the `fields` parameter (title, authors, abstract, year, citationCount, journal, paperId, venue, fieldsOfStudy, isOpenAccess, url, tldr, externalIds) are unchanged in substance.
@@ -160,68 +163,73 @@ class SemanticScholarEngine(SearchEngine):
`_fetch_paper_by_id_async` (response handling) and `_parse_semantic_scholar_paper`: formatting normalization. The parser still collects author names from `data.get("authors", [])`, uses `data.get("url")` as the PDF URL when `isOpenAccess` is set, falls back from the `journal` entry to `venue`, and takes the DOI from `externalIds` when it is not present directly. On the new side the `Paper(...)` call reads the abstract as `data.get("abstract", "") or (data.get("tldr", {}) or {}).get("text", "")` and records this metadata block:

    metadata={
        "semantic_scholar_paper_id": data.get("paperId"),
        "fields_of_study": data.get("fieldsOfStudy", []),
        "is_open_access": data.get("isOpenAccess", False),
        "citation_count_source": "Semantic Scholar"
        if data.get("citationCount") is not None
        else None,
        "external_ids": data.get("externalIds", {}),
    },

Parse failures are still logged with `logger.warning(f"Failed to parse Semantic Scholar paper: {e}")` and return `None`.
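The abstract and DOI lookups above rely on a chained-fallback idiom worth isolating; this sketch reproduces just those two expressions from the diff so they can be tested on their own (function names are illustrative).

```python
from typing import Any, Dict, Optional


def extract_abstract(data: Dict[str, Any]) -> str:
    """Prefer the full abstract, fall back to the TLDR text; `tldr` itself may be None."""
    return data.get("abstract", "") or (data.get("tldr", {}) or {}).get("text", "")


def extract_doi(data: Dict[str, Any]) -> Optional[str]:
    """Use the top-level DOI if present, otherwise the one nested under externalIds."""
    return data.get("doi") or data.get("externalIds", {}).get("DOI")


if __name__ == "__main__":
    print(extract_abstract({"abstract": "", "tldr": {"text": "Short summary."}}))
    print(extract_doi({"externalIds": {"DOI": "10.1000/xyz123"}}))
```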
@@ -229,158 +237,165 @@ class SemanticScholarEngine(SearchEngine):
`PubMedEngine`: formatting normalization throughout. `search_async` builds `search_params` with `"db": "pubmed"`, `"term": query`, `"retmax": limit`, `"retmode": "json"`, `"email": self.email`, and `"sort": "relevance"` (relevance rather than date, to get diverse years); explicit `year_min` / `year_max` kwargs become `mindate` / `maxdate` with `datetype = "pdat"`, and when no range is given the search covers the last 20 years. The esearch call is collapsed onto one line, PMIDs are read from `esearchresult.idlist`, and the `_fetch_pubmed_details_async(session, pmids)` call is re-wrapped; error logging gains a blank line before `logger.error(traceback.format_exc())`. The `_fetch_pubmed_details_async` signature is re-wrapped across lines with its type hints (`self, session: aiohttp.ClientSession, pmids: List[str]) -> List[Paper]`), and `_parse_pubmed_xml` keeps its element lookups (`.//PubmedArticle`, `.//MedlineCitation`, `.//ArticleTitle`, `.//Author` with `LastName` / `ForeName`, `.//AbstractText`, `.//PubDate/Year`, `.//Journal/Title`, `.//PMID`, `.//ArticleId` where `IdType == "doi"`, `.//MeshHeading/DescriptorName`), with strings re-quoted and the author-name expression wrapped in parentheses.
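A compact sketch of the same two-step E-utilities flow (esearch for PMIDs, then efetch for article XML), assuming aiohttp is installed; the helper name, the trimmed parameter set, and the placeholder email are assumptions for illustration.

```python
import asyncio
import xml.etree.ElementTree as ET
from typing import List

import aiohttp

EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"


async def pubmed_titles(query: str, retmax: int = 5, email: str = "you@example.com") -> List[str]:
    """esearch returns PMIDs as JSON; efetch returns the article records as XML."""
    async with aiohttp.ClientSession() as session:
        search_params = {
            "db": "pubmed", "term": query, "retmax": str(retmax),
            "retmode": "json", "email": email, "sort": "relevance",
        }
        async with session.get(f"{EUTILS}/esearch.fcgi", params=search_params) as resp:
            data = await resp.json(content_type=None)
        pmids = data.get("esearchresult", {}).get("idlist", [])
        if not pmids:
            return []
        fetch_params = {"db": "pubmed", "id": ",".join(pmids), "retmode": "xml", "email": email}
        async with session.get(f"{EUTILS}/efetch.fcgi", params=fetch_params) as resp:
            xml_data = await resp.text()
    root = ET.fromstring(xml_data)
    return [elem.text or "" for elem in root.findall(".//ArticleTitle")]


if __name__ == "__main__":
    print(asyncio.run(pubmed_titles("sleep spindles")))
```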
@@ -390,43 +405,43 @@ class PubMedEngine(SearchEngine):
End of the PubMed parser (`source="pubmed",` gains a trailing comma) and `ArxivEngine.__init__` / start of `search_async`: whitespace cleanup; the arXiv query parameters are re-quoted as `"search_query": f"all:{query}"`, `"start": 0`, `"max_results": limit`, `"sortBy": "relevance"`, `"sortOrder": "descending"`.
@@ -435,61 +450,63 @@ class ArxivEngine(SearchEngine):
`ArxivEngine._parse_arxiv_xml`: formatting only. The Atom namespace is written as `ns = {"atom": "http://www.w3.org/2005/Atom"}`; title, authors, summary (now wrapped in parentheses), publication year, the arXiv ID taken from the entry URL via `id_elem.text.split("/")[-1]`, the derived `https://arxiv.org/pdf/{arxiv_id}.pdf` link, and the category terms used as keywords are unchanged in behavior.
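The namespace-qualified ElementTree lookups are the only subtle part of that parser; this standalone sketch shows the same pattern against the arXiv API using only the standard library (synchronous on purpose, and the helper name is illustrative).

```python
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from typing import List

ARXIV_API = "http://export.arxiv.org/api/query"
NS = {"atom": "http://www.w3.org/2005/Atom"}


def arxiv_titles(query: str, max_results: int = 5) -> List[str]:
    """Fetch an Atom feed and pull entry titles via the namespace map."""
    params = urllib.parse.urlencode({
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "relevance",
        "sortOrder": "descending",
    })
    with urllib.request.urlopen(f"{ARXIV_API}?{params}") as resp:
        xml_data = resp.read()
    root = ET.fromstring(xml_data)
    return [
        (entry.findtext("atom:title", default="", namespaces=NS) or "").strip()
        for entry in root.findall("atom:entry", NS)
    ]


if __name__ == "__main__":
    print(arxiv_titles("phase amplitude coupling"))
```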
@@ -498,146 +515,140 @@ class ArxivEngine(SearchEngine):
End of the arXiv parser (`source="arxiv",`) and `CrossRefEngine`: formatting normalization. `search_async` builds `"query": query`, `"rows": min(limit, 1000)` (CrossRef's maximum), `"sort": "relevance"`, and `"order": "desc"`; year bounds become `from-pub-date:` / `until-pub-date:` filters joined into `params["filter"]`; an API key is passed as `params["key"]`, and the User-Agent header is collapsed to `headers = {"User-Agent": f"SciTeX/1.0 (mailto:{self.email})"}`. The error paths are tightened: the non-200 log message is re-wrapped, and both the timeout and the generic-exception cases now raise on single lines, `SearchError(query=query, source="crossref", reason="Search timed out")` and `SearchError(query=query, source="crossref", reason=str(e))`. `_parse_crossref_response` still reads `message.items`, joins the title list (`" ".join(item.get("title", ["No title"]))`), builds author names from `given` / `family`, takes the year from `published-print` or `published-online` date-parts, the journal from `container-title`, the DOI from `item.get("DOI")`, `is-referenced-by-count` as the citation count, `subject` as keywords, and the item URL.
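The parameter and filter assembly is the part most worth seeing in isolation; this sketch reproduces it from the diff as a plain function (the function name is illustrative).

```python
from typing import Dict, List, Optional


def build_crossref_params(
    query: str,
    rows: int = 20,
    year_min: Optional[int] = None,
    year_max: Optional[int] = None,
    api_key: Optional[str] = None,
) -> Dict[str, str]:
    """Assemble /works query parameters the way the engine does."""
    params: Dict[str, str] = {
        "query": query,
        "rows": str(min(rows, 1000)),  # CrossRef caps rows at 1000
        "sort": "relevance",
        "order": "desc",
    }
    filters: List[str] = []
    if year_min is not None:
        filters.append(f"from-pub-date:{year_min}")
    if year_max is not None:
        filters.append(f"until-pub-date:{year_max}")
    if filters:
        params["filter"] = ",".join(filters)
    if api_key:
        params["key"] = api_key
    return params


if __name__ == "__main__":
    print(build_crossref_params("cross-frequency coupling", year_min=2015, year_max=2020))
```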
@@ -649,38 +660,39 @@ class CrossRefEngine(SearchEngine):
CrossRef `Paper` metadata and `GoogleScholarEngine` setup: on the new side the metadata block reads

    metadata={
        "citation_count_source": "CrossRef",
        "url": url,
        "publisher": item.get("publisher"),
        "issn": item.get("ISSN", []),
        "type": item.get("type"),
        "score": item.get("score"),
    },

`GoogleScholarEngine.__init__` (2.0 s rate limit, configurable timeout, lazily loaded `scholarly`) changes only in whitespace, and `_init_scholarly` gains a blank line after the `from scholarly import scholarly` import.
@@ -693,25 +705,25 @@ class GoogleScholarEngine(SearchEngine):
`GoogleScholarEngine.search_async` docstring and setup: whitespace-only cleanup; the scholarly initialization and the preliminary check for blocked access are unchanged.
@@ -723,27 +735,27 @@ class GoogleScholarEngine(SearchEngine):
Blocking detection and year filtering in `search_async`: the `SearchError` reason for blocked access ("Google Scholar is blocking automated access. Use PubMed or Semantic Scholar instead.") gains a trailing comma; `year_min` / `year_max` are re-quoted `kwargs.get(...)` lookups; the year qualifiers are reformatted with spaced arithmetic (`f" after:{year_min - 1} before:{year_max + 1}"`, and the single-bound variants); the executor-based `search_with_limit` setup is unchanged.
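The year-qualifier construction is easy to get off by one, so here it is as a small standalone function built from the expressions in the diff (the function name is illustrative). Both bounds are made exclusive by widening them one year in each direction.

```python
from typing import Optional


def with_year_qualifiers(
    query: str, year_min: Optional[int] = None, year_max: Optional[int] = None
) -> str:
    """Append Google Scholar style after:/before: qualifiers around an inclusive year range."""
    if year_min and year_max:
        return f"{query} after:{year_min - 1} before:{year_max + 1}"
    if year_min:
        return f"{query} after:{year_min - 1}"
    if year_max:
        return f"{query} before:{year_max + 1}"
    return query


if __name__ == "__main__":
    print(with_year_qualifiers("theta gamma coupling", 2018, 2022))
```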
@@ -757,222 +769,231 @@ class GoogleScholarEngine(SearchEngine):
Google Scholar result parsing, error handling, and `LocalSearchEngine`: formatting normalization. Result parsing still reads `result["bib"]` (title, authors split on `" and "`, optional abstract, `pub_year`, venue), `num_citations`, and `pub_url`, extracting a DOI when the URL contains `doi.org/`; the constructed `Paper` keeps the fallback text "Abstract not available from Google Scholar search" and metadata `"google_scholar_url"`, `"google_scholar_id": result.get("author_id", "")`, and `"eprint_url"`. The blocked-access check is re-wrapped as a multi-line condition over "robot", "captcha", and "Cannot Fetch", and the final `SearchError(query=query, source="google_scholar", reason=f"Search failed: {e}")` is collapsed onto one line. `LocalSearchEngine` (keyword scoring over a JSON index, `build_index` over `*.pdf` files with a `stats = {"files_indexed": 0, "errors": 0}` counter, the placeholder `_extract_pdf_metadata`, and `_save_index`) changes only in quoting, line wrapping, and trailing commas.
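The local engine's relevance score is just a term-count over concatenated title, abstract, and keywords; this sketch reproduces that scoring loop on a toy index (the function name and the demo entries are illustrative).

```python
from typing import Any, Dict, List, Tuple


def score_index(
    index: Dict[str, Dict[str, Any]], query: str, limit: int = 5
) -> List[Tuple[int, str]]:
    """Rank indexed papers by counting query-term occurrences in their searchable text."""
    terms = query.lower().split()
    scored: List[Tuple[int, str]] = []
    for entry in index.values():
        text = " ".join(
            [entry.get("title", ""), entry.get("abstract", ""), " ".join(entry.get("keywords", []))]
        ).lower()
        score = sum(text.count(term) for term in terms)
        if score > 0:
            scored.append((score, entry.get("title", "Unknown Title")))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[:limit]


if __name__ == "__main__":
    demo_index = {
        "a.pdf": {"title": "Sleep and memory", "abstract": "memory consolidation during sleep", "keywords": ["sleep"]},
        "b.pdf": {"title": "Gait analysis", "abstract": "walking kinematics", "keywords": []},
    }
    print(score_index(demo_index, "sleep memory"))
```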
@@ -980,92 +1001,94 @@ class VectorSearchEngine(SearchEngine):
`VectorSearchEngine`: formatting only. `_search_sync` still encodes the query, scores stored papers with `np.dot(self._embeddings, query_embedding)`, and returns the top `limit` papers from `np.argsort(similarities)[-limit:][::-1]`; `add_papers` encodes `f"{paper.title} {paper.abstract}"` texts and stacks them with `np.vstack`; `_load_model` lazily imports `sentence_transformers` (warning "sentence-transformers not installed. Vector search disabled." is re-wrapped); the pickle-based `_load_index` / `_save_index` pair is re-quoted, with the saved payload collapsed to `data = {"papers": self._papers, "embeddings": self._embeddings}`.
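The ranking step is just a dot product followed by an argsort; this sketch isolates it with random data so the indexing can be checked directly (the function name is illustrative).

```python
import numpy as np


def top_k_by_dot(embeddings: np.ndarray, query_embedding: np.ndarray, k: int = 3) -> np.ndarray:
    """Return indices of the k most similar rows, mirroring the engine's np.dot + argsort ranking."""
    similarities = embeddings @ query_embedding       # one score per stored paper
    return np.argsort(similarities)[-k:][::-1]        # highest scores first


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    stored = rng.normal(size=(10, 8))
    query = rng.normal(size=8)
    print(top_k_by_dot(stored, query, k=3))
```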
@@ -1073,27 +1096,38 @@ class UnifiedSearcher:
`UnifiedSearcher.__init__`: the signature is spread across lines as `(self, config=None, email: Optional[str] = None, semantic_scholar_api_key: Optional[str] = None, crossref_api_key: Optional[str] = None, google_scholar_timeout: int = 10)`. When a `ScholarConfig` instance is passed, parameters are resolved via `self.config.resolve("pubmed_email", email, "research@example.com")`, `self.config.resolve("semantic_scholar_api_key", semantic_scholar_api_key, None)`, and `self.config.resolve("crossref_api_key", crossref_api_key, None)`, while `google_scholar_timeout` is taken directly ("No config key for this yet"); without a config, the direct parameters are used as before, and engines remain lazy-loaded through the `engines` property.
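The `config.resolve(key, explicit, default)` calls follow an explicit-argument-first resolution pattern. The real `ScholarConfig.resolve` is not shown in this diff, so the class below is a hypothetical stand-in that illustrates one plausible precedence (explicit argument, then an environment variable, then the default); the environment-variable naming is an assumption.

```python
import os
from typing import Any, Optional


class MiniConfig:
    """Hypothetical stand-in for ScholarConfig.resolve, for illustration only."""

    def resolve(self, key: str, explicit: Optional[Any], default: Any) -> Any:
        # Explicit arguments win, then an environment override, then the default.
        if explicit is not None:
            return explicit
        env_value = os.environ.get(f"SCITEX_SCHOLAR_{key.upper()}")  # assumed naming scheme
        return env_value if env_value is not None else default


if __name__ == "__main__":
    cfg = MiniConfig()
    print(cfg.resolve("pubmed_email", None, "research@example.com"))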
@@ -1101,65 +1135,81 @@ class UnifiedSearcher:
`_get_engine` and `search_async`: the per-source engine construction is re-quoted and re-wrapped ("semantic_scholar", "pubmed", "arxiv", "google_scholar" with `timeout=self.google_scholar_timeout`, "crossref" with `api_key=self.crossref_api_key, email=self.email`, "local", "vector"); unknown sources still raise `ValueError`. The `search_async(self, query: str, sources: List[str] = None, limit: int = 20, deduplicate: bool = True, **kwargs) -> List[Paper]` signature is reflowed; it defaults to `sources = ["pubmed"]` when none are given, filters the request against the valid-source list ("pubmed", "semantic_scholar", "google_scholar", "crossref", "arxiv", "local", "vector"), warns and returns `[]` if nothing valid remains, and then searches the selected sources concurrently.
|
|
1165
1215
|
for source in sources:
|
|
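The rewritten `_get_engine` above is a lazy, per-source cache: each engine is constructed on first request and reused afterwards. A minimal sketch of the same pattern with placeholder engine classes (the class names here are stand-ins, not the real scitex engines):

    class _FakePubMed:
        def __init__(self, email):
            self.email = email

    class _FakeArxiv:
        pass

    class LazyEngines:
        def __init__(self, email="research@example.com"):
            self.email = email
            self._engines = {}  # source name -> engine instance, filled on demand

        def get(self, source: str):
            if source not in self._engines:
                if source == "pubmed":
                    self._engines[source] = _FakePubMed(self.email)
                elif source == "arxiv":
                    self._engines[source] = _FakeArxiv()
                else:
                    raise ValueError(f"Unknown source: {source}")
            return self._engines[source]

    engines = LazyEngines()
    assert engines.get("pubmed") is engines.get("pubmed")  # constructed once, then cached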
@@ -1169,10 +1219,10 @@ class UnifiedSearcher:
                 tasks.append(task)
             except Exception as e:
                 logger.debug(f"Failed to initialize {source} engine: {e}")
-
+
         logger.debug(f"Searching {len(tasks)} sources: {sources}")
         results = await asyncio.gather(*tasks, return_exceptions=True)
-
+
         # Merge results
         all_papers = []
         for source, result in zip(sources, results):
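The merge step above leans on `asyncio.gather(..., return_exceptions=True)`, so a failing engine is reported and skipped rather than aborting the whole search. A small self-contained sketch of that behaviour, with toy coroutines standing in for the engine searches:

    import asyncio

    async def ok_source():
        return ["paper-a", "paper-b"]

    async def broken_source():
        raise RuntimeError("engine unavailable")

    async def main():
        results = await asyncio.gather(ok_source(), broken_source(), return_exceptions=True)
        papers = []
        for source, result in zip(["ok", "broken"], results):
            if isinstance(result, Exception):
                print(f"{source} failed: {result}")  # logged and skipped, as in the diff
            else:
                papers.extend(result)
        return papers

    print(asyncio.run(main()))  # -> ['paper-a', 'paper-b']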
@@ -1181,26 +1231,26 @@ class UnifiedSearcher:
             else:
                 logger.debug(f"{source} returned {len(result)} papers")
                 all_papers.extend(result)
-
+
         # Deduplicate if requested
         if deduplicate:
             all_papers = self._deduplicate_papers(all_papers)
-
+
         # Sort by relevance (using citation count as proxy)
         all_papers.sort(key=lambda p: p.citation_count or 0, reverse=True)
-
+
         return all_papers[:limit]
-
+
     def _deduplicate_papers(self, papers: List[Paper]) -> List[Paper]:
         """Remove duplicate papers based on similarity."""
         if not papers:
             return []
-
+
         unique_papers = [papers[0]]
-
+
         for paper in papers[1:]:
             is_duplicate = False
-
+
             for unique_paper in unique_papers:
                 if paper.similarity_score(unique_paper) > 0.85:
                     is_duplicate = True
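`_deduplicate_papers` is a greedy pass: a paper is kept only if its similarity to every already-kept paper stays at or below 0.85 (the full method, partly outside this hunk, can also swap a kept entry for its duplicate, which the sketch omits). A sketch of the core idea with a toy string similarity in place of `Paper.similarity_score`:

    from difflib import SequenceMatcher
    from typing import List

    def similarity(a: str, b: str) -> float:
        # stand-in for Paper.similarity_score
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def deduplicate(titles: List[str], threshold: float = 0.85) -> List[str]:
        if not titles:
            return []
        unique = [titles[0]]
        for title in titles[1:]:
            if all(similarity(title, kept) <= threshold for kept in unique):
                unique.append(title)
        return unique

    print(deduplicate([
        "Deep learning for EEG analysis",
        "Deep Learning for EEG Analysis",   # near-duplicate, dropped
        "Sleep staging with transformers",
    ]))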
@@ -1209,86 +1259,95 @@ class UnifiedSearcher:
                     unique_papers.remove(unique_paper)
                     unique_papers.append(paper)
                     break
-
+
             if not is_duplicate:
                 unique_papers.append(paper)
-
+
         return unique_papers
-
+
     def build_local_index(self, pdf_dirs: List[Union[str, Path]]) -> Dict[str, Any]:
         """Build local search index."""
         pdf_dirs = [Path(d) for d in pdf_dirs]
-        return self.engines[
-
+        return self.engines["local"].build_index(pdf_dirs)
+
     def add_to_vector_index(self, papers: List[Paper]) -> None:
         """Add papers to vector search index."""
-        self.engines[
+        self.engines["vector"].add_papers(papers)


 # Convenience functions - get_scholar_dir moved to utils._paths


-async def search_async(
-
-
-
-
-
+async def search_async(
+    query: str,
+    sources: List[str] = None,
+    limit: int = 20,
+    email: Optional[str] = None,
+    semantic_scholar_api_key: Optional[str] = None,
+    **kwargs,
+) -> List[Paper]:
     """
     Async convenience function for searching papers.
     """
-    searcher = UnifiedSearcher(
+    searcher = UnifiedSearcher(
+        email=email, semantic_scholar_api_key=semantic_scholar_api_key
+    )
     return await searcher.search_async(query, sources, limit, **kwargs)


-def search_sync(
-
-
-
-
-
+def search_sync(
+    query: str,
+    sources: List[str] = None,
+    limit: int = 20,
+    email: Optional[str] = None,
+    semantic_scholar_api_key: Optional[str] = None,
+    **kwargs,
+) -> List[Paper]:
     """
     Synchronous convenience function for searching papers.
     """
-    return asyncio.run(
+    return asyncio.run(
+        search_async(query, sources, limit, email, semantic_scholar_api_key, **kwargs)
+    )


-def build_index(
-
+def build_index(
+    paths: List[Union[str, Path]], vector_index: bool = True
+) -> Dict[str, Any]:
     """
     Build local search indices.
-
+
     Args:
         paths: Directories containing PDFs
         vector_index: Also build vector similarity index
-
+
     Returns:
         Statistics about indexing
     """
     searcher = UnifiedSearcher()
     stats = searcher.build_local_index(paths)
-
+
     if vector_index:
         # Add papers to vector index
-        papers = searcher.engines[
+        papers = searcher.engines["local"]._search_sync("*", 9999, {})
         if papers:
             searcher.add_to_vector_index(papers)
-            stats[
-
+            stats["vector_indexed"] = len(papers)
+
     return stats


 # Export all classes and functions
 __all__ = [
-
-
-
-
-
-
-
-
-
-
-
-]
+    "SearchEngine",
+    "SemanticScholarEngine",
+    "PubMedEngine",
+    "ArxivEngine",
+    "LocalSearchEngine",
+    "VectorSearchEngine",
+    "UnifiedSearcher",
+    "get_scholar_dir",
+    "search",
+    "search_sync",
+    "build_index",
+]
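Taken together, the reworked convenience functions could be exercised roughly as follows (a hedged sketch: the query, paths, and printed attributes are placeholders, and `search_sync`/`build_index` are assumed to be imported from this module):

    # Assuming search_sync and build_index have been imported from this module.
    papers = search_sync(
        "phase-amplitude coupling",      # placeholder query
        sources=["pubmed", "arxiv"],     # must be members of valid_sources above
        limit=10,
    )
    for paper in papers:
        print(paper.citation_count or 0, getattr(paper, "title", ""))

    # Index a local PDF collection and add it to the vector index as well.
    stats = build_index(["./pdfs"], vector_index=True)
    print(stats.get("vector_indexed", 0))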