scitex 2.0.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +73 -0
- scitex/__main__.py +89 -0
- scitex/__version__.py +14 -0
- scitex/_sh.py +59 -0
- scitex/ai/_LearningCurveLogger.py +583 -0
- scitex/ai/__Classifiers.py +101 -0
- scitex/ai/__init__.py +55 -0
- scitex/ai/_gen_ai/_Anthropic.py +173 -0
- scitex/ai/_gen_ai/_BaseGenAI.py +336 -0
- scitex/ai/_gen_ai/_DeepSeek.py +175 -0
- scitex/ai/_gen_ai/_Google.py +161 -0
- scitex/ai/_gen_ai/_Groq.py +97 -0
- scitex/ai/_gen_ai/_Llama.py +142 -0
- scitex/ai/_gen_ai/_OpenAI.py +230 -0
- scitex/ai/_gen_ai/_PARAMS.py +565 -0
- scitex/ai/_gen_ai/_Perplexity.py +191 -0
- scitex/ai/_gen_ai/__init__.py +32 -0
- scitex/ai/_gen_ai/_calc_cost.py +78 -0
- scitex/ai/_gen_ai/_format_output_func.py +183 -0
- scitex/ai/_gen_ai/_genai_factory.py +71 -0
- scitex/ai/act/__init__.py +8 -0
- scitex/ai/act/_define.py +11 -0
- scitex/ai/classification/__init__.py +7 -0
- scitex/ai/classification/classification_reporter.py +1137 -0
- scitex/ai/classification/classifier_server.py +131 -0
- scitex/ai/classification/classifiers.py +101 -0
- scitex/ai/classification_reporter.py +1161 -0
- scitex/ai/classifier_server.py +131 -0
- scitex/ai/clustering/__init__.py +11 -0
- scitex/ai/clustering/_pca.py +115 -0
- scitex/ai/clustering/_umap.py +376 -0
- scitex/ai/early_stopping.py +149 -0
- scitex/ai/feature_extraction/__init__.py +56 -0
- scitex/ai/feature_extraction/vit.py +148 -0
- scitex/ai/genai/__init__.py +277 -0
- scitex/ai/genai/anthropic.py +177 -0
- scitex/ai/genai/anthropic_provider.py +320 -0
- scitex/ai/genai/anthropic_refactored.py +109 -0
- scitex/ai/genai/auth_manager.py +200 -0
- scitex/ai/genai/base_genai.py +336 -0
- scitex/ai/genai/base_provider.py +291 -0
- scitex/ai/genai/calc_cost.py +78 -0
- scitex/ai/genai/chat_history.py +307 -0
- scitex/ai/genai/cost_tracker.py +276 -0
- scitex/ai/genai/deepseek.py +188 -0
- scitex/ai/genai/deepseek_provider.py +251 -0
- scitex/ai/genai/format_output_func.py +183 -0
- scitex/ai/genai/genai_factory.py +71 -0
- scitex/ai/genai/google.py +169 -0
- scitex/ai/genai/google_provider.py +228 -0
- scitex/ai/genai/groq.py +104 -0
- scitex/ai/genai/groq_provider.py +248 -0
- scitex/ai/genai/image_processor.py +250 -0
- scitex/ai/genai/llama.py +155 -0
- scitex/ai/genai/llama_provider.py +214 -0
- scitex/ai/genai/mock_provider.py +127 -0
- scitex/ai/genai/model_registry.py +304 -0
- scitex/ai/genai/openai.py +230 -0
- scitex/ai/genai/openai_provider.py +293 -0
- scitex/ai/genai/params.py +565 -0
- scitex/ai/genai/perplexity.py +202 -0
- scitex/ai/genai/perplexity_provider.py +205 -0
- scitex/ai/genai/provider_base.py +302 -0
- scitex/ai/genai/provider_factory.py +370 -0
- scitex/ai/genai/response_handler.py +235 -0
- scitex/ai/layer/_Pass.py +21 -0
- scitex/ai/layer/__init__.py +10 -0
- scitex/ai/layer/_switch.py +8 -0
- scitex/ai/loss/_L1L2Losses.py +34 -0
- scitex/ai/loss/__init__.py +12 -0
- scitex/ai/loss/multi_task_loss.py +47 -0
- scitex/ai/metrics/__init__.py +9 -0
- scitex/ai/metrics/_bACC.py +51 -0
- scitex/ai/metrics/silhoute_score_block.py +496 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/__init__.py +0 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/__init__.py +3 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger.py +207 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger2020.py +238 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger913A.py +215 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/rangerqh.py +184 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/setup.py +24 -0
- scitex/ai/optim/__init__.py +13 -0
- scitex/ai/optim/_get_set.py +31 -0
- scitex/ai/optim/_optimizers.py +71 -0
- scitex/ai/plt/__init__.py +21 -0
- scitex/ai/plt/_conf_mat.py +592 -0
- scitex/ai/plt/_learning_curve.py +194 -0
- scitex/ai/plt/_optuna_study.py +111 -0
- scitex/ai/plt/aucs/__init__.py +2 -0
- scitex/ai/plt/aucs/example.py +60 -0
- scitex/ai/plt/aucs/pre_rec_auc.py +223 -0
- scitex/ai/plt/aucs/roc_auc.py +246 -0
- scitex/ai/sampling/undersample.py +29 -0
- scitex/ai/sk/__init__.py +11 -0
- scitex/ai/sk/_clf.py +58 -0
- scitex/ai/sk/_to_sktime.py +100 -0
- scitex/ai/sklearn/__init__.py +26 -0
- scitex/ai/sklearn/clf.py +58 -0
- scitex/ai/sklearn/to_sktime.py +100 -0
- scitex/ai/training/__init__.py +7 -0
- scitex/ai/training/early_stopping.py +150 -0
- scitex/ai/training/learning_curve_logger.py +555 -0
- scitex/ai/utils/__init__.py +22 -0
- scitex/ai/utils/_check_params.py +50 -0
- scitex/ai/utils/_default_dataset.py +46 -0
- scitex/ai/utils/_format_samples_for_sktime.py +26 -0
- scitex/ai/utils/_label_encoder.py +134 -0
- scitex/ai/utils/_merge_labels.py +22 -0
- scitex/ai/utils/_sliding_window_data_augmentation.py +11 -0
- scitex/ai/utils/_under_sample.py +51 -0
- scitex/ai/utils/_verify_n_gpus.py +16 -0
- scitex/ai/utils/grid_search.py +148 -0
- scitex/context/__init__.py +9 -0
- scitex/context/_suppress_output.py +38 -0
- scitex/db/_BaseMixins/_BaseBackupMixin.py +30 -0
- scitex/db/_BaseMixins/_BaseBatchMixin.py +31 -0
- scitex/db/_BaseMixins/_BaseBlobMixin.py +81 -0
- scitex/db/_BaseMixins/_BaseConnectionMixin.py +43 -0
- scitex/db/_BaseMixins/_BaseImportExportMixin.py +39 -0
- scitex/db/_BaseMixins/_BaseIndexMixin.py +29 -0
- scitex/db/_BaseMixins/_BaseMaintenanceMixin.py +33 -0
- scitex/db/_BaseMixins/_BaseQueryMixin.py +52 -0
- scitex/db/_BaseMixins/_BaseRowMixin.py +32 -0
- scitex/db/_BaseMixins/_BaseSchemaMixin.py +44 -0
- scitex/db/_BaseMixins/_BaseTableMixin.py +66 -0
- scitex/db/_BaseMixins/_BaseTransactionMixin.py +52 -0
- scitex/db/_BaseMixins/__init__.py +30 -0
- scitex/db/_PostgreSQL.py +126 -0
- scitex/db/_PostgreSQLMixins/_BackupMixin.py +166 -0
- scitex/db/_PostgreSQLMixins/_BatchMixin.py +82 -0
- scitex/db/_PostgreSQLMixins/_BlobMixin.py +231 -0
- scitex/db/_PostgreSQLMixins/_ConnectionMixin.py +92 -0
- scitex/db/_PostgreSQLMixins/_ImportExportMixin.py +59 -0
- scitex/db/_PostgreSQLMixins/_IndexMixin.py +64 -0
- scitex/db/_PostgreSQLMixins/_MaintenanceMixin.py +175 -0
- scitex/db/_PostgreSQLMixins/_QueryMixin.py +108 -0
- scitex/db/_PostgreSQLMixins/_RowMixin.py +75 -0
- scitex/db/_PostgreSQLMixins/_SchemaMixin.py +126 -0
- scitex/db/_PostgreSQLMixins/_TableMixin.py +176 -0
- scitex/db/_PostgreSQLMixins/_TransactionMixin.py +57 -0
- scitex/db/_PostgreSQLMixins/__init__.py +34 -0
- scitex/db/_SQLite3.py +2136 -0
- scitex/db/_SQLite3Mixins/_BatchMixin.py +243 -0
- scitex/db/_SQLite3Mixins/_BlobMixin.py +229 -0
- scitex/db/_SQLite3Mixins/_ConnectionMixin.py +108 -0
- scitex/db/_SQLite3Mixins/_ImportExportMixin.py +80 -0
- scitex/db/_SQLite3Mixins/_IndexMixin.py +32 -0
- scitex/db/_SQLite3Mixins/_MaintenanceMixin.py +176 -0
- scitex/db/_SQLite3Mixins/_QueryMixin.py +83 -0
- scitex/db/_SQLite3Mixins/_RowMixin.py +75 -0
- scitex/db/_SQLite3Mixins/_TableMixin.py +183 -0
- scitex/db/_SQLite3Mixins/_TransactionMixin.py +71 -0
- scitex/db/_SQLite3Mixins/__init__.py +30 -0
- scitex/db/__init__.py +14 -0
- scitex/db/_delete_duplicates.py +397 -0
- scitex/db/_inspect.py +163 -0
- scitex/decorators/__init__.py +54 -0
- scitex/decorators/_auto_order.py +172 -0
- scitex/decorators/_batch_fn.py +127 -0
- scitex/decorators/_cache_disk.py +32 -0
- scitex/decorators/_cache_mem.py +12 -0
- scitex/decorators/_combined.py +98 -0
- scitex/decorators/_converters.py +282 -0
- scitex/decorators/_deprecated.py +26 -0
- scitex/decorators/_not_implemented.py +30 -0
- scitex/decorators/_numpy_fn.py +86 -0
- scitex/decorators/_pandas_fn.py +121 -0
- scitex/decorators/_preserve_doc.py +19 -0
- scitex/decorators/_signal_fn.py +95 -0
- scitex/decorators/_timeout.py +55 -0
- scitex/decorators/_torch_fn.py +136 -0
- scitex/decorators/_wrap.py +39 -0
- scitex/decorators/_xarray_fn.py +88 -0
- scitex/dev/__init__.py +15 -0
- scitex/dev/_analyze_code_flow.py +284 -0
- scitex/dev/_reload.py +59 -0
- scitex/dict/_DotDict.py +442 -0
- scitex/dict/__init__.py +18 -0
- scitex/dict/_listed_dict.py +42 -0
- scitex/dict/_pop_keys.py +36 -0
- scitex/dict/_replace.py +13 -0
- scitex/dict/_safe_merge.py +62 -0
- scitex/dict/_to_str.py +32 -0
- scitex/dsp/__init__.py +72 -0
- scitex/dsp/_crop.py +122 -0
- scitex/dsp/_demo_sig.py +331 -0
- scitex/dsp/_detect_ripples.py +212 -0
- scitex/dsp/_ensure_3d.py +18 -0
- scitex/dsp/_hilbert.py +78 -0
- scitex/dsp/_listen.py +702 -0
- scitex/dsp/_misc.py +30 -0
- scitex/dsp/_mne.py +32 -0
- scitex/dsp/_modulation_index.py +79 -0
- scitex/dsp/_pac.py +319 -0
- scitex/dsp/_psd.py +102 -0
- scitex/dsp/_resample.py +65 -0
- scitex/dsp/_time.py +36 -0
- scitex/dsp/_transform.py +68 -0
- scitex/dsp/_wavelet.py +212 -0
- scitex/dsp/add_noise.py +111 -0
- scitex/dsp/example.py +253 -0
- scitex/dsp/filt.py +155 -0
- scitex/dsp/norm.py +18 -0
- scitex/dsp/params.py +51 -0
- scitex/dsp/reference.py +43 -0
- scitex/dsp/template.py +25 -0
- scitex/dsp/utils/__init__.py +15 -0
- scitex/dsp/utils/_differential_bandpass_filters.py +120 -0
- scitex/dsp/utils/_ensure_3d.py +18 -0
- scitex/dsp/utils/_ensure_even_len.py +10 -0
- scitex/dsp/utils/_zero_pad.py +48 -0
- scitex/dsp/utils/filter.py +408 -0
- scitex/dsp/utils/pac.py +177 -0
- scitex/dt/__init__.py +8 -0
- scitex/dt/_linspace.py +130 -0
- scitex/etc/__init__.py +15 -0
- scitex/etc/wait_key.py +34 -0
- scitex/gen/_DimHandler.py +196 -0
- scitex/gen/_TimeStamper.py +244 -0
- scitex/gen/__init__.py +95 -0
- scitex/gen/_alternate_kwarg.py +13 -0
- scitex/gen/_cache.py +11 -0
- scitex/gen/_check_host.py +34 -0
- scitex/gen/_ci.py +12 -0
- scitex/gen/_close.py +222 -0
- scitex/gen/_embed.py +78 -0
- scitex/gen/_inspect_module.py +257 -0
- scitex/gen/_is_ipython.py +12 -0
- scitex/gen/_less.py +48 -0
- scitex/gen/_list_packages.py +139 -0
- scitex/gen/_mat2py.py +88 -0
- scitex/gen/_norm.py +170 -0
- scitex/gen/_paste.py +18 -0
- scitex/gen/_print_config.py +84 -0
- scitex/gen/_shell.py +48 -0
- scitex/gen/_src.py +111 -0
- scitex/gen/_start.py +451 -0
- scitex/gen/_symlink.py +55 -0
- scitex/gen/_symlog.py +27 -0
- scitex/gen/_tee.py +238 -0
- scitex/gen/_title2path.py +60 -0
- scitex/gen/_title_case.py +88 -0
- scitex/gen/_to_even.py +84 -0
- scitex/gen/_to_odd.py +34 -0
- scitex/gen/_to_rank.py +39 -0
- scitex/gen/_transpose.py +37 -0
- scitex/gen/_type.py +78 -0
- scitex/gen/_var_info.py +73 -0
- scitex/gen/_wrap.py +17 -0
- scitex/gen/_xml2dict.py +76 -0
- scitex/gen/misc.py +730 -0
- scitex/gen/path.py +0 -0
- scitex/general/__init__.py +5 -0
- scitex/gists/_SigMacro_processFigure_S.py +128 -0
- scitex/gists/_SigMacro_toBlue.py +172 -0
- scitex/gists/__init__.py +12 -0
- scitex/io/_H5Explorer.py +292 -0
- scitex/io/__init__.py +82 -0
- scitex/io/_cache.py +101 -0
- scitex/io/_flush.py +24 -0
- scitex/io/_glob.py +103 -0
- scitex/io/_json2md.py +113 -0
- scitex/io/_load.py +168 -0
- scitex/io/_load_configs.py +146 -0
- scitex/io/_load_modules/__init__.py +38 -0
- scitex/io/_load_modules/_catboost.py +66 -0
- scitex/io/_load_modules/_con.py +20 -0
- scitex/io/_load_modules/_db.py +24 -0
- scitex/io/_load_modules/_docx.py +42 -0
- scitex/io/_load_modules/_eeg.py +110 -0
- scitex/io/_load_modules/_hdf5.py +196 -0
- scitex/io/_load_modules/_image.py +19 -0
- scitex/io/_load_modules/_joblib.py +19 -0
- scitex/io/_load_modules/_json.py +18 -0
- scitex/io/_load_modules/_markdown.py +103 -0
- scitex/io/_load_modules/_matlab.py +37 -0
- scitex/io/_load_modules/_numpy.py +39 -0
- scitex/io/_load_modules/_optuna.py +155 -0
- scitex/io/_load_modules/_pandas.py +69 -0
- scitex/io/_load_modules/_pdf.py +31 -0
- scitex/io/_load_modules/_pickle.py +24 -0
- scitex/io/_load_modules/_torch.py +16 -0
- scitex/io/_load_modules/_txt.py +126 -0
- scitex/io/_load_modules/_xml.py +49 -0
- scitex/io/_load_modules/_yaml.py +23 -0
- scitex/io/_mv_to_tmp.py +19 -0
- scitex/io/_path.py +286 -0
- scitex/io/_reload.py +78 -0
- scitex/io/_save.py +539 -0
- scitex/io/_save_modules/__init__.py +66 -0
- scitex/io/_save_modules/_catboost.py +22 -0
- scitex/io/_save_modules/_csv.py +89 -0
- scitex/io/_save_modules/_excel.py +49 -0
- scitex/io/_save_modules/_hdf5.py +249 -0
- scitex/io/_save_modules/_html.py +48 -0
- scitex/io/_save_modules/_image.py +140 -0
- scitex/io/_save_modules/_joblib.py +25 -0
- scitex/io/_save_modules/_json.py +25 -0
- scitex/io/_save_modules/_listed_dfs_as_csv.py +57 -0
- scitex/io/_save_modules/_listed_scalars_as_csv.py +42 -0
- scitex/io/_save_modules/_matlab.py +24 -0
- scitex/io/_save_modules/_mp4.py +29 -0
- scitex/io/_save_modules/_numpy.py +57 -0
- scitex/io/_save_modules/_optuna_study_as_csv_and_pngs.py +38 -0
- scitex/io/_save_modules/_pickle.py +45 -0
- scitex/io/_save_modules/_plotly.py +27 -0
- scitex/io/_save_modules/_text.py +23 -0
- scitex/io/_save_modules/_torch.py +26 -0
- scitex/io/_save_modules/_yaml.py +29 -0
- scitex/life/__init__.py +10 -0
- scitex/life/_monitor_rain.py +49 -0
- scitex/linalg/__init__.py +17 -0
- scitex/linalg/_distance.py +63 -0
- scitex/linalg/_geometric_median.py +64 -0
- scitex/linalg/_misc.py +73 -0
- scitex/nn/_AxiswiseDropout.py +27 -0
- scitex/nn/_BNet.py +126 -0
- scitex/nn/_BNet_Res.py +164 -0
- scitex/nn/_ChannelGainChanger.py +44 -0
- scitex/nn/_DropoutChannels.py +50 -0
- scitex/nn/_Filters.py +489 -0
- scitex/nn/_FreqGainChanger.py +110 -0
- scitex/nn/_GaussianFilter.py +48 -0
- scitex/nn/_Hilbert.py +111 -0
- scitex/nn/_MNet_1000.py +157 -0
- scitex/nn/_ModulationIndex.py +221 -0
- scitex/nn/_PAC.py +414 -0
- scitex/nn/_PSD.py +40 -0
- scitex/nn/_ResNet1D.py +120 -0
- scitex/nn/_SpatialAttention.py +25 -0
- scitex/nn/_Spectrogram.py +161 -0
- scitex/nn/_SwapChannels.py +50 -0
- scitex/nn/_TransposeLayer.py +19 -0
- scitex/nn/_Wavelet.py +183 -0
- scitex/nn/__init__.py +63 -0
- scitex/os/__init__.py +8 -0
- scitex/os/_mv.py +50 -0
- scitex/parallel/__init__.py +8 -0
- scitex/parallel/_run.py +151 -0
- scitex/path/__init__.py +33 -0
- scitex/path/_clean.py +52 -0
- scitex/path/_find.py +108 -0
- scitex/path/_get_module_path.py +51 -0
- scitex/path/_get_spath.py +35 -0
- scitex/path/_getsize.py +18 -0
- scitex/path/_increment_version.py +87 -0
- scitex/path/_mk_spath.py +51 -0
- scitex/path/_path.py +19 -0
- scitex/path/_split.py +23 -0
- scitex/path/_this_path.py +19 -0
- scitex/path/_version.py +101 -0
- scitex/pd/__init__.py +41 -0
- scitex/pd/_find_indi.py +126 -0
- scitex/pd/_find_pval.py +113 -0
- scitex/pd/_force_df.py +154 -0
- scitex/pd/_from_xyz.py +71 -0
- scitex/pd/_ignore_SettingWithCopyWarning.py +34 -0
- scitex/pd/_melt_cols.py +81 -0
- scitex/pd/_merge_columns.py +221 -0
- scitex/pd/_mv.py +63 -0
- scitex/pd/_replace.py +62 -0
- scitex/pd/_round.py +93 -0
- scitex/pd/_slice.py +63 -0
- scitex/pd/_sort.py +91 -0
- scitex/pd/_to_numeric.py +53 -0
- scitex/pd/_to_xy.py +59 -0
- scitex/pd/_to_xyz.py +110 -0
- scitex/plt/__init__.py +36 -0
- scitex/plt/_subplots/_AxesWrapper.py +182 -0
- scitex/plt/_subplots/_AxisWrapper.py +249 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +414 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +896 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +368 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_TrackingMixin.py +185 -0
- scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +16 -0
- scitex/plt/_subplots/_FigWrapper.py +226 -0
- scitex/plt/_subplots/_SubplotsWrapper.py +171 -0
- scitex/plt/_subplots/__init__.py +111 -0
- scitex/plt/_subplots/_export_as_csv.py +232 -0
- scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +61 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_bar.py +90 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_barh.py +49 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_boxplot.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_contour.py +39 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_errorbar.py +125 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +72 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill.py +34 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill_between.py +36 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hist.py +79 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow.py +59 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +32 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +79 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_box.py +75 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_conf_mat.py +64 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_ecdf.py +44 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_fillv.py +70 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_heatmap.py +66 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_image.py +95 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_joyplot.py +67 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +52 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_line.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_ci.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_std.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_median_iqr.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_raster.py +44 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_rectangle.py +103 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter_hist.py +82 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_shaded_line.py +58 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_violin.py +117 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_scatter.py +30 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_barplot.py +51 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_boxplot.py +93 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_heatmap.py +94 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_histplot.py +92 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +65 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_kdeplot.py +59 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +58 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +45 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_scatterplot.py +70 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_stripplot.py +75 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_swarmplot.py +75 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_violinplot.py +155 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violin.py +64 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violinplot.py +77 -0
- scitex/plt/_subplots/_export_as_csv_formatters/test_formatters.py +210 -0
- scitex/plt/_subplots/_export_as_csv_formatters/verify_formatters.py +342 -0
- scitex/plt/_subplots/_export_as_csv_formatters.py +115 -0
- scitex/plt/_tpl.py +28 -0
- scitex/plt/ax/__init__.py +114 -0
- scitex/plt/ax/_plot/__init__.py +53 -0
- scitex/plt/ax/_plot/_plot_circular_hist.py +124 -0
- scitex/plt/ax/_plot/_plot_conf_mat.py +136 -0
- scitex/plt/ax/_plot/_plot_cube.py +57 -0
- scitex/plt/ax/_plot/_plot_ecdf.py +84 -0
- scitex/plt/ax/_plot/_plot_fillv.py +55 -0
- scitex/plt/ax/_plot/_plot_heatmap.py +266 -0
- scitex/plt/ax/_plot/_plot_image.py +94 -0
- scitex/plt/ax/_plot/_plot_joyplot.py +76 -0
- scitex/plt/ax/_plot/_plot_raster.py +172 -0
- scitex/plt/ax/_plot/_plot_rectangle.py +69 -0
- scitex/plt/ax/_plot/_plot_scatter_hist.py +133 -0
- scitex/plt/ax/_plot/_plot_shaded_line.py +142 -0
- scitex/plt/ax/_plot/_plot_statistical_shaded_line.py +221 -0
- scitex/plt/ax/_plot/_plot_violin.py +343 -0
- scitex/plt/ax/_style/__init__.py +38 -0
- scitex/plt/ax/_style/_add_marginal_ax.py +44 -0
- scitex/plt/ax/_style/_add_panel.py +92 -0
- scitex/plt/ax/_style/_extend.py +64 -0
- scitex/plt/ax/_style/_force_aspect.py +37 -0
- scitex/plt/ax/_style/_format_label.py +23 -0
- scitex/plt/ax/_style/_hide_spines.py +84 -0
- scitex/plt/ax/_style/_map_ticks.py +182 -0
- scitex/plt/ax/_style/_rotate_labels.py +215 -0
- scitex/plt/ax/_style/_sci_note.py +279 -0
- scitex/plt/ax/_style/_set_log_scale.py +299 -0
- scitex/plt/ax/_style/_set_meta.py +261 -0
- scitex/plt/ax/_style/_set_n_ticks.py +37 -0
- scitex/plt/ax/_style/_set_size.py +16 -0
- scitex/plt/ax/_style/_set_supxyt.py +116 -0
- scitex/plt/ax/_style/_set_ticks.py +276 -0
- scitex/plt/ax/_style/_set_xyt.py +121 -0
- scitex/plt/ax/_style/_share_axes.py +264 -0
- scitex/plt/ax/_style/_shift.py +139 -0
- scitex/plt/ax/_style/_show_spines.py +333 -0
- scitex/plt/color/_PARAMS.py +70 -0
- scitex/plt/color/__init__.py +52 -0
- scitex/plt/color/_add_hue_col.py +41 -0
- scitex/plt/color/_colors.py +205 -0
- scitex/plt/color/_get_colors_from_cmap.py +134 -0
- scitex/plt/color/_interpolate.py +29 -0
- scitex/plt/color/_vizualize_colors.py +54 -0
- scitex/plt/utils/__init__.py +44 -0
- scitex/plt/utils/_calc_bacc_from_conf_mat.py +46 -0
- scitex/plt/utils/_calc_nice_ticks.py +101 -0
- scitex/plt/utils/_close.py +68 -0
- scitex/plt/utils/_colorbar.py +96 -0
- scitex/plt/utils/_configure_mpl.py +295 -0
- scitex/plt/utils/_histogram_utils.py +132 -0
- scitex/plt/utils/_im2grid.py +70 -0
- scitex/plt/utils/_is_valid_axis.py +78 -0
- scitex/plt/utils/_mk_colorbar.py +65 -0
- scitex/plt/utils/_mk_patches.py +26 -0
- scitex/plt/utils/_scientific_captions.py +638 -0
- scitex/plt/utils/_scitex_config.py +223 -0
- scitex/reproduce/__init__.py +14 -0
- scitex/reproduce/_fix_seeds.py +45 -0
- scitex/reproduce/_gen_ID.py +55 -0
- scitex/reproduce/_gen_timestamp.py +35 -0
- scitex/res/__init__.py +5 -0
- scitex/resource/__init__.py +13 -0
- scitex/resource/_get_processor_usages.py +281 -0
- scitex/resource/_get_specs.py +280 -0
- scitex/resource/_log_processor_usages.py +190 -0
- scitex/resource/_utils/__init__.py +31 -0
- scitex/resource/_utils/_get_env_info.py +481 -0
- scitex/resource/limit_ram.py +33 -0
- scitex/scholar/__init__.py +24 -0
- scitex/scholar/_local_search.py +454 -0
- scitex/scholar/_paper.py +244 -0
- scitex/scholar/_pdf_downloader.py +325 -0
- scitex/scholar/_search.py +393 -0
- scitex/scholar/_vector_search.py +370 -0
- scitex/scholar/_web_sources.py +457 -0
- scitex/stats/__init__.py +31 -0
- scitex/stats/_calc_partial_corr.py +17 -0
- scitex/stats/_corr_test_multi.py +94 -0
- scitex/stats/_corr_test_wrapper.py +115 -0
- scitex/stats/_describe_wrapper.py +90 -0
- scitex/stats/_multiple_corrections.py +63 -0
- scitex/stats/_nan_stats.py +93 -0
- scitex/stats/_p2stars.py +116 -0
- scitex/stats/_p2stars_wrapper.py +56 -0
- scitex/stats/_statistical_tests.py +73 -0
- scitex/stats/desc/__init__.py +40 -0
- scitex/stats/desc/_describe.py +189 -0
- scitex/stats/desc/_nan.py +289 -0
- scitex/stats/desc/_real.py +94 -0
- scitex/stats/multiple/__init__.py +14 -0
- scitex/stats/multiple/_bonferroni_correction.py +72 -0
- scitex/stats/multiple/_fdr_correction.py +400 -0
- scitex/stats/multiple/_multicompair.py +28 -0
- scitex/stats/tests/__corr_test.py +277 -0
- scitex/stats/tests/__corr_test_multi.py +343 -0
- scitex/stats/tests/__corr_test_single.py +277 -0
- scitex/stats/tests/__init__.py +22 -0
- scitex/stats/tests/_brunner_munzel_test.py +192 -0
- scitex/stats/tests/_nocorrelation_test.py +28 -0
- scitex/stats/tests/_smirnov_grubbs.py +98 -0
- scitex/str/__init__.py +113 -0
- scitex/str/_clean_path.py +75 -0
- scitex/str/_color_text.py +52 -0
- scitex/str/_decapitalize.py +58 -0
- scitex/str/_factor_out_digits.py +281 -0
- scitex/str/_format_plot_text.py +498 -0
- scitex/str/_grep.py +48 -0
- scitex/str/_latex.py +155 -0
- scitex/str/_latex_fallback.py +471 -0
- scitex/str/_mask_api.py +39 -0
- scitex/str/_mask_api_key.py +8 -0
- scitex/str/_parse.py +158 -0
- scitex/str/_print_block.py +47 -0
- scitex/str/_print_debug.py +68 -0
- scitex/str/_printc.py +62 -0
- scitex/str/_readable_bytes.py +38 -0
- scitex/str/_remove_ansi.py +23 -0
- scitex/str/_replace.py +134 -0
- scitex/str/_search.py +125 -0
- scitex/str/_squeeze_space.py +36 -0
- scitex/tex/__init__.py +10 -0
- scitex/tex/_preview.py +103 -0
- scitex/tex/_to_vec.py +116 -0
- scitex/torch/__init__.py +18 -0
- scitex/torch/_apply_to.py +34 -0
- scitex/torch/_nan_funcs.py +77 -0
- scitex/types/_ArrayLike.py +44 -0
- scitex/types/_ColorLike.py +21 -0
- scitex/types/__init__.py +14 -0
- scitex/types/_is_listed_X.py +70 -0
- scitex/utils/__init__.py +22 -0
- scitex/utils/_compress_hdf5.py +116 -0
- scitex/utils/_email.py +120 -0
- scitex/utils/_grid.py +148 -0
- scitex/utils/_notify.py +247 -0
- scitex/utils/_search.py +121 -0
- scitex/web/__init__.py +38 -0
- scitex/web/_search_pubmed.py +438 -0
- scitex/web/_summarize_url.py +158 -0
- scitex-2.0.0.dist-info/METADATA +307 -0
- scitex-2.0.0.dist-info/RECORD +572 -0
- scitex-2.0.0.dist-info/WHEEL +6 -0
- scitex-2.0.0.dist-info/licenses/LICENSE +7 -0
- scitex-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Time-stamp: "2024-12-06 10:20:00"
|
|
4
|
+
# Author: Claude
|
|
5
|
+
# Filename: _pdf_downloader.py
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
PDF downloader for scientific papers.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import aiohttp
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional, Dict, Any, List
|
|
15
|
+
import logging
|
|
16
|
+
import re
|
|
17
|
+
from urllib.parse import urlparse, quote
|
|
18
|
+
|
|
19
|
+
from ._paper import Paper
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PDFDownloader:
|
|
26
|
+
"""Download PDFs for scientific papers."""
|
|
27
|
+
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
download_dir: Optional[Path] = None,
|
|
31
|
+
timeout: int = 30,
|
|
32
|
+
max_concurrent: int = 3,
|
|
33
|
+
):
|
|
34
|
+
"""Initialize PDF downloader.
|
|
35
|
+
|
|
36
|
+
Parameters
|
|
37
|
+
----------
|
|
38
|
+
download_dir : Path, optional
|
|
39
|
+
Directory to save PDFs (default: current directory)
|
|
40
|
+
timeout : int
|
|
41
|
+
Download timeout in seconds
|
|
42
|
+
max_concurrent : int
|
|
43
|
+
Maximum concurrent downloads
|
|
44
|
+
"""
|
|
45
|
+
self.download_dir = Path(download_dir) if download_dir else Path.cwd()
|
|
46
|
+
self.download_dir.mkdir(parents=True, exist_ok=True)
|
|
47
|
+
self.timeout = timeout
|
|
48
|
+
self.max_concurrent = max_concurrent
|
|
49
|
+
|
|
50
|
+
# Headers for requests
|
|
51
|
+
self.headers = {
|
|
52
|
+
"User-Agent": "Mozilla/5.0 (compatible; SciTeX Scholar/1.0; +https://github.com/ywatanabe/scitex)"
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
async def download_paper(
    self,
    paper: Paper,
    session: Optional[aiohttp.ClientSession] = None,
    force: bool = False,
) -> Optional[Path]:
    """Download PDF for a single paper.

    Parameters
    ----------
    paper : Paper
        Paper to download. On success its ``pdf_path`` attribute is set
        to the saved file.
    session : aiohttp.ClientSession, optional
        Session for connection pooling. When omitted, a temporary session
        is created for this call and closed before returning.
    force : bool
        Force re-download even if file exists

    Returns
    -------
    Path or None
        Path to the PDF on disk, or None when no URL is available, the
        response status is not 200, the payload does not start with the
        ``%PDF`` signature, the request times out, or any other error
        occurs. Failures are logged rather than raised.
    """
    # Reuse the file the paper already points at
    if paper.pdf_path and paper.pdf_path.exists() and not force:
        logger.info(f"PDF already exists: {paper.pdf_path}")
        return paper.pdf_path

    # Generate filename
    filename = self._generate_filename(paper)
    pdf_path = self.download_dir / filename

    # Reuse a file left in the download directory by an earlier run
    if pdf_path.exists() and not force:
        paper.pdf_path = pdf_path
        logger.info(f"PDF already downloaded: {pdf_path}")
        return pdf_path

    # Get PDF URL
    pdf_url = self._get_pdf_url(paper)
    if not pdf_url:
        logger.warning(f"No PDF URL available for: {paper.title}")
        return None

    # Only a session created here is closed here; a caller-supplied
    # session stays open for further use.
    close_session = session is None
    if close_session:
        session = aiohttp.ClientSession()

    try:
        logger.info(f"Downloading PDF from: {pdf_url}")

        async with session.get(
            pdf_url,
            headers=self.headers,
            timeout=aiohttp.ClientTimeout(total=self.timeout),
        ) as response:
            if response.status != 200:
                logger.error(f"Failed to download PDF (status {response.status}): {paper.title}")
                return None
            content = await response.read()

        # Verify it's a PDF (servers may answer 200 with an HTML page)
        if not content.startswith(b"%PDF"):
            logger.error(f"Downloaded content is not a PDF for: {paper.title}")
            return None

        # Write to a sibling temporary file and rename it into place, so
        # a failed write never leaves a truncated file at pdf_path that
        # the "already downloaded" check above would accept later.
        tmp_path = pdf_path.with_name(pdf_path.name + ".part")
        try:
            tmp_path.write_bytes(content)
            tmp_path.replace(pdf_path)
        except OSError:
            # Best-effort cleanup; the original error is what matters
            try:
                tmp_path.unlink()
            except OSError:
                pass
            raise

        paper.pdf_path = pdf_path
        logger.info(f"Downloaded PDF to: {pdf_path}")
        return pdf_path

    except asyncio.TimeoutError:
        logger.error(f"Timeout downloading PDF: {paper.title}")
        return None
    except Exception as e:
        logger.error(f"Error downloading PDF: {e}")
        return None
    finally:
        if close_session:
            await session.close()
|
|
140
|
+
|
|
141
|
+
async def download_papers(
    self,
    papers: List[Paper],
    force: bool = False,
    progress_callback: Optional[callable] = None,
) -> Dict[str, Path]:
    """Download PDFs for multiple papers.

    Downloads share one HTTP session and run concurrently, limited by
    ``self.max_concurrent`` (treated as at least 1, because a limit of 0
    would block every download forever).

    Parameters
    ----------
    papers : List[Paper]
        Papers to download
    force : bool
        Force re-download even if files exist
    progress_callback : callable, optional
        Callback function(completed, total). ``completed`` counts every
        paper whose download attempt has finished, whether or not a PDF
        was obtained, so it reaches ``total`` when all attempts are done.

    Returns
    -------
    Dict[str, Path]
        Mapping of paper identifiers (``paper.get_identifier()``) to PDF
        paths, containing only the papers that were downloaded
        successfully.
    """
    results = {}
    if not papers:
        # Nothing to do; avoid opening a session for an empty batch
        return results

    total = len(papers)
    completed = 0

    # Create semaphore for concurrent downloads
    semaphore = asyncio.Semaphore(max(1, self.max_concurrent))

    async def download_with_semaphore(paper, session):
        nonlocal completed
        async with semaphore:
            path = await self.download_paper(paper, session, force)

        if path:
            results[paper.get_identifier()] = path

        # Count the attempt before notifying, so a failing callback
        # cannot make later progress reports lag behind.
        completed += 1
        if progress_callback:
            progress_callback(completed, total)

        return path

    # Download all papers
    async with aiohttp.ClientSession() as session:
        outcomes = await asyncio.gather(
            *(download_with_semaphore(paper, session) for paper in papers),
            return_exceptions=True,
        )

    # return_exceptions=True keeps one failure from cancelling the rest,
    # but the collected exceptions must not vanish silently.
    for paper, outcome in zip(papers, outcomes):
        if isinstance(outcome, BaseException):
            logger.error(f"Unexpected error while processing {paper.title}: {outcome!r}")

    return results
|
|
185
|
+
|
|
186
|
+
def _generate_filename(self, paper: Paper) -> str:
|
|
187
|
+
"""Generate filename for PDF."""
|
|
188
|
+
# Clean title for filename
|
|
189
|
+
title = re.sub(r"[^\w\s-]", "", paper.title)
|
|
190
|
+
title = re.sub(r"[-\s]+", "-", title)
|
|
191
|
+
title = title[:100] # Limit length
|
|
192
|
+
|
|
193
|
+
# Add year if available
|
|
194
|
+
if paper.year:
|
|
195
|
+
filename = f"{paper.year}_{title}.pdf"
|
|
196
|
+
else:
|
|
197
|
+
filename = f"{title}.pdf"
|
|
198
|
+
|
|
199
|
+
return filename
|
|
200
|
+
|
|
201
|
+
def _get_pdf_url(self, paper: Paper) -> Optional[str]:
    """Work out where the PDF for a paper can be fetched from.

    Resolution order:

    1. a non-empty ``"pdf_url"`` entry in ``paper.metadata``;
    2. the arXiv PDF URL when ``paper.source == "arxiv"`` and an
       ``arxiv_id`` is set;
    3. the PubMed Central lookup when ``paper.source == "pubmed"`` and a
       ``pmid`` is set;
    4. a DOI-based URL when ``paper.doi`` is set.

    Parameters
    ----------
    paper : Paper
        Paper whose ``metadata``, ``source``, ``arxiv_id``, ``pmid`` and
        ``doi`` attributes are read.

    Returns
    -------
    str or None
        The PDF URL, or None when no rule applies.
    """
    # An explicit URL wins, but only if it actually has a value: a
    # "pdf_url" key holding None/"" must not block the fallbacks below.
    explicit_url = paper.metadata.get("pdf_url") if paper.metadata else None
    if explicit_url:
        return explicit_url

    # Source-specific URL generation
    if paper.source == "arxiv" and paper.arxiv_id:
        # arXiv PDF URL
        return f"https://arxiv.org/pdf/{paper.arxiv_id}.pdf"

    if paper.source == "pubmed" and paper.pmid:
        # PubMed Central PDF (if available); this requires checking PMC
        # availability. The lookup result is returned as-is, so a PubMed
        # paper never reaches the DOI branch below.
        return self._get_pmc_pdf_url(paper.pmid)

    if paper.doi:
        # Try Sci-Hub (for educational purposes only)
        # Note: Use responsibly and check local regulations
        return f"https://sci-hub.se/{paper.doi}"

    return None
|
|
223
|
+
|
|
224
|
+
def _get_pmc_pdf_url(self, pmid: str) -> Optional[str]:
    """Resolve a PMID to a PubMed Central PDF URL (placeholder).

    No lookup is performed yet; the method always returns None.

    Parameters
    ----------
    pmid : str
        PubMed identifier (currently unused).

    Returns
    -------
    str or None
        Always None for now.
    """
    # Checking PMC availability needs an API call that is not implemented.
    # A full implementation would:
    #   1. query PMC to check whether full text is available,
    #   2. obtain the PMC ID,
    #   3. construct the PDF URL from it.
    return None
|
|
233
|
+
|
|
234
|
+
async def download_from_url(
    self,
    url: str,
    filename: Optional[str] = None,
    session: Optional[aiohttp.ClientSession] = None,
) -> Optional[Path]:
    """Fetch a PDF from a direct URL and store it in ``self.download_dir``.

    Parameters
    ----------
    url : str
        Address of the PDF.
    filename : str, optional
        Name to save the file under. When omitted, the last path segment
        of ``url`` is used if it ends with ``.pdf``; otherwise
        ``"downloaded_paper.pdf"``.
    session : aiohttp.ClientSession, optional
        Existing session to reuse. When omitted, a temporary session is
        created and closed before returning.

    Returns
    -------
    Path or None
        Location of the saved file, or None when the response status is
        not 200 or any exception occurs (both cases are logged).
    """
    if not filename:
        # Derive the name from the URL path
        filename = Path(urlparse(url).path).name
        if not filename.endswith(".pdf"):
            filename = "downloaded_paper.pdf"

    pdf_path = self.download_dir / filename

    # Only close the session on exit if it was created here.
    owns_session = session is None
    if owns_session:
        session = aiohttp.ClientSession()

    try:
        request_timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with session.get(
            url, headers=self.headers, timeout=request_timeout
        ) as response:
            if response.status != 200:
                logger.error(f"Failed to download from {url} (status {response.status})")
                return None

            payload = await response.read()

            # Save PDF
            with open(pdf_path, "wb") as f:
                f.write(payload)

            logger.info(f"Downloaded PDF to: {pdf_path}")
            return pdf_path
    except Exception as e:
        logger.error(f"Error downloading from {url}: {e}")
        return None
    finally:
        if owns_session:
            await session.close()
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# Example usage
|
|
298
|
+
if __name__ == "__main__":
    async def main():
        """Demo: download one arXiv paper via its metadata, then by URL."""
        # Create downloader that saves into ./papers
        downloader = PDFDownloader(download_dir=Path("./papers"))

        # Example paper
        paper = Paper(
            title="Attention Is All You Need",
            authors=["Ashish Vaswani", "Noam Shazeer", "et al."],
            abstract="The dominant sequence transduction models...",
            source="arxiv",
            year=2017,
            arxiv_id="1706.03762",
        )

        # Download single paper; a falsy result means nothing was saved
        pdf_path = await downloader.download_paper(paper)
        if pdf_path:
            print(f"Downloaded to: {pdf_path}")

        # Download from a direct URL under an explicit filename
        url = "https://arxiv.org/pdf/1706.03762.pdf"
        path = await downloader.download_from_url(url, "attention_paper.pdf")
        if path:
            print(f"Downloaded from URL to: {path}")

    # Run example
    asyncio.run(main())
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Time-stamp: "2024-12-06 10:25:00"
|
|
4
|
+
# Author: Claude
|
|
5
|
+
# Filename: _search.py
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
Unified search interface for SciTeX Scholar.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import asyncio
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Optional, Union, Dict, Any, Tuple
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
from ._paper import Paper
|
|
18
|
+
from ._vector_search import VectorSearchEngine
|
|
19
|
+
from ._web_sources import search_all_sources
|
|
20
|
+
from ._local_search import LocalSearchEngine
|
|
21
|
+
from ._pdf_downloader import PDFDownloader
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_scholar_dir() -> Path:
    """Return the SciTeX Scholar data directory, creating it if needed.

    The location is read from the ``SciTeX_SCHOLAR_DIR`` environment
    variable. If the variable is unset, empty or only whitespace, the
    default ``~/.scitex/scholar`` is used. ``~`` is expanded.

    Returns
    -------
    Path
        The existing directory (missing parents are created).
    """
    default_dir = '~/.scitex/scholar'
    scholar_dir = os.environ.get('SciTeX_SCHOLAR_DIR', default_dir)

    # An empty value would make Path("") resolve to the current working
    # directory, scattering index files there; treat it as "not set".
    if not scholar_dir.strip():
        scholar_dir = default_dir

    path = Path(scholar_dir).expanduser()
    path.mkdir(parents=True, exist_ok=True)
    return path
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def search(
    query: str,
    web: bool = True,
    local: Optional[List[Union[str, Path]]] = None,
    max_results: int = 20,
    download_pdfs: bool = False,
    use_vector_search: bool = True,
    web_sources: Optional[List[str]] = None,
) -> List[Paper]:
    """Find scientific papers on the web and/or in local directories.

    Web and local hits are merged, de-duplicated, optionally re-ranked by
    vector similarity, and cut down to ``max_results``.

    Parameters
    ----------
    query : str
        Search query
    web : bool
        Whether to query web sources (PubMed, arXiv, etc.)
    local : List[str or Path], optional
        Local directories to search (``~`` is expanded). None or an empty
        list disables local search.
    max_results : int
        Maximum number of results to return. Each web source is asked for
        ``max(5, max_results // 3)`` results.
    download_pdfs : bool
        Whether to download PDFs for the results. Only honoured when
        ``web`` is also true.
    use_vector_search : bool
        Whether to re-rank results with vector similarity search. When
        disabled (or when nothing was found) the de-duplicated list is
        simply truncated.
    web_sources : List[str], optional
        Web sources to search (default: all available)

    Returns
    -------
    List[Paper]
        Papers matching the query

    Examples
    --------
    >>> import asyncio
    >>> import scitex.scholar
    >>>
    >>> # Search web only (no local)
    >>> papers = asyncio.run(scitex.scholar.search("deep learning"))
    >>>
    >>> # Search specific local directories
    >>> papers = asyncio.run(scitex.scholar.search(
    ...     "neural networks",
    ...     web=False,
    ...     local=["./papers", "~/Documents/papers"]
    ... ))
    >>>
    >>> # Search both web and local
    >>> papers = asyncio.run(scitex.scholar.search(
    ...     "transformer architecture",
    ...     local=["./my_papers"],
    ...     download_pdfs=True
    ... ))
    """
    candidates = []
    scholar_dir = get_scholar_dir()

    # Web sources
    if web:
        per_source_limit = max(5, max_results // 3)
        web_hits = await _search_web_sources(
            query,
            max_results_per_source=per_source_limit,
            sources=web_sources,
        )
        candidates.extend(web_hits)
        logger.info(f"Found {len(web_hits)} papers from web sources")

    # Local sources, only when paths were given
    if local:
        expanded_paths = [Path(entry).expanduser() for entry in local]
        local_hits = await _search_local_sources(
            query,
            expanded_paths,
            max_results=max_results,
        )
        candidates.extend(local_hits)
        logger.info(f"Found {len(local_hits)} papers from local sources")

    # Drop duplicates (identifier match or near-identical title)
    papers = _deduplicate_papers(candidates)

    if not (use_vector_search and papers):
        # No re-ranking: keep the existing order and truncate
        papers = papers[:max_results]
    else:
        papers = await _apply_vector_search(query, papers, max_results, scholar_dir)

    # PDFs are fetched only for searches that included the web
    if download_pdfs and web:
        await _download_pdfs(papers, scholar_dir / "pdfs")

    return papers
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
async def _search_web_sources(
    query: str,
    max_results_per_source: int,
    sources: Optional[List[str]] = None
) -> List[Paper]:
    """Query the web sources and flatten the per-source results.

    Parameters
    ----------
    query : str
        Search query
    max_results_per_source : int
        Result limit forwarded to ``search_all_sources``
    sources : List[str], optional
        Source names forwarded to ``search_all_sources``

    Returns
    -------
    List[Paper]
        All papers from every source in one list; an empty list if
        anything raises (the error is logged).
    """
    try:
        by_source = await search_all_sources(
            query,
            max_results_per_source=max_results_per_source,
            sources=sources,
        )
        # by_source maps a source name to its papers; only the papers matter
        return [
            paper
            for _source_name, batch in by_source.items()
            for paper in batch
        ]
    except Exception as e:
        logger.error(f"Error in web search: {e}")
        return []
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
async def _search_local_sources(
    query: str,
    paths: List[Path],
    max_results: int
) -> List[Paper]:
    """Run the local search engine over the given paths.

    Parameters
    ----------
    query : str
        Search query
    paths : List[Path]
        Locations to search (recursively)
    max_results : int
        Result limit forwarded to the engine

    Returns
    -------
    List[Paper]
        Matching papers with their scores dropped; an empty list if
        anything raises (the error is logged).
    """
    try:
        engine = LocalSearchEngine(
            index_path=get_scholar_dir() / "local_index.json",
            cache_metadata=True,
        )
        scored_hits = engine.search(
            query,
            paths,
            recursive=True,
            max_results=max_results,
        )
        # The engine yields (paper, score) pairs; callers only need papers
        return [paper for paper, _score in scored_hits]
    except Exception as e:
        logger.error(f"Error in local search: {e}")
        return []
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
async def _apply_vector_search(
    query: str,
    papers: List[Paper],
    max_results: int,
    scholar_dir: Path
) -> List[Paper]:
    """Rank papers against the query with the vector search engine.

    Every paper is added to the engine stored at
    ``scholar_dir / "vector_index.pkl"``, the engine is queried for the
    top ``max_results`` hits, and the index is saved.

    Parameters
    ----------
    query : str
        Search query
    papers : List[Paper]
        Candidate papers
    max_results : int
        Number of hits requested (``top_k``)
    scholar_dir : Path
        Directory holding the vector index file

    Returns
    -------
    List[Paper]
        Papers returned by the engine; on any error, the first
        ``max_results`` input papers in their original order.
    """
    # NOTE(review): the engine is bound to a persistent index file; whether
    # hits can include papers indexed by earlier calls depends on
    # VectorSearchEngine - confirm.
    try:
        engine = VectorSearchEngine(
            index_path=scholar_dir / "vector_index.pkl",
            embedding_dim=384,  # Using smaller model by default
            similarity_metric="cosine",
        )

        # Register each candidate (embedding refreshed on add)
        for candidate in papers:
            engine.add_paper(candidate, update_embedding=True)

        # Query, then persist the updated index before returning
        ranked = engine.search(query, top_k=max_results)
        engine.save_index()

        return [paper for paper, _score in ranked]
    except Exception as e:
        logger.error(f"Error in vector search: {e}")
        # Fallback to original order
        return papers[:max_results]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
async def _download_pdfs(papers: List[Paper], download_dir: Path) -> None:
    """Download PDFs for arXiv/PubMed papers that lack a local copy.

    Parameters
    ----------
    papers : List[Paper]
        Papers to consider; only those for which ``has_pdf()`` is false
        and whose ``source`` is ``"arxiv"`` or ``"pubmed"`` are downloaded.
    download_dir : Path
        Directory handed to ``PDFDownloader``

    Notes
    -----
    Any exception is logged and swallowed; nothing is returned.
    """
    try:
        downloader = PDFDownloader(download_dir=download_dir)

        # Papers still missing a PDF, from sources we can download from
        pending = [
            paper for paper in papers
            if not paper.has_pdf() and paper.source in ["arxiv", "pubmed"]
        ]
        if not pending:
            return

        logger.info(f"Downloading PDFs for {len(pending)} papers...")

        def report_progress(completed, total):
            # Log every fifth completion and the final one
            if completed % 5 == 0 or completed == total:
                logger.info(f"Downloaded {completed}/{total} PDFs")

        await downloader.download_papers(
            pending,
            progress_callback=report_progress,
        )
    except Exception as e:
        logger.error(f"Error downloading PDFs: {e}")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _deduplicate_papers(papers: List[Paper]) -> List[Paper]:
    """Drop papers that repeat an earlier entry.

    A paper is discarded when its ``get_identifier()`` value was already
    kept, or when its ``similarity_score`` against any kept paper exceeds
    0.8. The first occurrence wins and input order is preserved.

    Parameters
    ----------
    papers : List[Paper]
        Papers to filter

    Returns
    -------
    List[Paper]
        The unique papers; a falsy input is returned unchanged.
    """
    if not papers:
        return papers

    kept = []
    known_ids = set()

    for candidate in papers:
        candidate_id = candidate.get_identifier()

        # Exact identifier already kept
        if candidate_id in known_ids:
            continue

        # Too similar (strictly above the 80% threshold) to a kept paper
        if any(candidate.similarity_score(prior) > 0.8 for prior in kept):
            continue

        kept.append(candidate)
        known_ids.add(candidate_id)

    return kept
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def build_index(
    paths: Optional[List[Union[str, Path]]] = None,
    recursive: bool = True,
    build_vector_index: bool = True,
) -> Dict[str, Any]:
    """Index local papers for searching.

    Builds the local search index and, optionally, vector embeddings for
    every indexed paper. Both are stored in the scholar directory.

    Parameters
    ----------
    paths : List[str or Path], optional
        Paths to index (default: current directory). ``~`` is expanded.
    recursive : bool
        Whether to search directories recursively
    build_vector_index : bool
        Whether to build vector embeddings. Skipped when no files were
        indexed.

    Returns
    -------
    Dict[str, Any]
        Index statistics: ``"local_files_indexed"`` and, when embeddings
        were built, ``"vector_embeddings_created"`` plus whatever the
        vector engine's ``get_statistics()`` reports.

    Examples
    --------
    >>> import scitex.scholar
    >>>
    >>> # Index current directory
    >>> stats = scitex.scholar.build_index()
    >>>
    >>> # Index multiple directories
    >>> stats = scitex.scholar.build_index([
    ...     "./papers",
    ...     "~/Documents/research"
    ... ])
    """
    paths = (
        [Path(".")]
        if paths is None
        else [Path(entry).expanduser() for entry in paths]
    )

    scholar_dir = get_scholar_dir()

    # Local search index
    logger.info(f"Building local search index for {len(paths)} paths...")
    local_engine = LocalSearchEngine(
        index_path=scholar_dir / "local_index.json",
        cache_metadata=True,
    )
    num_files = local_engine.build_index(paths, recursive=recursive)
    stats = {"local_files_indexed": num_files}

    # Vector index, only when requested and there is something to embed
    if build_vector_index and num_files > 0:
        logger.info("Building vector embeddings...")

        # Collect every indexed paper, one wildcard query per root path
        indexed_papers = [
            paper
            for root in paths
            for paper, _score in local_engine.search("*", [root], max_results=None)
        ]

        vector_engine = VectorSearchEngine(
            index_path=scholar_dir / "vector_index.pkl",
            embedding_dim=384,
            similarity_metric="cosine",
        )

        # Embed each paper, logging progress every 10 papers
        for count, paper in enumerate(indexed_papers, start=1):
            vector_engine.add_paper(paper, update_embedding=True)
            if count % 10 == 0:
                logger.info(f"Generated embeddings for {count}/{len(indexed_papers)} papers")

        # Persist and report
        vector_engine.save_index()
        stats["vector_embeddings_created"] = len(indexed_papers)
        stats.update(vector_engine.get_statistics())

    logger.info(f"Index building complete: {stats}")
    return stats
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
# Synchronous wrapper for convenience
|
|
352
|
+
def search_sync(
    query: str,
    web: bool = True,
    local: Optional[List[Union[str, Path]]] = None,
    max_results: int = 20,
    download_pdfs: bool = False,
    use_vector_search: bool = True,
    web_sources: Optional[List[str]] = None,
) -> List[Paper]:
    """Synchronous wrapper for search function.

    See `search` for parameter documentation.

    Blocks until the search finishes. It can also be called from a thread
    that already runs an asyncio event loop (e.g. a Jupyter notebook): in
    that case the search runs on its own event loop in a worker thread.

    Returns
    -------
    List[Paper]
        Result of `search`.

    Examples
    --------
    >>> import scitex.scholar
    >>>
    >>> # Simple synchronous search (web only)
    >>> papers = scitex.scholar.search_sync("machine learning")
    >>>
    >>> # Search with local directories
    >>> papers = scitex.scholar.search_sync(
    ...     "deep learning",
    ...     local=["./papers", "~/Documents/research"]
    ... )
    >>>
    >>> # Local only search
    >>> papers = scitex.scholar.search_sync(
    ...     "neural networks",
    ...     web=False,
    ...     local=["./my_papers"]
    ... )
    """
    coro = search(
        query=query,
        web=web,
        local=local,
        max_results=max_results,
        download_pdfs=download_pdfs,
        use_vector_search=use_vector_search,
        web_sources=web_sources,
    )

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the ordinary path.
        return asyncio.run(coro)

    # A loop is already running here, so asyncio.run() would raise
    # RuntimeError. Drive the coroutine on a fresh loop in a worker thread
    # and wait for its result instead.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()
|