scitex 2.0.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +73 -0
- scitex/__main__.py +89 -0
- scitex/__version__.py +14 -0
- scitex/_sh.py +59 -0
- scitex/ai/_LearningCurveLogger.py +583 -0
- scitex/ai/__Classifiers.py +101 -0
- scitex/ai/__init__.py +55 -0
- scitex/ai/_gen_ai/_Anthropic.py +173 -0
- scitex/ai/_gen_ai/_BaseGenAI.py +336 -0
- scitex/ai/_gen_ai/_DeepSeek.py +175 -0
- scitex/ai/_gen_ai/_Google.py +161 -0
- scitex/ai/_gen_ai/_Groq.py +97 -0
- scitex/ai/_gen_ai/_Llama.py +142 -0
- scitex/ai/_gen_ai/_OpenAI.py +230 -0
- scitex/ai/_gen_ai/_PARAMS.py +565 -0
- scitex/ai/_gen_ai/_Perplexity.py +191 -0
- scitex/ai/_gen_ai/__init__.py +32 -0
- scitex/ai/_gen_ai/_calc_cost.py +78 -0
- scitex/ai/_gen_ai/_format_output_func.py +183 -0
- scitex/ai/_gen_ai/_genai_factory.py +71 -0
- scitex/ai/act/__init__.py +8 -0
- scitex/ai/act/_define.py +11 -0
- scitex/ai/classification/__init__.py +7 -0
- scitex/ai/classification/classification_reporter.py +1137 -0
- scitex/ai/classification/classifier_server.py +131 -0
- scitex/ai/classification/classifiers.py +101 -0
- scitex/ai/classification_reporter.py +1161 -0
- scitex/ai/classifier_server.py +131 -0
- scitex/ai/clustering/__init__.py +11 -0
- scitex/ai/clustering/_pca.py +115 -0
- scitex/ai/clustering/_umap.py +376 -0
- scitex/ai/early_stopping.py +149 -0
- scitex/ai/feature_extraction/__init__.py +56 -0
- scitex/ai/feature_extraction/vit.py +148 -0
- scitex/ai/genai/__init__.py +277 -0
- scitex/ai/genai/anthropic.py +177 -0
- scitex/ai/genai/anthropic_provider.py +320 -0
- scitex/ai/genai/anthropic_refactored.py +109 -0
- scitex/ai/genai/auth_manager.py +200 -0
- scitex/ai/genai/base_genai.py +336 -0
- scitex/ai/genai/base_provider.py +291 -0
- scitex/ai/genai/calc_cost.py +78 -0
- scitex/ai/genai/chat_history.py +307 -0
- scitex/ai/genai/cost_tracker.py +276 -0
- scitex/ai/genai/deepseek.py +188 -0
- scitex/ai/genai/deepseek_provider.py +251 -0
- scitex/ai/genai/format_output_func.py +183 -0
- scitex/ai/genai/genai_factory.py +71 -0
- scitex/ai/genai/google.py +169 -0
- scitex/ai/genai/google_provider.py +228 -0
- scitex/ai/genai/groq.py +104 -0
- scitex/ai/genai/groq_provider.py +248 -0
- scitex/ai/genai/image_processor.py +250 -0
- scitex/ai/genai/llama.py +155 -0
- scitex/ai/genai/llama_provider.py +214 -0
- scitex/ai/genai/mock_provider.py +127 -0
- scitex/ai/genai/model_registry.py +304 -0
- scitex/ai/genai/openai.py +230 -0
- scitex/ai/genai/openai_provider.py +293 -0
- scitex/ai/genai/params.py +565 -0
- scitex/ai/genai/perplexity.py +202 -0
- scitex/ai/genai/perplexity_provider.py +205 -0
- scitex/ai/genai/provider_base.py +302 -0
- scitex/ai/genai/provider_factory.py +370 -0
- scitex/ai/genai/response_handler.py +235 -0
- scitex/ai/layer/_Pass.py +21 -0
- scitex/ai/layer/__init__.py +10 -0
- scitex/ai/layer/_switch.py +8 -0
- scitex/ai/loss/_L1L2Losses.py +34 -0
- scitex/ai/loss/__init__.py +12 -0
- scitex/ai/loss/multi_task_loss.py +47 -0
- scitex/ai/metrics/__init__.py +9 -0
- scitex/ai/metrics/_bACC.py +51 -0
- scitex/ai/metrics/silhoute_score_block.py +496 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/__init__.py +0 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/__init__.py +3 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger.py +207 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger2020.py +238 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/ranger913A.py +215 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/ranger/rangerqh.py +184 -0
- scitex/ai/optim/Ranger_Deep_Learning_Optimizer/setup.py +24 -0
- scitex/ai/optim/__init__.py +13 -0
- scitex/ai/optim/_get_set.py +31 -0
- scitex/ai/optim/_optimizers.py +71 -0
- scitex/ai/plt/__init__.py +21 -0
- scitex/ai/plt/_conf_mat.py +592 -0
- scitex/ai/plt/_learning_curve.py +194 -0
- scitex/ai/plt/_optuna_study.py +111 -0
- scitex/ai/plt/aucs/__init__.py +2 -0
- scitex/ai/plt/aucs/example.py +60 -0
- scitex/ai/plt/aucs/pre_rec_auc.py +223 -0
- scitex/ai/plt/aucs/roc_auc.py +246 -0
- scitex/ai/sampling/undersample.py +29 -0
- scitex/ai/sk/__init__.py +11 -0
- scitex/ai/sk/_clf.py +58 -0
- scitex/ai/sk/_to_sktime.py +100 -0
- scitex/ai/sklearn/__init__.py +26 -0
- scitex/ai/sklearn/clf.py +58 -0
- scitex/ai/sklearn/to_sktime.py +100 -0
- scitex/ai/training/__init__.py +7 -0
- scitex/ai/training/early_stopping.py +150 -0
- scitex/ai/training/learning_curve_logger.py +555 -0
- scitex/ai/utils/__init__.py +22 -0
- scitex/ai/utils/_check_params.py +50 -0
- scitex/ai/utils/_default_dataset.py +46 -0
- scitex/ai/utils/_format_samples_for_sktime.py +26 -0
- scitex/ai/utils/_label_encoder.py +134 -0
- scitex/ai/utils/_merge_labels.py +22 -0
- scitex/ai/utils/_sliding_window_data_augmentation.py +11 -0
- scitex/ai/utils/_under_sample.py +51 -0
- scitex/ai/utils/_verify_n_gpus.py +16 -0
- scitex/ai/utils/grid_search.py +148 -0
- scitex/context/__init__.py +9 -0
- scitex/context/_suppress_output.py +38 -0
- scitex/db/_BaseMixins/_BaseBackupMixin.py +30 -0
- scitex/db/_BaseMixins/_BaseBatchMixin.py +31 -0
- scitex/db/_BaseMixins/_BaseBlobMixin.py +81 -0
- scitex/db/_BaseMixins/_BaseConnectionMixin.py +43 -0
- scitex/db/_BaseMixins/_BaseImportExportMixin.py +39 -0
- scitex/db/_BaseMixins/_BaseIndexMixin.py +29 -0
- scitex/db/_BaseMixins/_BaseMaintenanceMixin.py +33 -0
- scitex/db/_BaseMixins/_BaseQueryMixin.py +52 -0
- scitex/db/_BaseMixins/_BaseRowMixin.py +32 -0
- scitex/db/_BaseMixins/_BaseSchemaMixin.py +44 -0
- scitex/db/_BaseMixins/_BaseTableMixin.py +66 -0
- scitex/db/_BaseMixins/_BaseTransactionMixin.py +52 -0
- scitex/db/_BaseMixins/__init__.py +30 -0
- scitex/db/_PostgreSQL.py +126 -0
- scitex/db/_PostgreSQLMixins/_BackupMixin.py +166 -0
- scitex/db/_PostgreSQLMixins/_BatchMixin.py +82 -0
- scitex/db/_PostgreSQLMixins/_BlobMixin.py +231 -0
- scitex/db/_PostgreSQLMixins/_ConnectionMixin.py +92 -0
- scitex/db/_PostgreSQLMixins/_ImportExportMixin.py +59 -0
- scitex/db/_PostgreSQLMixins/_IndexMixin.py +64 -0
- scitex/db/_PostgreSQLMixins/_MaintenanceMixin.py +175 -0
- scitex/db/_PostgreSQLMixins/_QueryMixin.py +108 -0
- scitex/db/_PostgreSQLMixins/_RowMixin.py +75 -0
- scitex/db/_PostgreSQLMixins/_SchemaMixin.py +126 -0
- scitex/db/_PostgreSQLMixins/_TableMixin.py +176 -0
- scitex/db/_PostgreSQLMixins/_TransactionMixin.py +57 -0
- scitex/db/_PostgreSQLMixins/__init__.py +34 -0
- scitex/db/_SQLite3.py +2136 -0
- scitex/db/_SQLite3Mixins/_BatchMixin.py +243 -0
- scitex/db/_SQLite3Mixins/_BlobMixin.py +229 -0
- scitex/db/_SQLite3Mixins/_ConnectionMixin.py +108 -0
- scitex/db/_SQLite3Mixins/_ImportExportMixin.py +80 -0
- scitex/db/_SQLite3Mixins/_IndexMixin.py +32 -0
- scitex/db/_SQLite3Mixins/_MaintenanceMixin.py +176 -0
- scitex/db/_SQLite3Mixins/_QueryMixin.py +83 -0
- scitex/db/_SQLite3Mixins/_RowMixin.py +75 -0
- scitex/db/_SQLite3Mixins/_TableMixin.py +183 -0
- scitex/db/_SQLite3Mixins/_TransactionMixin.py +71 -0
- scitex/db/_SQLite3Mixins/__init__.py +30 -0
- scitex/db/__init__.py +14 -0
- scitex/db/_delete_duplicates.py +397 -0
- scitex/db/_inspect.py +163 -0
- scitex/decorators/__init__.py +54 -0
- scitex/decorators/_auto_order.py +172 -0
- scitex/decorators/_batch_fn.py +127 -0
- scitex/decorators/_cache_disk.py +32 -0
- scitex/decorators/_cache_mem.py +12 -0
- scitex/decorators/_combined.py +98 -0
- scitex/decorators/_converters.py +282 -0
- scitex/decorators/_deprecated.py +26 -0
- scitex/decorators/_not_implemented.py +30 -0
- scitex/decorators/_numpy_fn.py +86 -0
- scitex/decorators/_pandas_fn.py +121 -0
- scitex/decorators/_preserve_doc.py +19 -0
- scitex/decorators/_signal_fn.py +95 -0
- scitex/decorators/_timeout.py +55 -0
- scitex/decorators/_torch_fn.py +136 -0
- scitex/decorators/_wrap.py +39 -0
- scitex/decorators/_xarray_fn.py +88 -0
- scitex/dev/__init__.py +15 -0
- scitex/dev/_analyze_code_flow.py +284 -0
- scitex/dev/_reload.py +59 -0
- scitex/dict/_DotDict.py +442 -0
- scitex/dict/__init__.py +18 -0
- scitex/dict/_listed_dict.py +42 -0
- scitex/dict/_pop_keys.py +36 -0
- scitex/dict/_replace.py +13 -0
- scitex/dict/_safe_merge.py +62 -0
- scitex/dict/_to_str.py +32 -0
- scitex/dsp/__init__.py +72 -0
- scitex/dsp/_crop.py +122 -0
- scitex/dsp/_demo_sig.py +331 -0
- scitex/dsp/_detect_ripples.py +212 -0
- scitex/dsp/_ensure_3d.py +18 -0
- scitex/dsp/_hilbert.py +78 -0
- scitex/dsp/_listen.py +702 -0
- scitex/dsp/_misc.py +30 -0
- scitex/dsp/_mne.py +32 -0
- scitex/dsp/_modulation_index.py +79 -0
- scitex/dsp/_pac.py +319 -0
- scitex/dsp/_psd.py +102 -0
- scitex/dsp/_resample.py +65 -0
- scitex/dsp/_time.py +36 -0
- scitex/dsp/_transform.py +68 -0
- scitex/dsp/_wavelet.py +212 -0
- scitex/dsp/add_noise.py +111 -0
- scitex/dsp/example.py +253 -0
- scitex/dsp/filt.py +155 -0
- scitex/dsp/norm.py +18 -0
- scitex/dsp/params.py +51 -0
- scitex/dsp/reference.py +43 -0
- scitex/dsp/template.py +25 -0
- scitex/dsp/utils/__init__.py +15 -0
- scitex/dsp/utils/_differential_bandpass_filters.py +120 -0
- scitex/dsp/utils/_ensure_3d.py +18 -0
- scitex/dsp/utils/_ensure_even_len.py +10 -0
- scitex/dsp/utils/_zero_pad.py +48 -0
- scitex/dsp/utils/filter.py +408 -0
- scitex/dsp/utils/pac.py +177 -0
- scitex/dt/__init__.py +8 -0
- scitex/dt/_linspace.py +130 -0
- scitex/etc/__init__.py +15 -0
- scitex/etc/wait_key.py +34 -0
- scitex/gen/_DimHandler.py +196 -0
- scitex/gen/_TimeStamper.py +244 -0
- scitex/gen/__init__.py +95 -0
- scitex/gen/_alternate_kwarg.py +13 -0
- scitex/gen/_cache.py +11 -0
- scitex/gen/_check_host.py +34 -0
- scitex/gen/_ci.py +12 -0
- scitex/gen/_close.py +222 -0
- scitex/gen/_embed.py +78 -0
- scitex/gen/_inspect_module.py +257 -0
- scitex/gen/_is_ipython.py +12 -0
- scitex/gen/_less.py +48 -0
- scitex/gen/_list_packages.py +139 -0
- scitex/gen/_mat2py.py +88 -0
- scitex/gen/_norm.py +170 -0
- scitex/gen/_paste.py +18 -0
- scitex/gen/_print_config.py +84 -0
- scitex/gen/_shell.py +48 -0
- scitex/gen/_src.py +111 -0
- scitex/gen/_start.py +451 -0
- scitex/gen/_symlink.py +55 -0
- scitex/gen/_symlog.py +27 -0
- scitex/gen/_tee.py +238 -0
- scitex/gen/_title2path.py +60 -0
- scitex/gen/_title_case.py +88 -0
- scitex/gen/_to_even.py +84 -0
- scitex/gen/_to_odd.py +34 -0
- scitex/gen/_to_rank.py +39 -0
- scitex/gen/_transpose.py +37 -0
- scitex/gen/_type.py +78 -0
- scitex/gen/_var_info.py +73 -0
- scitex/gen/_wrap.py +17 -0
- scitex/gen/_xml2dict.py +76 -0
- scitex/gen/misc.py +730 -0
- scitex/gen/path.py +0 -0
- scitex/general/__init__.py +5 -0
- scitex/gists/_SigMacro_processFigure_S.py +128 -0
- scitex/gists/_SigMacro_toBlue.py +172 -0
- scitex/gists/__init__.py +12 -0
- scitex/io/_H5Explorer.py +292 -0
- scitex/io/__init__.py +82 -0
- scitex/io/_cache.py +101 -0
- scitex/io/_flush.py +24 -0
- scitex/io/_glob.py +103 -0
- scitex/io/_json2md.py +113 -0
- scitex/io/_load.py +168 -0
- scitex/io/_load_configs.py +146 -0
- scitex/io/_load_modules/__init__.py +38 -0
- scitex/io/_load_modules/_catboost.py +66 -0
- scitex/io/_load_modules/_con.py +20 -0
- scitex/io/_load_modules/_db.py +24 -0
- scitex/io/_load_modules/_docx.py +42 -0
- scitex/io/_load_modules/_eeg.py +110 -0
- scitex/io/_load_modules/_hdf5.py +196 -0
- scitex/io/_load_modules/_image.py +19 -0
- scitex/io/_load_modules/_joblib.py +19 -0
- scitex/io/_load_modules/_json.py +18 -0
- scitex/io/_load_modules/_markdown.py +103 -0
- scitex/io/_load_modules/_matlab.py +37 -0
- scitex/io/_load_modules/_numpy.py +39 -0
- scitex/io/_load_modules/_optuna.py +155 -0
- scitex/io/_load_modules/_pandas.py +69 -0
- scitex/io/_load_modules/_pdf.py +31 -0
- scitex/io/_load_modules/_pickle.py +24 -0
- scitex/io/_load_modules/_torch.py +16 -0
- scitex/io/_load_modules/_txt.py +126 -0
- scitex/io/_load_modules/_xml.py +49 -0
- scitex/io/_load_modules/_yaml.py +23 -0
- scitex/io/_mv_to_tmp.py +19 -0
- scitex/io/_path.py +286 -0
- scitex/io/_reload.py +78 -0
- scitex/io/_save.py +539 -0
- scitex/io/_save_modules/__init__.py +66 -0
- scitex/io/_save_modules/_catboost.py +22 -0
- scitex/io/_save_modules/_csv.py +89 -0
- scitex/io/_save_modules/_excel.py +49 -0
- scitex/io/_save_modules/_hdf5.py +249 -0
- scitex/io/_save_modules/_html.py +48 -0
- scitex/io/_save_modules/_image.py +140 -0
- scitex/io/_save_modules/_joblib.py +25 -0
- scitex/io/_save_modules/_json.py +25 -0
- scitex/io/_save_modules/_listed_dfs_as_csv.py +57 -0
- scitex/io/_save_modules/_listed_scalars_as_csv.py +42 -0
- scitex/io/_save_modules/_matlab.py +24 -0
- scitex/io/_save_modules/_mp4.py +29 -0
- scitex/io/_save_modules/_numpy.py +57 -0
- scitex/io/_save_modules/_optuna_study_as_csv_and_pngs.py +38 -0
- scitex/io/_save_modules/_pickle.py +45 -0
- scitex/io/_save_modules/_plotly.py +27 -0
- scitex/io/_save_modules/_text.py +23 -0
- scitex/io/_save_modules/_torch.py +26 -0
- scitex/io/_save_modules/_yaml.py +29 -0
- scitex/life/__init__.py +10 -0
- scitex/life/_monitor_rain.py +49 -0
- scitex/linalg/__init__.py +17 -0
- scitex/linalg/_distance.py +63 -0
- scitex/linalg/_geometric_median.py +64 -0
- scitex/linalg/_misc.py +73 -0
- scitex/nn/_AxiswiseDropout.py +27 -0
- scitex/nn/_BNet.py +126 -0
- scitex/nn/_BNet_Res.py +164 -0
- scitex/nn/_ChannelGainChanger.py +44 -0
- scitex/nn/_DropoutChannels.py +50 -0
- scitex/nn/_Filters.py +489 -0
- scitex/nn/_FreqGainChanger.py +110 -0
- scitex/nn/_GaussianFilter.py +48 -0
- scitex/nn/_Hilbert.py +111 -0
- scitex/nn/_MNet_1000.py +157 -0
- scitex/nn/_ModulationIndex.py +221 -0
- scitex/nn/_PAC.py +414 -0
- scitex/nn/_PSD.py +40 -0
- scitex/nn/_ResNet1D.py +120 -0
- scitex/nn/_SpatialAttention.py +25 -0
- scitex/nn/_Spectrogram.py +161 -0
- scitex/nn/_SwapChannels.py +50 -0
- scitex/nn/_TransposeLayer.py +19 -0
- scitex/nn/_Wavelet.py +183 -0
- scitex/nn/__init__.py +63 -0
- scitex/os/__init__.py +8 -0
- scitex/os/_mv.py +50 -0
- scitex/parallel/__init__.py +8 -0
- scitex/parallel/_run.py +151 -0
- scitex/path/__init__.py +33 -0
- scitex/path/_clean.py +52 -0
- scitex/path/_find.py +108 -0
- scitex/path/_get_module_path.py +51 -0
- scitex/path/_get_spath.py +35 -0
- scitex/path/_getsize.py +18 -0
- scitex/path/_increment_version.py +87 -0
- scitex/path/_mk_spath.py +51 -0
- scitex/path/_path.py +19 -0
- scitex/path/_split.py +23 -0
- scitex/path/_this_path.py +19 -0
- scitex/path/_version.py +101 -0
- scitex/pd/__init__.py +41 -0
- scitex/pd/_find_indi.py +126 -0
- scitex/pd/_find_pval.py +113 -0
- scitex/pd/_force_df.py +154 -0
- scitex/pd/_from_xyz.py +71 -0
- scitex/pd/_ignore_SettingWithCopyWarning.py +34 -0
- scitex/pd/_melt_cols.py +81 -0
- scitex/pd/_merge_columns.py +221 -0
- scitex/pd/_mv.py +63 -0
- scitex/pd/_replace.py +62 -0
- scitex/pd/_round.py +93 -0
- scitex/pd/_slice.py +63 -0
- scitex/pd/_sort.py +91 -0
- scitex/pd/_to_numeric.py +53 -0
- scitex/pd/_to_xy.py +59 -0
- scitex/pd/_to_xyz.py +110 -0
- scitex/plt/__init__.py +36 -0
- scitex/plt/_subplots/_AxesWrapper.py +182 -0
- scitex/plt/_subplots/_AxisWrapper.py +249 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +414 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +896 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +368 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_TrackingMixin.py +185 -0
- scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +16 -0
- scitex/plt/_subplots/_FigWrapper.py +226 -0
- scitex/plt/_subplots/_SubplotsWrapper.py +171 -0
- scitex/plt/_subplots/__init__.py +111 -0
- scitex/plt/_subplots/_export_as_csv.py +232 -0
- scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +61 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_bar.py +90 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_barh.py +49 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_boxplot.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_contour.py +39 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_errorbar.py +125 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +72 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill.py +34 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_fill_between.py +36 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_hist.py +79 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow.py +59 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +32 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot.py +79 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_box.py +75 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_conf_mat.py +64 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_ecdf.py +44 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_fillv.py +70 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_heatmap.py +66 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_image.py +95 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_joyplot.py +67 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +52 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_line.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_ci.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_mean_std.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_median_iqr.py +46 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_raster.py +44 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_rectangle.py +103 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter_hist.py +82 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_shaded_line.py +58 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_violin.py +117 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_scatter.py +30 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_barplot.py +51 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_boxplot.py +93 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_heatmap.py +94 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_histplot.py +92 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +65 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_kdeplot.py +59 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +58 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +45 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_scatterplot.py +70 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_stripplot.py +75 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_swarmplot.py +75 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_violinplot.py +155 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violin.py +64 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_violinplot.py +77 -0
- scitex/plt/_subplots/_export_as_csv_formatters/test_formatters.py +210 -0
- scitex/plt/_subplots/_export_as_csv_formatters/verify_formatters.py +342 -0
- scitex/plt/_subplots/_export_as_csv_formatters.py +115 -0
- scitex/plt/_tpl.py +28 -0
- scitex/plt/ax/__init__.py +114 -0
- scitex/plt/ax/_plot/__init__.py +53 -0
- scitex/plt/ax/_plot/_plot_circular_hist.py +124 -0
- scitex/plt/ax/_plot/_plot_conf_mat.py +136 -0
- scitex/plt/ax/_plot/_plot_cube.py +57 -0
- scitex/plt/ax/_plot/_plot_ecdf.py +84 -0
- scitex/plt/ax/_plot/_plot_fillv.py +55 -0
- scitex/plt/ax/_plot/_plot_heatmap.py +266 -0
- scitex/plt/ax/_plot/_plot_image.py +94 -0
- scitex/plt/ax/_plot/_plot_joyplot.py +76 -0
- scitex/plt/ax/_plot/_plot_raster.py +172 -0
- scitex/plt/ax/_plot/_plot_rectangle.py +69 -0
- scitex/plt/ax/_plot/_plot_scatter_hist.py +133 -0
- scitex/plt/ax/_plot/_plot_shaded_line.py +142 -0
- scitex/plt/ax/_plot/_plot_statistical_shaded_line.py +221 -0
- scitex/plt/ax/_plot/_plot_violin.py +343 -0
- scitex/plt/ax/_style/__init__.py +38 -0
- scitex/plt/ax/_style/_add_marginal_ax.py +44 -0
- scitex/plt/ax/_style/_add_panel.py +92 -0
- scitex/plt/ax/_style/_extend.py +64 -0
- scitex/plt/ax/_style/_force_aspect.py +37 -0
- scitex/plt/ax/_style/_format_label.py +23 -0
- scitex/plt/ax/_style/_hide_spines.py +84 -0
- scitex/plt/ax/_style/_map_ticks.py +182 -0
- scitex/plt/ax/_style/_rotate_labels.py +215 -0
- scitex/plt/ax/_style/_sci_note.py +279 -0
- scitex/plt/ax/_style/_set_log_scale.py +299 -0
- scitex/plt/ax/_style/_set_meta.py +261 -0
- scitex/plt/ax/_style/_set_n_ticks.py +37 -0
- scitex/plt/ax/_style/_set_size.py +16 -0
- scitex/plt/ax/_style/_set_supxyt.py +116 -0
- scitex/plt/ax/_style/_set_ticks.py +276 -0
- scitex/plt/ax/_style/_set_xyt.py +121 -0
- scitex/plt/ax/_style/_share_axes.py +264 -0
- scitex/plt/ax/_style/_shift.py +139 -0
- scitex/plt/ax/_style/_show_spines.py +333 -0
- scitex/plt/color/_PARAMS.py +70 -0
- scitex/plt/color/__init__.py +52 -0
- scitex/plt/color/_add_hue_col.py +41 -0
- scitex/plt/color/_colors.py +205 -0
- scitex/plt/color/_get_colors_from_cmap.py +134 -0
- scitex/plt/color/_interpolate.py +29 -0
- scitex/plt/color/_vizualize_colors.py +54 -0
- scitex/plt/utils/__init__.py +44 -0
- scitex/plt/utils/_calc_bacc_from_conf_mat.py +46 -0
- scitex/plt/utils/_calc_nice_ticks.py +101 -0
- scitex/plt/utils/_close.py +68 -0
- scitex/plt/utils/_colorbar.py +96 -0
- scitex/plt/utils/_configure_mpl.py +295 -0
- scitex/plt/utils/_histogram_utils.py +132 -0
- scitex/plt/utils/_im2grid.py +70 -0
- scitex/plt/utils/_is_valid_axis.py +78 -0
- scitex/plt/utils/_mk_colorbar.py +65 -0
- scitex/plt/utils/_mk_patches.py +26 -0
- scitex/plt/utils/_scientific_captions.py +638 -0
- scitex/plt/utils/_scitex_config.py +223 -0
- scitex/reproduce/__init__.py +14 -0
- scitex/reproduce/_fix_seeds.py +45 -0
- scitex/reproduce/_gen_ID.py +55 -0
- scitex/reproduce/_gen_timestamp.py +35 -0
- scitex/res/__init__.py +5 -0
- scitex/resource/__init__.py +13 -0
- scitex/resource/_get_processor_usages.py +281 -0
- scitex/resource/_get_specs.py +280 -0
- scitex/resource/_log_processor_usages.py +190 -0
- scitex/resource/_utils/__init__.py +31 -0
- scitex/resource/_utils/_get_env_info.py +481 -0
- scitex/resource/limit_ram.py +33 -0
- scitex/scholar/__init__.py +24 -0
- scitex/scholar/_local_search.py +454 -0
- scitex/scholar/_paper.py +244 -0
- scitex/scholar/_pdf_downloader.py +325 -0
- scitex/scholar/_search.py +393 -0
- scitex/scholar/_vector_search.py +370 -0
- scitex/scholar/_web_sources.py +457 -0
- scitex/stats/__init__.py +31 -0
- scitex/stats/_calc_partial_corr.py +17 -0
- scitex/stats/_corr_test_multi.py +94 -0
- scitex/stats/_corr_test_wrapper.py +115 -0
- scitex/stats/_describe_wrapper.py +90 -0
- scitex/stats/_multiple_corrections.py +63 -0
- scitex/stats/_nan_stats.py +93 -0
- scitex/stats/_p2stars.py +116 -0
- scitex/stats/_p2stars_wrapper.py +56 -0
- scitex/stats/_statistical_tests.py +73 -0
- scitex/stats/desc/__init__.py +40 -0
- scitex/stats/desc/_describe.py +189 -0
- scitex/stats/desc/_nan.py +289 -0
- scitex/stats/desc/_real.py +94 -0
- scitex/stats/multiple/__init__.py +14 -0
- scitex/stats/multiple/_bonferroni_correction.py +72 -0
- scitex/stats/multiple/_fdr_correction.py +400 -0
- scitex/stats/multiple/_multicompair.py +28 -0
- scitex/stats/tests/__corr_test.py +277 -0
- scitex/stats/tests/__corr_test_multi.py +343 -0
- scitex/stats/tests/__corr_test_single.py +277 -0
- scitex/stats/tests/__init__.py +22 -0
- scitex/stats/tests/_brunner_munzel_test.py +192 -0
- scitex/stats/tests/_nocorrelation_test.py +28 -0
- scitex/stats/tests/_smirnov_grubbs.py +98 -0
- scitex/str/__init__.py +113 -0
- scitex/str/_clean_path.py +75 -0
- scitex/str/_color_text.py +52 -0
- scitex/str/_decapitalize.py +58 -0
- scitex/str/_factor_out_digits.py +281 -0
- scitex/str/_format_plot_text.py +498 -0
- scitex/str/_grep.py +48 -0
- scitex/str/_latex.py +155 -0
- scitex/str/_latex_fallback.py +471 -0
- scitex/str/_mask_api.py +39 -0
- scitex/str/_mask_api_key.py +8 -0
- scitex/str/_parse.py +158 -0
- scitex/str/_print_block.py +47 -0
- scitex/str/_print_debug.py +68 -0
- scitex/str/_printc.py +62 -0
- scitex/str/_readable_bytes.py +38 -0
- scitex/str/_remove_ansi.py +23 -0
- scitex/str/_replace.py +134 -0
- scitex/str/_search.py +125 -0
- scitex/str/_squeeze_space.py +36 -0
- scitex/tex/__init__.py +10 -0
- scitex/tex/_preview.py +103 -0
- scitex/tex/_to_vec.py +116 -0
- scitex/torch/__init__.py +18 -0
- scitex/torch/_apply_to.py +34 -0
- scitex/torch/_nan_funcs.py +77 -0
- scitex/types/_ArrayLike.py +44 -0
- scitex/types/_ColorLike.py +21 -0
- scitex/types/__init__.py +14 -0
- scitex/types/_is_listed_X.py +70 -0
- scitex/utils/__init__.py +22 -0
- scitex/utils/_compress_hdf5.py +116 -0
- scitex/utils/_email.py +120 -0
- scitex/utils/_grid.py +148 -0
- scitex/utils/_notify.py +247 -0
- scitex/utils/_search.py +121 -0
- scitex/web/__init__.py +38 -0
- scitex/web/_search_pubmed.py +438 -0
- scitex/web/_summarize_url.py +158 -0
- scitex-2.0.0.dist-info/METADATA +307 -0
- scitex-2.0.0.dist-info/RECORD +572 -0
- scitex-2.0.0.dist-info/WHEEL +6 -0
- scitex-2.0.0.dist-info/licenses/LICENSE +7 -0
- scitex-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Time-stamp: "2024-12-06 10:15:00"
|
|
4
|
+
# Author: Claude
|
|
5
|
+
# Filename: _local_search.py
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
Local search engine for PDF papers and documents.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Optional, Dict, Any, Tuple
|
|
15
|
+
import logging
|
|
16
|
+
import hashlib
|
|
17
|
+
from datetime import datetime
|
|
18
|
+
import json
|
|
19
|
+
|
|
20
|
+
from ._paper import Paper
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LocalSearchEngine:
|
|
27
|
+
"""Search engine for local PDF papers and documents."""
|
|
28
|
+
|
|
29
|
+
def __init__(
|
|
30
|
+
self,
|
|
31
|
+
index_path: Optional[Path] = None,
|
|
32
|
+
cache_metadata: bool = True,
|
|
33
|
+
):
|
|
34
|
+
"""Initialize local search engine.
|
|
35
|
+
|
|
36
|
+
Parameters
|
|
37
|
+
----------
|
|
38
|
+
index_path : Path, optional
|
|
39
|
+
Path to save/load the metadata index
|
|
40
|
+
cache_metadata : bool
|
|
41
|
+
Whether to cache extracted metadata
|
|
42
|
+
"""
|
|
43
|
+
self.index_path = Path(index_path) if index_path else None
|
|
44
|
+
self.cache_metadata = cache_metadata
|
|
45
|
+
|
|
46
|
+
# Metadata cache: file_path -> metadata
|
|
47
|
+
self.metadata_cache: Dict[str, Dict[str, Any]] = {}
|
|
48
|
+
|
|
49
|
+
# Load existing cache if available
|
|
50
|
+
if self.index_path and self.index_path.exists():
|
|
51
|
+
self.load_cache()
|
|
52
|
+
|
|
53
|
+
# PDF reader (lazy loaded)
|
|
54
|
+
self._pdf_reader = None
|
|
55
|
+
|
|
56
|
+
def _get_pdf_reader(self):
|
|
57
|
+
"""Lazy load PDF reader."""
|
|
58
|
+
if self._pdf_reader is None:
|
|
59
|
+
try:
|
|
60
|
+
import fitz # PyMuPDF
|
|
61
|
+
self._pdf_reader = "pymupdf"
|
|
62
|
+
except ImportError:
|
|
63
|
+
try:
|
|
64
|
+
import PyPDF2
|
|
65
|
+
self._pdf_reader = "pypdf2"
|
|
66
|
+
except ImportError:
|
|
67
|
+
logger.warning("No PDF reader available (install pymupdf or PyPDF2)")
|
|
68
|
+
self._pdf_reader = "none"
|
|
69
|
+
|
|
70
|
+
return self._pdf_reader
|
|
71
|
+
|
|
72
|
+
def search(
|
|
73
|
+
self,
|
|
74
|
+
query: str,
|
|
75
|
+
paths: List[Path],
|
|
76
|
+
recursive: bool = True,
|
|
77
|
+
file_pattern: str = "*.pdf",
|
|
78
|
+
max_results: Optional[int] = None,
|
|
79
|
+
) -> List[Tuple[Paper, float]]:
|
|
80
|
+
"""Search for papers in local directories.
|
|
81
|
+
|
|
82
|
+
Parameters
|
|
83
|
+
----------
|
|
84
|
+
query : str
|
|
85
|
+
Search query
|
|
86
|
+
paths : List[Path]
|
|
87
|
+
Directories to search
|
|
88
|
+
recursive : bool
|
|
89
|
+
Whether to search recursively
|
|
90
|
+
file_pattern : str
|
|
91
|
+
File pattern to match (e.g., "*.pdf")
|
|
92
|
+
max_results : int, optional
|
|
93
|
+
Maximum number of results
|
|
94
|
+
|
|
95
|
+
Returns
|
|
96
|
+
-------
|
|
97
|
+
List[Tuple[Paper, float]]
|
|
98
|
+
List of (paper, relevance_score) tuples
|
|
99
|
+
"""
|
|
100
|
+
results = []
|
|
101
|
+
query_lower = query.lower()
|
|
102
|
+
query_terms = set(query_lower.split())
|
|
103
|
+
|
|
104
|
+
# Find all matching files
|
|
105
|
+
pdf_files = []
|
|
106
|
+
for path in paths:
|
|
107
|
+
path = Path(path)
|
|
108
|
+
if not path.exists():
|
|
109
|
+
logger.warning(f"Path does not exist: {path}")
|
|
110
|
+
continue
|
|
111
|
+
|
|
112
|
+
if path.is_file():
|
|
113
|
+
pdf_files.append(path)
|
|
114
|
+
else:
|
|
115
|
+
if recursive:
|
|
116
|
+
pdf_files.extend(path.rglob(file_pattern))
|
|
117
|
+
else:
|
|
118
|
+
pdf_files.extend(path.glob(file_pattern))
|
|
119
|
+
|
|
120
|
+
# Search through files
|
|
121
|
+
for pdf_path in pdf_files:
|
|
122
|
+
try:
|
|
123
|
+
# Get metadata (from cache or extract)
|
|
124
|
+
metadata = self._get_pdf_metadata(pdf_path)
|
|
125
|
+
|
|
126
|
+
if not metadata:
|
|
127
|
+
continue
|
|
128
|
+
|
|
129
|
+
# Calculate relevance score
|
|
130
|
+
score = self._calculate_relevance(query_lower, query_terms, metadata)
|
|
131
|
+
|
|
132
|
+
if score > 0:
|
|
133
|
+
# Create Paper object
|
|
134
|
+
paper = self._create_paper_from_metadata(pdf_path, metadata)
|
|
135
|
+
results.append((paper, score))
|
|
136
|
+
|
|
137
|
+
if max_results and len(results) >= max_results:
|
|
138
|
+
break
|
|
139
|
+
|
|
140
|
+
except Exception as e:
|
|
141
|
+
logger.error(f"Error processing {pdf_path}: {e}")
|
|
142
|
+
continue
|
|
143
|
+
|
|
144
|
+
# Sort by relevance score
|
|
145
|
+
results.sort(key=lambda x: x[1], reverse=True)
|
|
146
|
+
|
|
147
|
+
if max_results:
|
|
148
|
+
results = results[:max_results]
|
|
149
|
+
|
|
150
|
+
return results
|
|
151
|
+
|
|
152
|
+
def _get_pdf_metadata(self, pdf_path: Path) -> Optional[Dict[str, Any]]:
|
|
153
|
+
"""Extract or retrieve metadata from PDF."""
|
|
154
|
+
# Check cache first
|
|
155
|
+
cache_key = str(pdf_path.absolute())
|
|
156
|
+
|
|
157
|
+
if self.cache_metadata and cache_key in self.metadata_cache:
|
|
158
|
+
cached = self.metadata_cache[cache_key]
|
|
159
|
+
# Check if file has been modified
|
|
160
|
+
if cached.get("mtime") == pdf_path.stat().st_mtime:
|
|
161
|
+
return cached
|
|
162
|
+
|
|
163
|
+
# Extract metadata
|
|
164
|
+
metadata = self._extract_pdf_metadata(pdf_path)
|
|
165
|
+
|
|
166
|
+
if metadata and self.cache_metadata:
|
|
167
|
+
# Add file modification time
|
|
168
|
+
metadata["mtime"] = pdf_path.stat().st_mtime
|
|
169
|
+
self.metadata_cache[cache_key] = metadata
|
|
170
|
+
# Save cache periodically
|
|
171
|
+
if len(self.metadata_cache) % 10 == 0:
|
|
172
|
+
self.save_cache()
|
|
173
|
+
|
|
174
|
+
return metadata
|
|
175
|
+
|
|
176
|
+
def _extract_pdf_metadata(self, pdf_path: Path) -> Optional[Dict[str, Any]]:
|
|
177
|
+
"""Extract metadata from PDF file."""
|
|
178
|
+
reader_type = self._get_pdf_reader()
|
|
179
|
+
|
|
180
|
+
metadata = {
|
|
181
|
+
"filename": pdf_path.name,
|
|
182
|
+
"path": str(pdf_path),
|
|
183
|
+
"size": pdf_path.stat().st_size,
|
|
184
|
+
"modified": datetime.fromtimestamp(pdf_path.stat().st_mtime).isoformat(),
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
if reader_type == "none":
|
|
188
|
+
# Fallback: use filename
|
|
189
|
+
metadata["title"] = pdf_path.stem.replace("_", " ").replace("-", " ")
|
|
190
|
+
metadata["content"] = metadata["title"]
|
|
191
|
+
return metadata
|
|
192
|
+
|
|
193
|
+
try:
|
|
194
|
+
if reader_type == "pymupdf":
|
|
195
|
+
import fitz
|
|
196
|
+
|
|
197
|
+
with fitz.open(pdf_path) as doc:
|
|
198
|
+
# Get document metadata
|
|
199
|
+
info = doc.metadata
|
|
200
|
+
metadata["title"] = info.get("title", "") or pdf_path.stem
|
|
201
|
+
metadata["author"] = info.get("author", "")
|
|
202
|
+
metadata["subject"] = info.get("subject", "")
|
|
203
|
+
metadata["keywords"] = info.get("keywords", "")
|
|
204
|
+
|
|
205
|
+
# Extract text from first few pages
|
|
206
|
+
text_parts = []
|
|
207
|
+
for i in range(min(3, len(doc))): # First 3 pages
|
|
208
|
+
page = doc[i]
|
|
209
|
+
text = page.get_text()
|
|
210
|
+
if text:
|
|
211
|
+
text_parts.append(text)
|
|
212
|
+
|
|
213
|
+
metadata["content"] = " ".join(text_parts)[:5000] # Limit content
|
|
214
|
+
|
|
215
|
+
# Try to extract abstract
|
|
216
|
+
full_text = " ".join(text_parts)
|
|
217
|
+
abstract = self._extract_abstract(full_text)
|
|
218
|
+
if abstract:
|
|
219
|
+
metadata["abstract"] = abstract
|
|
220
|
+
|
|
221
|
+
elif reader_type == "pypdf2":
|
|
222
|
+
import PyPDF2
|
|
223
|
+
|
|
224
|
+
with open(pdf_path, "rb") as f:
|
|
225
|
+
reader = PyPDF2.PdfReader(f)
|
|
226
|
+
|
|
227
|
+
# Get metadata
|
|
228
|
+
info = reader.metadata
|
|
229
|
+
if info:
|
|
230
|
+
metadata["title"] = info.get("/Title", "") or pdf_path.stem
|
|
231
|
+
metadata["author"] = info.get("/Author", "")
|
|
232
|
+
metadata["subject"] = info.get("/Subject", "")
|
|
233
|
+
metadata["keywords"] = info.get("/Keywords", "")
|
|
234
|
+
|
|
235
|
+
# Extract text from first few pages
|
|
236
|
+
text_parts = []
|
|
237
|
+
for i in range(min(3, len(reader.pages))):
|
|
238
|
+
page = reader.pages[i]
|
|
239
|
+
text = page.extract_text()
|
|
240
|
+
if text:
|
|
241
|
+
text_parts.append(text)
|
|
242
|
+
|
|
243
|
+
metadata["content"] = " ".join(text_parts)[:5000]
|
|
244
|
+
|
|
245
|
+
# Try to extract abstract
|
|
246
|
+
full_text = " ".join(text_parts)
|
|
247
|
+
abstract = self._extract_abstract(full_text)
|
|
248
|
+
if abstract:
|
|
249
|
+
metadata["abstract"] = abstract
|
|
250
|
+
|
|
251
|
+
except Exception as e:
|
|
252
|
+
logger.error(f"Error extracting metadata from {pdf_path}: {e}")
|
|
253
|
+
# Fallback to filename
|
|
254
|
+
metadata["title"] = pdf_path.stem.replace("_", " ").replace("-", " ")
|
|
255
|
+
metadata["content"] = metadata["title"]
|
|
256
|
+
|
|
257
|
+
return metadata
|
|
258
|
+
|
|
259
|
+
def _extract_abstract(self, text: str) -> Optional[str]:
|
|
260
|
+
"""Try to extract abstract from text."""
|
|
261
|
+
# Look for abstract section
|
|
262
|
+
patterns = [
|
|
263
|
+
r"abstract[:\s]*(.+?)(?=introduction|keywords|1\.|1\s+introduction)",
|
|
264
|
+
r"summary[:\s]*(.+?)(?=introduction|keywords|1\.|1\s+introduction)",
|
|
265
|
+
]
|
|
266
|
+
|
|
267
|
+
for pattern in patterns:
|
|
268
|
+
match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
|
|
269
|
+
if match:
|
|
270
|
+
abstract = match.group(1).strip()
|
|
271
|
+
# Clean up
|
|
272
|
+
abstract = re.sub(r"\s+", " ", abstract)
|
|
273
|
+
if len(abstract) > 100: # Reasonable abstract length
|
|
274
|
+
return abstract[:1000] # Limit length
|
|
275
|
+
|
|
276
|
+
return None
|
|
277
|
+
|
|
278
|
+
def _calculate_relevance(
|
|
279
|
+
self,
|
|
280
|
+
query_lower: str,
|
|
281
|
+
query_terms: set,
|
|
282
|
+
metadata: Dict[str, Any]
|
|
283
|
+
) -> float:
|
|
284
|
+
"""Calculate relevance score for a document."""
|
|
285
|
+
score = 0.0
|
|
286
|
+
|
|
287
|
+
# Search in different fields with different weights
|
|
288
|
+
fields = [
|
|
289
|
+
("title", 5.0),
|
|
290
|
+
("abstract", 3.0),
|
|
291
|
+
("keywords", 3.0),
|
|
292
|
+
("author", 2.0),
|
|
293
|
+
("subject", 2.0),
|
|
294
|
+
("content", 1.0),
|
|
295
|
+
("filename", 1.0),
|
|
296
|
+
]
|
|
297
|
+
|
|
298
|
+
for field, weight in fields:
|
|
299
|
+
field_value = metadata.get(field, "").lower()
|
|
300
|
+
if not field_value:
|
|
301
|
+
continue
|
|
302
|
+
|
|
303
|
+
# Exact match
|
|
304
|
+
if query_lower in field_value:
|
|
305
|
+
score += weight * 2
|
|
306
|
+
|
|
307
|
+
# Term matches
|
|
308
|
+
field_terms = set(field_value.split())
|
|
309
|
+
matching_terms = query_terms & field_terms
|
|
310
|
+
if matching_terms:
|
|
311
|
+
score += weight * len(matching_terms) / len(query_terms)
|
|
312
|
+
|
|
313
|
+
return score
|
|
314
|
+
|
|
315
|
+
def _create_paper_from_metadata(
    self,
    pdf_path: Path,
    metadata: Dict[str, Any]
) -> Paper:
    """Create a ``Paper`` object from extracted PDF metadata.

    Authors are split on ";" when present, otherwise on "," when the string
    contains more than one comma, otherwise kept as a single name. Keywords
    are split on ";" or ",". Empty names produced by stray separators are
    dropped, and missing or ``None`` metadata values are treated as empty.

    Parameters
    ----------
    pdf_path : Path
        Location of the PDF; its stem is the title fallback.
    metadata : Dict[str, Any]
        Metadata dictionary for the PDF.

    Returns
    -------
    Paper
        Paper with ``source="local"``. The abstract falls back to the first
        500 characters of ``content`` when no abstract is available.
    """
    # Parse authors
    author_str = metadata.get("author") or ""
    if ";" in author_str:
        author_parts = author_str.split(";")
    elif author_str.count(",") > 1:
        author_parts = author_str.split(",")
    else:
        author_parts = [author_str]
    authors = [name.strip() for name in author_parts if name.strip()]

    # Parse keywords
    keyword_str = metadata.get("keywords") or ""
    keywords = [k.strip() for k in re.split(r"[;,]", keyword_str) if k.strip()]

    # An empty abstract falls back to a content snippet.
    abstract = metadata.get("abstract") or (metadata.get("content") or "")[:500]

    return Paper(
        title=metadata.get("title") or pdf_path.stem,
        authors=authors,
        abstract=abstract,
        source="local",
        keywords=keywords,
        pdf_path=pdf_path,
        metadata={
            "filename": metadata.get("filename"),
            "size": metadata.get("size"),
            "modified": metadata.get("modified"),
            "subject": metadata.get("subject"),
        },
    )
|
|
356
|
+
|
|
357
|
+
def build_index(self, paths: List[Path], recursive: bool = True) -> int:
    """Build metadata index for given paths.

    PDF files are recognised by a case-insensitive ``.pdf`` suffix, both for
    paths that point at a single file and for files found while scanning a
    directory. Paths that do not exist, and files that are not PDFs, are
    skipped. Errors raised for an individual file are logged and do not stop
    the run.

    Parameters
    ----------
    paths : List[Path]
        Directories (or individual PDF files) to index
    recursive : bool
        Whether to search recursively

    Returns
    -------
    int
        Number of files indexed
    """
    count = 0

    for path in paths:
        path = Path(path)
        if not path.exists():
            continue

        if path.is_file():
            pdf_files = [path] if path.suffix.lower() == ".pdf" else []
        else:
            # Filter on the lower-cased suffix so "*.PDF" is found on
            # case-sensitive file systems as well.
            candidates = path.rglob("*") if recursive else path.glob("*")
            pdf_files = [
                candidate
                for candidate in candidates
                if candidate.suffix.lower() == ".pdf" and candidate.is_file()
            ]

        for pdf_path in pdf_files:
            try:
                metadata = self._get_pdf_metadata(pdf_path)
            except Exception as e:
                # Best effort: one unreadable file must not abort indexing.
                logger.error(f"Error indexing {pdf_path}: {e}")
                continue

            if metadata:
                count += 1
                if count % 10 == 0:
                    logger.info(f"Indexed {count} files...")

    # Save cache
    if self.cache_metadata:
        self.save_cache()

    logger.info(f"Indexed {count} PDF files")
    return count
|
|
403
|
+
|
|
404
|
+
def save_cache(self) -> None:
    """Save metadata cache to disk.

    Does nothing when no index path is configured. The JSON is first written
    to a sibling ``<name>.tmp`` file and then moved over the index file, so
    an interrupted save does not leave a truncated index behind. If writing
    fails, the temporary file is removed and the exception propagates.
    """
    if not self.index_path:
        return

    self.index_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = self.index_path.with_name(self.index_path.name + ".tmp")
    try:
        with open(tmp_path, "w") as f:
            json.dump(self.metadata_cache, f, indent=2)
        # Replace the previous index in a single rename step.
        tmp_path.replace(self.index_path)
    except BaseException:
        # Do not leave a partial temp file next to the index.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

    logger.debug(f"Saved cache with {len(self.metadata_cache)} entries")
|
|
415
|
+
|
|
416
|
+
def load_cache(self) -> None:
    """Load metadata cache from disk.

    Does nothing when no index path is configured or the file is absent.
    Failures while reading or parsing are logged and leave the current cache
    untouched. A file whose top-level JSON value is not an object is also
    rejected, because the cache is used as a dictionary.
    """
    if not self.index_path or not self.index_path.exists():
        return

    try:
        with open(self.index_path, "r") as f:
            loaded = json.load(f)
    except Exception as e:
        # Best effort: an unreadable index only costs a re-extraction.
        logger.error(f"Error loading cache: {e}")
        return

    if not isinstance(loaded, dict):
        logger.error(
            f"Error loading cache: expected a JSON object, got {type(loaded).__name__}"
        )
        return

    self.metadata_cache = loaded
    logger.debug(f"Loaded cache with {len(self.metadata_cache)} entries")
|
|
427
|
+
|
|
428
|
+
def clear_cache(self) -> None:
    """Empty the in-memory metadata cache and delete the index file.

    The file is removed only when an index path is configured and the file
    currently exists.
    """
    self.metadata_cache.clear()

    if not self.index_path:
        return
    if self.index_path.exists():
        self.index_path.unlink()
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
# Example usage
|
|
436
|
+
if __name__ == "__main__":
    # Demo: search the current directory, print the hits, then build the index.
    engine = LocalSearchEngine()

    hits = engine.search("machine learning", [Path(".")], max_results=5)
    print(f"Found {len(hits)} papers:")

    for hit, relevance in hits:
        print(f"\nScore: {relevance:.2f}")
        print(f"Title: {hit.title}")
        print(f"Path: {hit.pdf_path}")
        if hit.authors:
            print(f"Authors: {', '.join(hit.authors)}")

    # Indexing up front makes later searches faster.
    print("\nBuilding index...")
    indexed = engine.build_index([Path(".")])
    print(f"Indexed {indexed} files")
|
scitex/scholar/_paper.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Time-stamp: "2024-12-06 10:00:00"
|
|
4
|
+
# Author: Claude
|
|
5
|
+
# Filename: _paper.py
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
Paper class for representing scientific papers.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Dict, Optional, List, Any
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
import json
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Paper:
    """Represents a scientific paper with metadata and content."""

    def __init__(
        self,
        title: str,
        authors: List[str],
        abstract: str,
        source: str,  # 'pubmed', 'arxiv', 'local', etc.
        year: Optional[int] = None,
        doi: Optional[str] = None,
        pmid: Optional[str] = None,
        arxiv_id: Optional[str] = None,
        journal: Optional[str] = None,
        keywords: Optional[List[str]] = None,
        pdf_path: Optional[Path] = None,
        embedding: Optional[Any] = None,  # numpy array or tensor
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize a Paper object.

        Parameters
        ----------
        title : str
            Paper title
        authors : List[str]
            List of author names
        abstract : str
            Paper abstract
        source : str
            Source of the paper (pubmed, arxiv, local, etc.)
        year : int, optional
            Publication year
        doi : str, optional
            Digital Object Identifier
        pmid : str, optional
            PubMed ID
        arxiv_id : str, optional
            arXiv identifier
        journal : str, optional
            Journal name
        keywords : List[str], optional
            Keywords/tags
        pdf_path : Path, optional
            Path to local PDF file
        embedding : Any, optional
            Vector embedding of the paper
        metadata : Dict[str, Any], optional
            Additional metadata
        """
        self.title = title
        self.authors = authors
        self.abstract = abstract
        self.source = source
        self.year = year
        self.doi = doi
        self.pmid = pmid
        self.arxiv_id = arxiv_id
        self.journal = journal
        self.keywords = keywords or []
        self.pdf_path = Path(pdf_path) if pdf_path else None
        self.embedding = embedding
        self.metadata = metadata or {}
        # Time at which this object was created (or restored by from_dict).
        self.retrieved_at = datetime.now()

    def __repr__(self) -> str:
        """String representation of the paper.

        The title is cut to 50 characters; "..." is appended only when
        something was actually cut off.
        """
        if len(self.title) > 50:
            shown_title = f"{self.title[:50]}..."
        else:
            shown_title = self.title
        return f"Paper(title='{shown_title}', authors={len(self.authors)}, source='{self.source}')"

    def __str__(self) -> str:
        """Human-readable, multi-line summary of the paper."""
        authors_str = ", ".join(self.authors[:3])
        if len(self.authors) > 3:
            authors_str += f" et al. ({len(self.authors)} authors)"

        parts = [
            f"Title: {self.title}",
            f"Authors: {authors_str}",
            f"Year: {self.year or 'Unknown'}",
            f"Source: {self.source}",
        ]

        # Optional identifiers are listed only when present.
        if self.journal:
            parts.append(f"Journal: {self.journal}")
        if self.doi:
            parts.append(f"DOI: {self.doi}")
        if self.pmid:
            parts.append(f"PMID: {self.pmid}")
        if self.arxiv_id:
            parts.append(f"arXiv: {self.arxiv_id}")

        return "\n".join(parts)

    def to_dict(self) -> Dict[str, Any]:
        """Convert paper to dictionary representation.

        The embedding itself is not included; ``has_embedding`` only records
        whether one is set. ``pdf_path`` and ``retrieved_at`` are stored as
        strings.
        """
        return {
            "title": self.title,
            "authors": self.authors,
            "abstract": self.abstract,
            "source": self.source,
            "year": self.year,
            "doi": self.doi,
            "pmid": self.pmid,
            "arxiv_id": self.arxiv_id,
            "journal": self.journal,
            "keywords": self.keywords,
            "pdf_path": str(self.pdf_path) if self.pdf_path else None,
            "has_embedding": self.embedding is not None,
            "metadata": self.metadata,
            "retrieved_at": self.retrieved_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Paper":
        """Create Paper from dictionary representation.

        ``has_embedding`` is ignored. A ``retrieved_at`` value (ISO string or
        ``datetime``) is restored onto the new object so that
        ``to_dict``/``from_dict`` round-trips; when it is absent or cannot
        be parsed the construction time is kept.
        """
        data = data.copy()  # never mutate the caller's dictionary
        data.pop("has_embedding", None)
        stamp = data.pop("retrieved_at", None)

        # Convert pdf_path back to Path if present
        if data.get("pdf_path"):
            data["pdf_path"] = Path(data["pdf_path"])

        paper = cls(**data)

        if isinstance(stamp, datetime):
            paper.retrieved_at = stamp
        elif isinstance(stamp, str):
            try:
                paper.retrieved_at = datetime.fromisoformat(stamp)
            except ValueError:
                # Unparseable timestamp: keep the construction time.
                pass

        return paper

    def to_bibtex(self) -> str:
        """Generate BibTeX entry for the paper.

        The citation key is the last word of the first author, the year and
        the lower-cased first word of the title, with the fallbacks
        "Unknown", "0000" and "paper" when a part is unavailable (including
        an empty first author or a whitespace-only title). Papers with an
        arXiv identifier use ``@misc``; all others use ``@article``.
        """
        # Generate citation key
        author_words = self.authors[0].split() if self.authors else []
        first_author = author_words[-1] if author_words else "Unknown"
        year = self.year or "0000"
        title_words = self.title.split() if self.title else []
        title_word = title_words[0].lower() if title_words else "paper"
        cite_key = f"{first_author}{year}{title_word}"

        # Determine entry type
        entry_type = "@misc" if self.arxiv_id else "@article"

        # Build BibTeX entry
        lines = [f"{entry_type}{{{cite_key},"]
        lines.append(f' title = "{{{self.title}}}",')

        if self.authors:
            authors_str = " and ".join(self.authors)
            lines.append(f' author = "{{{authors_str}}}",')

        if self.year:
            lines.append(f' year = "{{{self.year}}}",')

        if self.journal:
            lines.append(f' journal = "{{{self.journal}}}",')

        if self.doi:
            lines.append(f' doi = "{{{self.doi}}}",')

        if self.arxiv_id:
            lines.append(f' eprint = "{{{self.arxiv_id}}}",')
            lines.append(' archivePrefix = "{arXiv}",')

        lines.append("}")

        return "\n".join(lines)

    def get_identifier(self) -> str:
        """Get a unique identifier for the paper.

        Preference order: DOI, PubMed ID, arXiv ID. Without any of these, a
        12-character MD5 digest of title and authors is used.
        """
        if self.doi:
            return f"doi:{self.doi}"
        elif self.pmid:
            return f"pmid:{self.pmid}"
        elif self.arxiv_id:
            return f"arxiv:{self.arxiv_id}"
        else:
            # Create a hash from title and authors
            import hashlib
            content = f"{self.title}_{';'.join(self.authors)}"
            return f"hash:{hashlib.md5(content.encode()).hexdigest()[:12]}"

    def has_pdf(self) -> bool:
        """Check if paper has an associated PDF file that exists on disk."""
        return self.pdf_path is not None and self.pdf_path.exists()

    def similarity_score(self, other: "Paper") -> float:
        """Calculate similarity score with another paper (0-1).

        The score is a weighted sum of three overlap ratios (intersection
        size divided by union size): lower-cased title words (0.4), author
        names (0.3) and lower-cased abstract words (0.3). A component is
        skipped when either paper has nothing to compare for it.
        """
        score = 0.0
        weights = {"title": 0.4, "authors": 0.3, "abstract": 0.3}

        # Title similarity (simple word overlap)
        title_words1 = set(self.title.lower().split())
        title_words2 = set(other.title.lower().split())
        if title_words1 and title_words2:
            title_overlap = len(title_words1 & title_words2) / len(title_words1 | title_words2)
            score += weights["title"] * title_overlap

        # Author similarity
        authors1 = set(self.authors)
        authors2 = set(other.authors)
        if authors1 and authors2:
            author_overlap = len(authors1 & authors2) / len(authors1 | authors2)
            score += weights["authors"] * author_overlap

        # Abstract similarity (simple word overlap)
        abstract_words1 = set(self.abstract.lower().split())
        abstract_words2 = set(other.abstract.lower().split())
        if abstract_words1 and abstract_words2:
            abstract_overlap = len(abstract_words1 & abstract_words2) / len(abstract_words1 | abstract_words2)
            score += weights["abstract"] * abstract_overlap

        return score
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# Example usage
|
|
230
|
+
if __name__ == "__main__":
    # Demo: build a sample paper and print its summary, BibTeX and identifier.
    demo_fields = {
        "title": "Deep Learning for Scientific Discovery",
        "authors": ["John Doe", "Jane Smith", "Bob Johnson"],
        "abstract": "This paper explores the application of deep learning...",
        "source": "arxiv",
        "year": 2024,
        "arxiv_id": "2401.12345",
        "keywords": ["deep learning", "scientific computing", "AI"],
    }
    paper = Paper(**demo_fields)

    print(paper)
    print("\nBibTeX:")
    print(paper.to_bibtex())
    print("\nIdentifier:", paper.get_identifier())
|