scitex 2.10.3__py3-none-any.whl → 2.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +1 -4
- scitex/__main__.py +24 -5
- scitex/__version__.py +1 -1
- scitex/_install_guide.py +14 -2
- scitex/_optional_deps.py +33 -0
- scitex/ai/classification/reporters/_ClassificationReporter.py +1 -1
- scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +2 -2
- scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +2 -2
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +2 -2
- scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +2 -2
- scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +2 -2
- scitex/ai/classification/timeseries/_normalize_timestamp.py +1 -1
- scitex/ai/metrics/_calc_seizure_prediction_metrics.py +1 -1
- scitex/ai/plt/_plot_feature_importance.py +1 -1
- scitex/ai/plt/_plot_learning_curve.py +1 -1
- scitex/ai/plt/_plot_optuna_study.py +1 -1
- scitex/ai/plt/_plot_pre_rec_curve.py +1 -1
- scitex/ai/plt/_plot_roc_curve.py +1 -1
- scitex/ai/plt/_stx_conf_mat.py +1 -1
- scitex/ai/training/_LearningCurveLogger.py +1 -1
- scitex/audio/mcp_server.py +38 -8
- scitex/bridge/_figrecipe.py +1 -1
- scitex/bridge/_helpers.py +1 -1
- scitex/bridge/_plt_vis.py +1 -1
- scitex/bridge/_stats_plt.py +1 -1
- scitex/bridge/_stats_vis.py +2 -2
- scitex/browser/automation/CookieHandler.py +1 -1
- scitex/browser/core/BrowserMixin.py +1 -1
- scitex/browser/core/ChromeProfileManager.py +1 -1
- scitex/browser/debugging/_browser_logger.py +1 -1
- scitex/browser/debugging/_highlight_element.py +1 -1
- scitex/browser/debugging/_show_grid.py +1 -1
- scitex/browser/interaction/click_center.py +1 -1
- scitex/browser/interaction/click_with_fallbacks.py +1 -1
- scitex/browser/interaction/close_popups.py +1 -1
- scitex/browser/interaction/fill_with_fallbacks.py +1 -1
- scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +1 -1
- scitex/browser/pdf/detect_chrome_pdf_viewer.py +1 -1
- scitex/browser/stealth/HumanBehavior.py +1 -1
- scitex/browser/stealth/StealthManager.py +1 -1
- scitex/{fig → canvas}/__init__.py +84 -96
- scitex/canvas/_mcp_handlers.py +372 -0
- scitex/canvas/_mcp_tool_schemas.py +219 -0
- scitex/{fig → canvas}/backend/_parser.py +1 -1
- scitex/{fig → canvas}/canvas.py +13 -14
- scitex/{fts/_fig/_editor → canvas/editor}/_defaults.py +2 -2
- scitex/{fig → canvas}/editor/edit/__init__.py +11 -14
- scitex/{fig → canvas}/editor/edit/bundle_resolver.py +56 -48
- scitex/{fig → canvas}/editor/edit/editor_launcher.py +79 -26
- scitex/{fts/_fig/_editor/_cui/_panel_loader.py → canvas/editor/edit/panel_loader.py} +8 -8
- scitex/{fts/_fig/_editor/_gui/_flask_editor → canvas/editor/flask_editor}/_bbox.py +2 -1
- scitex/{fts/_fig/_editor/_gui/_flask_editor → canvas/editor/flask_editor}/_core.py +84 -84
- scitex/{fts/_fig/_editor/_gui/_flask_editor → canvas/editor/flask_editor}/_renderer.py +7 -6
- scitex/{fts/_fig/_editor/_gui/_flask_editor → canvas/editor/flask_editor}/static/css/features/canvas.css +2 -2
- scitex/{fig → canvas}/editor/flask_editor/static/css/features/panel-grid.css +1 -1
- scitex/{fig → canvas}/editor/flask_editor/static/js/core/api.js +3 -4
- scitex/{fig → canvas}/editor/flask_editor/static/js/editor/preview.js +5 -5
- scitex/{fig → canvas}/editor/flask_editor/templates/_html.py +3 -3
- scitex/{fig → canvas}/editor/flask_editor/templates/_scripts.py +10 -10
- scitex/{fig → canvas}/editor/flask_editor/templates/_styles.py +3 -3
- scitex/{fig → canvas}/io/__init__.py +32 -38
- scitex/{fig → canvas}/io/_bundle.py +217 -154
- scitex/{fig → canvas}/io/_canvas.py +1 -1
- scitex/{fig → canvas}/io/_data.py +1 -1
- scitex/{fig → canvas}/io/_export.py +1 -1
- scitex/{fig → canvas}/io/_load.py +1 -1
- scitex/{fig → canvas}/io/_panel.py +1 -1
- scitex/{fig → canvas}/io/_save.py +1 -1
- scitex/canvas/mcp_server.py +151 -0
- scitex/{fig → canvas}/model/__init__.py +1 -1
- scitex/{fig → canvas}/model/_annotations.py +1 -1
- scitex/{fig → canvas}/model/_axes.py +1 -1
- scitex/{fig → canvas}/model/_figure.py +1 -1
- scitex/{fig → canvas}/model/_guides.py +1 -1
- scitex/{fig → canvas}/model/_plot.py +1 -1
- scitex/{fig → canvas}/model/_styles.py +1 -1
- scitex/{fig → canvas}/utils/__init__.py +1 -1
- scitex/capture/mcp_server.py +41 -12
- scitex/cli/audio.py +233 -0
- scitex/cli/capture.py +307 -0
- scitex/cli/convert.py +10 -6
- scitex/cli/main.py +27 -4
- scitex/cli/repro.py +233 -0
- scitex/cli/resource.py +240 -0
- scitex/cli/stats.py +325 -0
- scitex/cli/template.py +236 -0
- scitex/cli/tex.py +286 -0
- scitex/cli/web.py +11 -12
- scitex/dev/__init__.py +3 -0
- scitex/dev/_pyproject.py +405 -0
- scitex/dev/plt/__init__.py +2 -2
- scitex/dev/plt/mpl/get_dir_ax.py +1 -1
- scitex/dev/plt/mpl/get_signatures.py +1 -1
- scitex/dev/plt/mpl/get_signatures_details.py +1 -1
- scitex/diagram/README.md +7 -7
- scitex/diagram/_mcp_handlers.py +400 -0
- scitex/diagram/_mcp_tool_schemas.py +157 -0
- scitex/diagram/mcp_server.py +151 -0
- scitex/dsp/_demo_sig.py +51 -5
- scitex/dsp/_mne.py +13 -2
- scitex/dsp/_modulation_index.py +15 -3
- scitex/dsp/_pac.py +23 -5
- scitex/dsp/_psd.py +16 -4
- scitex/dsp/_resample.py +24 -4
- scitex/dsp/_transform.py +16 -3
- scitex/dsp/add_noise.py +15 -1
- scitex/dsp/norm.py +17 -2
- scitex/dsp/reference.py +17 -1
- scitex/dsp/utils/_differential_bandpass_filters.py +20 -2
- scitex/dsp/utils/_zero_pad.py +18 -4
- scitex/dt/_normalize_timestamp.py +1 -1
- scitex/git/_session.py +1 -1
- scitex/io/__init__.py +7 -19
- scitex/io/_load.py +15 -19
- scitex/io/_load_modules/_canvas.py +2 -2
- scitex/io/_load_modules/_con.py +17 -6
- scitex/io/_load_modules/_eeg.py +28 -13
- scitex/io/_load_modules/_optuna.py +21 -63
- scitex/io/_load_modules/_torch.py +11 -3
- scitex/io/_save.py +11 -16
- scitex/io/_save_modules/__init__.py +6 -10
- scitex/io/_save_modules/_canvas.py +3 -3
- scitex/io/_save_modules/_optuna_study_as_csv_and_pngs.py +13 -2
- scitex/io/_save_modules/_plot_bundle.py +112 -0
- scitex/io/_save_modules/{_pltz_stx.py → _plot_scitex.py} +7 -7
- scitex/io/_save_modules/_stx_bundle.py +16 -16
- scitex/io/_save_modules/_torch.py +11 -3
- scitex/io/bundle/README.md +89 -80
- scitex/{fts/_bundle/_FTS.py → io/bundle/_Bundle.py} +197 -95
- scitex/io/bundle/__init__.py +67 -35
- scitex/{fts/_bundle → io/bundle}/_children.py +32 -40
- scitex/io/bundle/_core.py +184 -97
- scitex/{fts/_bundle/_dataclasses/_Node.py → io/bundle/_dataclasses/_Spec.py} +29 -23
- scitex/{fts/_bundle/_dataclasses/_NodeRefs.py → io/bundle/_dataclasses/_SpecRefs.py} +6 -6
- scitex/{fts/_bundle → io/bundle}/_dataclasses/__init__.py +4 -4
- scitex/{fts/_bundle → io/bundle}/_loader.py +19 -19
- scitex/io/bundle/_manifest.py +99 -0
- scitex/{fts/_bundle → io/bundle}/_mpl_helpers.py +119 -28
- scitex/io/bundle/_nested.py +113 -100
- scitex/{fts/_bundle → io/bundle}/_saver.py +13 -14
- scitex/{fts/_bundle → io/bundle}/_storage.py +3 -3
- scitex/io/bundle/_types.py +41 -16
- scitex/{fts/_bundle → io/bundle}/_validation.py +20 -18
- scitex/io/bundle/_zip.py +21 -31
- scitex/{fts/_kinds → io/bundle/kinds}/_plot/_backend/_parser.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_plot/_models/_Annotations.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_plot/_models/_Axes.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_plot/_models/_Figure.py +1 -1
- scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_Guides.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_plot/_models/_Plot.py +1 -1
- scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_Styles.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_plot/_utils/_plot_layout.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/__init__.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_editor/_app.py +1 -1
- scitex/{fts/_tables → io/bundle/kinds/_table}/_latex/_export.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_figure_exporter.py +1 -1
- scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_table_exporter.py +1 -1
- scitex/io/bundle/schemas/__init__.py +30 -0
- scitex/mcp_server.py +159 -0
- scitex/parallel/_run.py +5 -4
- scitex/path/_find.py +60 -83
- scitex/path/_get_module_path.py +23 -21
- scitex/path/_get_spath.py +6 -27
- scitex/path/_getsize.py +23 -9
- scitex/path/_increment_version.py +31 -38
- scitex/path/_mk_spath.py +26 -29
- scitex/path/_path.py +5 -12
- scitex/path/_split.py +27 -15
- scitex/path/_this_path.py +23 -9
- scitex/plt/_mcp_handlers.py +361 -0
- scitex/plt/_mcp_tool_schemas.py +169 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin/__init__.py +2 -1
- scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +2 -2
- scitex/plt/gallery/_generate.py +76 -50
- scitex/plt/io/__init__.py +17 -19
- scitex/plt/io/_bundle.py +99 -52
- scitex/plt/io/_layered_bundle.py +303 -168
- scitex/plt/mcp_server.py +205 -0
- scitex/plt/utils/_csv_column_naming.py +250 -118
- scitex/repro/README_RandomStateManager.md +3 -3
- scitex/repro/_RandomStateManager.py +14 -14
- scitex/repro/_gen_ID.py +1 -1
- scitex/repro/_gen_timestamp.py +1 -1
- scitex/repro/_hash_array.py +4 -4
- scitex/schema/__init__.py +69 -73
- scitex/schema/_canvas.py +1 -1
- scitex/schema/_stats.py +2 -2
- scitex/scholar/__main__.py +24 -2
- scitex/scholar/_mcp_handlers.py +685 -0
- scitex/scholar/_mcp_tool_schemas.py +339 -0
- scitex/scholar/docs/template.py +1 -1
- scitex/scholar/examples/07_storage_integration.py +1 -1
- scitex/scholar/impact_factor/jcr/ImpactFactorJCREngine.py +1 -1
- scitex/scholar/impact_factor/jcr/build_database.py +1 -1
- scitex/scholar/mcp_server.py +315 -0
- scitex/scholar/pdf_download/ScholarPDFDownloader.py +1 -1
- scitex/scholar/pipelines/ScholarPipelineBibTeX.py +1 -1
- scitex/scholar/pipelines/ScholarPipelineParallel.py +1 -1
- scitex/scholar/pipelines/ScholarPipelineSingle.py +1 -1
- scitex/scholar/storage/PaperIO.py +1 -1
- scitex/session/README.md +4 -4
- scitex/session/__init__.py +1 -1
- scitex/session/_decorator.py +9 -9
- scitex/session/_lifecycle.py +5 -5
- scitex/session/template.py +1 -1
- scitex/stats/__init__.py +30 -33
- scitex/stats/__main__.py +281 -0
- scitex/stats/_mcp_handlers.py +1191 -0
- scitex/stats/_mcp_tool_schemas.py +384 -0
- scitex/stats/_schema.py +1 -1
- scitex/stats/correct/_correct_bonferroni.py +1 -1
- scitex/stats/correct/_correct_fdr.py +1 -1
- scitex/stats/correct/_correct_fdr_.py +1 -1
- scitex/stats/correct/_correct_holm.py +1 -1
- scitex/stats/correct/_correct_sidak.py +1 -1
- scitex/stats/effect_sizes/_cliffs_delta.py +1 -1
- scitex/stats/effect_sizes/_cohens_d.py +1 -1
- scitex/stats/effect_sizes/_epsilon_squared.py +1 -1
- scitex/stats/effect_sizes/_eta_squared.py +1 -1
- scitex/stats/effect_sizes/_prob_superiority.py +1 -1
- scitex/stats/io/__init__.py +10 -11
- scitex/stats/io/_bundle.py +16 -16
- scitex/stats/mcp_server.py +405 -0
- scitex/stats/posthoc/_dunnett.py +1 -1
- scitex/stats/posthoc/_games_howell.py +1 -1
- scitex/stats/posthoc/_tukey_hsd.py +1 -1
- scitex/stats/power/_power.py +1 -1
- scitex/stats/utils/_effect_size.py +1 -1
- scitex/stats/utils/_formatters.py +1 -1
- scitex/stats/utils/_power.py +1 -1
- scitex/template/_mcp_handlers.py +259 -0
- scitex/template/_mcp_tool_schemas.py +112 -0
- scitex/template/mcp_server.py +186 -0
- scitex/utils/_verify_scitex_format.py +2 -2
- scitex/utils/template.py +1 -1
- scitex/web/__init__.py +12 -11
- scitex/web/_scraping.py +26 -265
- scitex/web/download_images.py +316 -0
- scitex/writer/Writer.py +1 -1
- scitex/writer/_clone_writer_project.py +1 -1
- scitex/writer/_validate_tree_structures.py +1 -1
- scitex/writer/dataclasses/config/_WriterConfig.py +1 -1
- scitex/writer/dataclasses/contents/_ManuscriptContents.py +1 -1
- scitex/writer/dataclasses/core/_Document.py +1 -1
- scitex/writer/dataclasses/core/_DocumentSection.py +1 -1
- scitex/writer/dataclasses/results/_CompilationResult.py +1 -1
- scitex/writer/dataclasses/results/_LaTeXIssue.py +1 -1
- scitex/writer/utils/.legacy_git_retry.py +7 -5
- scitex/writer/utils/_parse_latex_logs.py +1 -1
- scitex-2.13.0.dist-info/METADATA +1231 -0
- {scitex-2.10.3.dist-info → scitex-2.13.0.dist-info}/RECORD +376 -470
- scitex-2.13.0.dist-info/entry_points.txt +11 -0
- scitex/fig/editor/_defaults.py +0 -300
- scitex/fig/editor/edit/panel_loader.py +0 -232
- scitex/fig/editor/flask_editor/_bbox.py +0 -1299
- scitex/fig/editor/flask_editor/_core.py +0 -1429
- scitex/fig/editor/flask_editor/_renderer.py +0 -813
- scitex/fig/editor/flask_editor/static/css/features/canvas.css +0 -176
- scitex/fts/README.md +0 -262
- scitex/fts/TODO.md +0 -66
- scitex/fts/__init__.py +0 -90
- scitex/fts/_bundle/README_IN_BUNDLE.md +0 -102
- scitex/fts/_bundle/__init__.py +0 -38
- scitex/fts/_bundle/_utils/__init__.py +0 -55
- scitex/fts/_bundle/_utils/_const.py +0 -26
- scitex/fts/_bundle/_utils/_errors.py +0 -73
- scitex/fts/_bundle/_utils/_generate.py +0 -21
- scitex/fts/_bundle/_utils/_types.py +0 -76
- scitex/fts/_bundle/_zipbundle.py +0 -165
- scitex/fts/_fig/__init__.py +0 -22
- scitex/fts/_fig/_backend/_parser.py +0 -188
- scitex/fts/_fig/_editor/__init__.py +0 -14
- scitex/fts/_fig/_editor/_cui/__init__.py +0 -33
- scitex/fts/_fig/_editor/_cui/_backend_detector.py +0 -39
- scitex/fts/_fig/_editor/_cui/_bundle_resolver.py +0 -366
- scitex/fts/_fig/_editor/_cui/_editor_launcher.py +0 -175
- scitex/fts/_fig/_editor/_cui/_manual_handler.py +0 -52
- scitex/fts/_fig/_editor/_cui/_path_resolver.py +0 -66
- scitex/fts/_fig/_editor/_gui/__init__.py +0 -11
- scitex/fts/_fig/_editor/_gui/_flask_editor/__init__.py +0 -20
- scitex/fts/_fig/_editor/_gui/_flask_editor/_plotter.py +0 -664
- scitex/fts/_fig/_editor/_gui/_flask_editor/_utils.py +0 -79
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/base/reset.css +0 -41
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/base/typography.css +0 -16
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/base/variables.css +0 -85
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/components/buttons.css +0 -217
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/components/context-menu.css +0 -93
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/components/dropdown.css +0 -57
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/components/forms.css +0 -112
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/components/modal.css +0 -59
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/components/sections.css +0 -212
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/features/element-inspector.css +0 -190
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/features/loading.css +0 -59
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/features/overlay.css +0 -45
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/features/panel-grid.css +0 -95
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/features/selection.css +0 -101
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/features/statistics.css +0 -138
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/index.css +0 -31
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/layout/container.css +0 -7
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/layout/controls.css +0 -56
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/css/layout/preview.css +0 -78
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/alignment/axis.js +0 -314
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/alignment/basic.js +0 -107
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/alignment/distribute.js +0 -54
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/canvas/canvas.js +0 -172
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/canvas/dragging.js +0 -258
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/canvas/resize.js +0 -48
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/canvas/selection.js +0 -71
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/core/api.js +0 -288
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/core/state.js +0 -143
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/core/utils.js +0 -245
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/dev/element-inspector.js +0 -992
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/editor/bbox.js +0 -339
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/editor/element-drag.js +0 -286
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/editor/overlay.js +0 -371
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/editor/preview.js +0 -293
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/main.js +0 -426
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/shortcuts/context-menu.js +0 -152
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/shortcuts/keyboard.js +0 -265
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/ui/controls.js +0 -184
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/ui/download.js +0 -57
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/ui/help.js +0 -100
- scitex/fts/_fig/_editor/_gui/_flask_editor/static/js/ui/theme.js +0 -34
- scitex/fts/_fig/_editor/_gui/_flask_editor/templates/__init__.py +0 -124
- scitex/fts/_fig/_editor/_gui/_flask_editor/templates/_html.py +0 -851
- scitex/fts/_fig/_editor/_gui/_flask_editor/templates/_scripts.py +0 -4932
- scitex/fts/_fig/_editor/_gui/_flask_editor/templates/_styles.py +0 -1657
- scitex/fts/_fig/_editor/_gui/_flask_editor.py +0 -36
- scitex/fts/_fig/_models/_Annotations.py +0 -115
- scitex/fts/_fig/_models/_Axes.py +0 -152
- scitex/fts/_fig/_models/_Figure.py +0 -138
- scitex/fts/_fig/_models/_Plot.py +0 -123
- scitex/fts/_fig/_utils/_plot_layout.py +0 -397
- scitex/fts/_kinds/_figure/_composite.py +0 -345
- scitex/fts/_kinds/_plot/_backend/__init__.py +0 -53
- scitex/fts/_kinds/_plot/_backend/_export.py +0 -165
- scitex/fts/_kinds/_plot/_backend/_render.py +0 -538
- scitex/fts/_kinds/_plot/_dataclasses/_ChannelEncoding.py +0 -46
- scitex/fts/_kinds/_plot/_dataclasses/_Encoding.py +0 -82
- scitex/fts/_kinds/_plot/_dataclasses/_Theme.py +0 -441
- scitex/fts/_kinds/_plot/_dataclasses/_TraceEncoding.py +0 -52
- scitex/fts/_kinds/_plot/_dataclasses/__init__.py +0 -47
- scitex/fts/_kinds/_plot/_models/_Guides.py +0 -104
- scitex/fts/_kinds/_plot/_models/_Styles.py +0 -245
- scitex/fts/_kinds/_plot/_models/__init__.py +0 -80
- scitex/fts/_kinds/_plot/_models/_plot_types/__init__.py +0 -156
- scitex/fts/_kinds/_plot/_models/_plot_types/_bar.py +0 -43
- scitex/fts/_kinds/_plot/_models/_plot_types/_box.py +0 -38
- scitex/fts/_kinds/_plot/_models/_plot_types/_distribution.py +0 -36
- scitex/fts/_kinds/_plot/_models/_plot_types/_errorbar.py +0 -60
- scitex/fts/_kinds/_plot/_models/_plot_types/_histogram.py +0 -30
- scitex/fts/_kinds/_plot/_models/_plot_types/_image.py +0 -61
- scitex/fts/_kinds/_plot/_models/_plot_types/_line.py +0 -57
- scitex/fts/_kinds/_plot/_models/_plot_types/_scatter.py +0 -30
- scitex/fts/_kinds/_plot/_models/_plot_types/_seaborn.py +0 -121
- scitex/fts/_kinds/_plot/_models/_plot_types/_violin.py +0 -36
- scitex/fts/_kinds/_plot/_utils/__init__.py +0 -129
- scitex/fts/_kinds/_plot/_utils/_auto_layout.py +0 -127
- scitex/fts/_kinds/_plot/_utils/_calc_bounds.py +0 -111
- scitex/fts/_kinds/_plot/_utils/_const_sizes.py +0 -48
- scitex/fts/_kinds/_plot/_utils/_convert_coords.py +0 -77
- scitex/fts/_kinds/_plot/_utils/_get_template.py +0 -178
- scitex/fts/_kinds/_plot/_utils/_normalize.py +0 -73
- scitex/fts/_kinds/_plot/_utils/_validate.py +0 -197
- scitex/fts/_kinds/_table/_latex/_export.py +0 -279
- scitex/fts/_stats/__init__.py +0 -48
- scitex/fts/_stats/_dataclasses/_Stats.py +0 -423
- scitex/fts/_stats/_dataclasses/__init__.py +0 -48
- scitex/fts/_tables/__init__.py +0 -65
- scitex/fts/_tables/_latex/__init__.py +0 -93
- scitex/fts/_tables/_latex/_editor/__init__.py +0 -11
- scitex/fts/_tables/_latex/_editor/_app.py +0 -725
- scitex/fts/_tables/_latex/_figure_exporter.py +0 -153
- scitex/fts/_tables/_latex/_stats_formatter.py +0 -274
- scitex/fts/_tables/_latex/_table_exporter.py +0 -362
- scitex/fts/_tables/_latex/_utils.py +0 -369
- scitex/fts/_tables/_latex/_validator.py +0 -445
- scitex/io/_save_modules/_pltz_bundle.py +0 -356
- scitex-2.10.3.dist-info/METADATA +0 -952
- scitex-2.10.3.dist-info/entry_points.txt +0 -2
- /scitex/{fig → canvas}/README.md +0 -0
- /scitex/{fig → canvas}/backend/__init__.py +0 -0
- /scitex/{fig → canvas}/backend/_export.py +0 -0
- /scitex/{fig → canvas}/backend/_render.py +0 -0
- /scitex/{fig → canvas}/docs/CANVAS_ARCHITECTURE.md +0 -0
- /scitex/{fig → canvas}/editor/__init__.py +0 -0
- /scitex/{fig → canvas}/editor/_dearpygui_editor.py +0 -0
- /scitex/{fig → canvas}/editor/_flask_editor.py +0 -0
- /scitex/{fig → canvas}/editor/_mpl_editor.py +0 -0
- /scitex/{fig → canvas}/editor/_qt_editor.py +0 -0
- /scitex/{fig → canvas}/editor/_tkinter_editor.py +0 -0
- /scitex/{fig → canvas}/editor/edit/backend_detector.py +0 -0
- /scitex/{fig → canvas}/editor/edit/manual_handler.py +0 -0
- /scitex/{fig → canvas}/editor/edit/path_resolver.py +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/__init__.py +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/_plotter.py +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/_utils.py +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/base/reset.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/base/typography.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/base/variables.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/components/buttons.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/components/context-menu.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/components/dropdown.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/components/forms.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/components/modal.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/components/sections.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/features/element-inspector.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/features/loading.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/features/overlay.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/features/selection.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/features/statistics.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/index.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/layout/container.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/layout/controls.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/css/layout/preview.css +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/alignment/axis.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/alignment/basic.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/alignment/distribute.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/canvas/canvas.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/canvas/dragging.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/canvas/resize.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/canvas/selection.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/core/state.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/core/utils.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/dev/element-inspector.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/editor/bbox.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/editor/element-drag.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/editor/overlay.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/main.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/shortcuts/context-menu.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/shortcuts/keyboard.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/ui/controls.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/ui/download.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/ui/help.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/static/js/ui/theme.js +0 -0
- /scitex/{fig → canvas}/editor/flask_editor/templates/__init__.py +0 -0
- /scitex/{fig → canvas}/io/_directory.py +0 -0
- /scitex/{fig → canvas}/model/_plot_types.py +0 -0
- /scitex/{fig → canvas}/utils/_defaults.py +0 -0
- /scitex/{fig → canvas}/utils/_validate.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_conversion/__init__.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_conversion/_bundle2dict.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_conversion/_dict2bundle.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_Axes.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_BBox.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_ColumnDef.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_DataFormat.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_DataInfo.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_DataSource.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_dataclasses/_SizeMM.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_extractors/__init__.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_extractors/_extract_bar.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_extractors/_extract_line.py +0 -0
- /scitex/{fts/_bundle → io/bundle}/_extractors/_extract_scatter.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/__init__.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_figure/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_figure}/_composite.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_plot/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_backend/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_backend/_export.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_backend/_render.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_dataclasses/_ChannelEncoding.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_dataclasses/_Encoding.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_dataclasses/_Theme.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_dataclasses/_TraceEncoding.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_dataclasses/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_bar.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_box.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_distribution.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_errorbar.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_histogram.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_image.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_line.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_scatter.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_seaborn.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_models/_plot_types/_violin.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/__init__.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_auto_layout.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_calc_bounds.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_const_sizes.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_convert_coords.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_get_template.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_normalize.py +0 -0
- /scitex/{fts/_fig → io/bundle/kinds/_plot}/_utils/_validate.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_shape/__init__.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_stats/__init__.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_stats/_dataclasses/_Stats.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_stats/_dataclasses/__init__.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_table/__init__.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_editor/__init__.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_stats_formatter.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_utils.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_table/_latex/_validator.py +0 -0
- /scitex/{fts/_kinds → io/bundle/kinds}/_text/__init__.py +0 -0
- /scitex/{fts/_schemas → io/bundle/schemas}/data_info.schema.json +0 -0
- /scitex/{fts/_schemas → io/bundle/schemas}/encoding.schema.json +0 -0
- /scitex/{fts/_schemas → io/bundle/schemas}/node.schema.json +0 -0
- /scitex/{fts/_schemas → io/bundle/schemas}/render_manifest.schema.json +0 -0
- /scitex/{fts/_schemas → io/bundle/schemas}/stats.schema.json +0 -0
- /scitex/{fts/_schemas → io/bundle/schemas}/theme.schema.json +0 -0
- {scitex-2.10.3.dist-info → scitex-2.13.0.dist-info}/WHEEL +0 -0
- {scitex-2.10.3.dist-info → scitex-2.13.0.dist-info}/licenses/LICENSE +0 -0
scitex/web/_scraping.py
CHANGED
|
@@ -1,40 +1,21 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
2
|
# File: ./src/scitex/web/_scraping.py
|
|
4
3
|
|
|
5
|
-
"""Web scraping utilities for extracting URLs
|
|
4
|
+
"""Web scraping utilities for extracting URLs."""
|
|
6
5
|
|
|
7
|
-
import os
|
|
8
6
|
import re
|
|
9
7
|
import urllib.parse
|
|
10
|
-
from
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from typing import List, Optional, Set, Tuple
|
|
13
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
8
|
+
from typing import List, Optional, Set
|
|
14
9
|
|
|
15
10
|
import requests
|
|
16
11
|
from bs4 import BeautifulSoup
|
|
17
|
-
from tqdm import tqdm
|
|
18
|
-
|
|
19
|
-
try:
|
|
20
|
-
from PIL import Image
|
|
21
|
-
from io import BytesIO
|
|
22
|
-
|
|
23
|
-
PILLOW_AVAILABLE = True
|
|
24
|
-
except ImportError:
|
|
25
|
-
PILLOW_AVAILABLE = False
|
|
26
12
|
|
|
27
13
|
from scitex.logging import getLogger
|
|
28
14
|
|
|
29
15
|
logger = getLogger(__name__)
|
|
30
16
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
"""Get default download directory using SCITEX_DIR if available."""
|
|
34
|
-
scitex_root = os.environ.get("SCITEX_DIR")
|
|
35
|
-
if scitex_root is None:
|
|
36
|
-
scitex_root = os.path.expanduser("~/.scitex")
|
|
37
|
-
return os.path.join(scitex_root, "web", "downloads")
|
|
17
|
+
DEFAULT_TIMEOUT = 10
|
|
18
|
+
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
38
19
|
|
|
39
20
|
|
|
40
21
|
def get_urls(
|
|
@@ -49,7 +30,7 @@ def get_urls(
|
|
|
49
30
|
|
|
50
31
|
Args:
|
|
51
32
|
url: The URL of the webpage to scrape
|
|
52
|
-
pattern: Optional regex pattern to filter URLs (e.g., r'
|
|
33
|
+
pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files)
|
|
53
34
|
absolute: If True, convert relative URLs to absolute URLs
|
|
54
35
|
same_domain: If True, only return URLs from the same domain
|
|
55
36
|
include_external: If True, include external links (only applies if same_domain=False)
|
|
@@ -58,12 +39,16 @@ def get_urls(
|
|
|
58
39
|
List of URLs found on the page
|
|
59
40
|
|
|
60
41
|
Example:
|
|
61
|
-
>>> urls = get_urls('https://example.com', pattern=r'
|
|
42
|
+
>>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
|
|
62
43
|
>>> urls = get_urls('https://example.com', same_domain=True)
|
|
63
44
|
"""
|
|
64
45
|
try:
|
|
65
46
|
logger.info(f"Fetching URLs from: {url}")
|
|
66
|
-
response = requests.get(
|
|
47
|
+
response = requests.get(
|
|
48
|
+
url,
|
|
49
|
+
timeout=DEFAULT_TIMEOUT,
|
|
50
|
+
headers={"User-Agent": DEFAULT_USER_AGENT},
|
|
51
|
+
)
|
|
67
52
|
response.raise_for_status()
|
|
68
53
|
except requests.RequestException as e:
|
|
69
54
|
logger.error(f"Failed to fetch URL {url}: {e}")
|
|
@@ -72,19 +57,14 @@ def get_urls(
|
|
|
72
57
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
73
58
|
urls_found: Set[str] = set()
|
|
74
59
|
|
|
75
|
-
# Parse the base domain
|
|
76
60
|
parsed_base = urllib.parse.urlparse(url)
|
|
77
|
-
base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"
|
|
78
61
|
|
|
79
|
-
# Find all links
|
|
80
62
|
for link in soup.find_all("a", href=True):
|
|
81
63
|
href = link["href"]
|
|
82
64
|
|
|
83
|
-
# Convert to absolute URL if requested
|
|
84
65
|
if absolute:
|
|
85
66
|
href = urllib.parse.urljoin(url, href)
|
|
86
67
|
|
|
87
|
-
# Filter by domain if requested
|
|
88
68
|
if same_domain:
|
|
89
69
|
parsed_href = urllib.parse.urlparse(href)
|
|
90
70
|
if parsed_href.netloc != parsed_base.netloc:
|
|
@@ -94,10 +74,8 @@ def get_urls(
|
|
|
94
74
|
if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc:
|
|
95
75
|
continue
|
|
96
76
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if not re.search(pattern, href):
|
|
100
|
-
continue
|
|
77
|
+
if pattern and not re.search(pattern, href):
|
|
78
|
+
continue
|
|
101
79
|
|
|
102
80
|
urls_found.add(href)
|
|
103
81
|
|
|
@@ -106,171 +84,6 @@ def get_urls(
|
|
|
106
84
|
return result
|
|
107
85
|
|
|
108
86
|
|
|
109
|
-
def download_images(
|
|
110
|
-
url: str,
|
|
111
|
-
output_dir: Optional[str] = None,
|
|
112
|
-
pattern: Optional[str] = None,
|
|
113
|
-
min_size: Optional[Tuple[int, int]] = None,
|
|
114
|
-
max_workers: int = 5,
|
|
115
|
-
same_domain: bool = False,
|
|
116
|
-
) -> List[str]:
|
|
117
|
-
"""
|
|
118
|
-
Download all images from a webpage.
|
|
119
|
-
|
|
120
|
-
Args:
|
|
121
|
-
url: The URL of the webpage to scrape
|
|
122
|
-
output_dir: Directory to save images. Priority:
|
|
123
|
-
1. This parameter if specified
|
|
124
|
-
2. $SCITEX_WEB_DOWNLOADS_DIR environment variable
|
|
125
|
-
3. $SCITEX_DIR/web/downloads (default)
|
|
126
|
-
pattern: Optional regex pattern to filter image URLs
|
|
127
|
-
min_size: Optional minimum size as (width, height) tuple to filter images
|
|
128
|
-
max_workers: Number of concurrent download threads
|
|
129
|
-
same_domain: If True, only download images from the same domain
|
|
130
|
-
|
|
131
|
-
Returns:
|
|
132
|
-
List of paths to downloaded images
|
|
133
|
-
|
|
134
|
-
Note:
|
|
135
|
-
- SVG files are automatically skipped (vector graphics)
|
|
136
|
-
- Images are saved in timestamped subdirectories: images-YYYYMMDD_HHMMSS/
|
|
137
|
-
|
|
138
|
-
Example:
|
|
139
|
-
>>> paths = download_images('https://example.com', output_dir='./downloads')
|
|
140
|
-
>>> paths = download_images('https://example.com', min_size=(100, 100))
|
|
141
|
-
>>> # Uses $SCITEX_WEB_DOWNLOADS_DIR or $SCITEX_DIR/web/downloads
|
|
142
|
-
>>> paths = download_images('https://example.com')
|
|
143
|
-
"""
|
|
144
|
-
if not PILLOW_AVAILABLE:
|
|
145
|
-
logger.warning("Pillow is not available. min_size filtering will be disabled.")
|
|
146
|
-
|
|
147
|
-
# Set default output directory
|
|
148
|
-
if output_dir is None:
|
|
149
|
-
# Check SCITEX_WEB_DOWNLOADS_DIR first
|
|
150
|
-
output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
|
|
151
|
-
if output_dir is None:
|
|
152
|
-
# Fall back to SCITEX_DIR/web/downloads
|
|
153
|
-
output_dir = _get_default_download_dir()
|
|
154
|
-
|
|
155
|
-
# Create timestamped subdirectory
|
|
156
|
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
157
|
-
output_path = Path(output_dir).expanduser() / f"images-{timestamp}"
|
|
158
|
-
output_path.mkdir(parents=True, exist_ok=True)
|
|
159
|
-
|
|
160
|
-
logger.info(f"Saving images to: {output_path}")
|
|
161
|
-
|
|
162
|
-
try:
|
|
163
|
-
logger.info(f"Fetching page: {url}")
|
|
164
|
-
response = requests.get(url, timeout=30)
|
|
165
|
-
response.raise_for_status()
|
|
166
|
-
except requests.RequestException as e:
|
|
167
|
-
logger.error(f"Failed to fetch URL {url}: {e}")
|
|
168
|
-
return []
|
|
169
|
-
|
|
170
|
-
soup = BeautifulSoup(response.text, "html.parser")
|
|
171
|
-
image_urls: Set[str] = set()
|
|
172
|
-
|
|
173
|
-
# Parse the base domain
|
|
174
|
-
parsed_base = urllib.parse.urlparse(url)
|
|
175
|
-
|
|
176
|
-
# Find all image tags
|
|
177
|
-
for img in soup.find_all("img", src=True):
|
|
178
|
-
img_url = img["src"]
|
|
179
|
-
|
|
180
|
-
# Convert to absolute URL
|
|
181
|
-
img_url = urllib.parse.urljoin(url, img_url)
|
|
182
|
-
|
|
183
|
-
# Skip SVG files (vector graphics, not raster images)
|
|
184
|
-
if img_url.lower().endswith((".svg", ".svgz")):
|
|
185
|
-
continue
|
|
186
|
-
|
|
187
|
-
# Filter by domain if requested
|
|
188
|
-
if same_domain:
|
|
189
|
-
parsed_img = urllib.parse.urlparse(img_url)
|
|
190
|
-
if parsed_img.netloc != parsed_base.netloc:
|
|
191
|
-
continue
|
|
192
|
-
|
|
193
|
-
# Filter by pattern if provided
|
|
194
|
-
if pattern:
|
|
195
|
-
if not re.search(pattern, img_url):
|
|
196
|
-
continue
|
|
197
|
-
|
|
198
|
-
image_urls.add(img_url)
|
|
199
|
-
|
|
200
|
-
logger.info(f"Found {len(image_urls)} images")
|
|
201
|
-
|
|
202
|
-
# Download images
|
|
203
|
-
downloaded_paths = []
|
|
204
|
-
|
|
205
|
-
def download_image(img_url: str) -> Optional[str]:
|
|
206
|
-
try:
|
|
207
|
-
img_response = requests.get(img_url, timeout=30)
|
|
208
|
-
img_response.raise_for_status()
|
|
209
|
-
|
|
210
|
-
# Check image size if requested and Pillow is available
|
|
211
|
-
if min_size and PILLOW_AVAILABLE:
|
|
212
|
-
try:
|
|
213
|
-
img = Image.open(BytesIO(img_response.content))
|
|
214
|
-
if img.size[0] < min_size[0] or img.size[1] < min_size[1]:
|
|
215
|
-
return None
|
|
216
|
-
except Exception:
|
|
217
|
-
pass
|
|
218
|
-
|
|
219
|
-
# Generate filename from URL
|
|
220
|
-
parsed_url = urllib.parse.urlparse(img_url)
|
|
221
|
-
filename = Path(parsed_url.path).name
|
|
222
|
-
|
|
223
|
-
# If filename is empty or doesn't have extension, generate one
|
|
224
|
-
if not filename or "." not in filename:
|
|
225
|
-
ext = ".jpg" # default extension
|
|
226
|
-
if "content-type" in img_response.headers:
|
|
227
|
-
content_type = img_response.headers["content-type"]
|
|
228
|
-
if "png" in content_type:
|
|
229
|
-
ext = ".png"
|
|
230
|
-
elif "gif" in content_type:
|
|
231
|
-
ext = ".gif"
|
|
232
|
-
elif "webp" in content_type:
|
|
233
|
-
ext = ".webp"
|
|
234
|
-
filename = f"image_{hash(img_url)}{ext}"
|
|
235
|
-
|
|
236
|
-
# Save image
|
|
237
|
-
file_path = output_path / filename
|
|
238
|
-
|
|
239
|
-
# Handle duplicate filenames
|
|
240
|
-
counter = 1
|
|
241
|
-
original_stem = file_path.stem
|
|
242
|
-
while file_path.exists():
|
|
243
|
-
file_path = output_path / f"{original_stem}_{counter}{file_path.suffix}"
|
|
244
|
-
counter += 1
|
|
245
|
-
|
|
246
|
-
with open(file_path, "wb") as f:
|
|
247
|
-
f.write(img_response.content)
|
|
248
|
-
|
|
249
|
-
return str(file_path)
|
|
250
|
-
|
|
251
|
-
except Exception as e:
|
|
252
|
-
logger.warning(f"Failed to download image {img_url}: {e}")
|
|
253
|
-
return None
|
|
254
|
-
|
|
255
|
-
# Download images concurrently
|
|
256
|
-
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
257
|
-
future_to_url = {
|
|
258
|
-
executor.submit(download_image, img_url): img_url for img_url in image_urls
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
for future in tqdm(
|
|
262
|
-
as_completed(future_to_url),
|
|
263
|
-
total=len(image_urls),
|
|
264
|
-
desc="Downloading images",
|
|
265
|
-
):
|
|
266
|
-
result = future.result()
|
|
267
|
-
if result:
|
|
268
|
-
downloaded_paths.append(result)
|
|
269
|
-
|
|
270
|
-
logger.info(f"Downloaded {len(downloaded_paths)} images to {output_dir}")
|
|
271
|
-
return downloaded_paths
|
|
272
|
-
|
|
273
|
-
|
|
274
87
|
def get_image_urls(
|
|
275
88
|
url: str,
|
|
276
89
|
pattern: Optional[str] = None,
|
|
@@ -289,14 +102,19 @@ def get_image_urls(
|
|
|
289
102
|
|
|
290
103
|
Note:
|
|
291
104
|
- SVG files are automatically skipped (vector graphics)
|
|
105
|
+
- Checks both 'src' and 'data-src' attributes for lazy-loaded images
|
|
292
106
|
|
|
293
107
|
Example:
|
|
294
108
|
>>> img_urls = get_image_urls('https://example.com')
|
|
295
|
-
>>> img_urls = get_image_urls('https://example.com', pattern=r'
|
|
109
|
+
>>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
|
|
296
110
|
"""
|
|
297
111
|
try:
|
|
298
112
|
logger.info(f"Fetching image URLs from: {url}")
|
|
299
|
-
response = requests.get(
|
|
113
|
+
response = requests.get(
|
|
114
|
+
url,
|
|
115
|
+
timeout=DEFAULT_TIMEOUT,
|
|
116
|
+
headers={"User-Agent": DEFAULT_USER_AGENT},
|
|
117
|
+
)
|
|
300
118
|
response.raise_for_status()
|
|
301
119
|
except requests.RequestException as e:
|
|
302
120
|
logger.error(f"Failed to fetch URL {url}: {e}")
|
|
@@ -305,85 +123,28 @@ def get_image_urls(
|
|
|
305
123
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
306
124
|
image_urls: Set[str] = set()
|
|
307
125
|
|
|
308
|
-
# Parse the base domain
|
|
309
126
|
parsed_base = urllib.parse.urlparse(url)
|
|
310
127
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
128
|
+
for img in soup.find_all("img"):
|
|
129
|
+
img_url = img.get("src") or img.get("data-src")
|
|
130
|
+
if not img_url:
|
|
131
|
+
continue
|
|
314
132
|
|
|
315
|
-
# Convert to absolute URL
|
|
316
133
|
img_url = urllib.parse.urljoin(url, img_url)
|
|
317
134
|
|
|
318
|
-
# Skip SVG files (vector graphics, not raster images)
|
|
319
135
|
if img_url.lower().endswith((".svg", ".svgz")):
|
|
320
136
|
continue
|
|
321
137
|
|
|
322
|
-
# Filter by domain if requested
|
|
323
138
|
if same_domain:
|
|
324
139
|
parsed_img = urllib.parse.urlparse(img_url)
|
|
325
140
|
if parsed_img.netloc != parsed_base.netloc:
|
|
326
141
|
continue
|
|
327
142
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
if not re.search(pattern, img_url):
|
|
331
|
-
continue
|
|
143
|
+
if pattern and not re.search(pattern, img_url):
|
|
144
|
+
continue
|
|
332
145
|
|
|
333
146
|
image_urls.add(img_url)
|
|
334
147
|
|
|
335
148
|
result = sorted(list(image_urls))
|
|
336
149
|
logger.info(f"Found {len(result)} image URLs")
|
|
337
150
|
return result
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
if __name__ == "__main__":
|
|
341
|
-
import argparse
|
|
342
|
-
|
|
343
|
-
parser = argparse.ArgumentParser(description="Web scraping utilities")
|
|
344
|
-
parser.add_argument("url", type=str, help="URL to scrape")
|
|
345
|
-
parser.add_argument(
|
|
346
|
-
"--mode",
|
|
347
|
-
"-m",
|
|
348
|
-
choices=["urls", "images", "image_urls"],
|
|
349
|
-
default="urls",
|
|
350
|
-
help="Scraping mode",
|
|
351
|
-
)
|
|
352
|
-
parser.add_argument("--output", "-o", type=str, help="Output directory for images")
|
|
353
|
-
parser.add_argument(
|
|
354
|
-
"--pattern", "-p", type=str, help="Regex pattern to filter URLs"
|
|
355
|
-
)
|
|
356
|
-
parser.add_argument(
|
|
357
|
-
"--same-domain", action="store_true", help="Only include URLs from same domain"
|
|
358
|
-
)
|
|
359
|
-
parser.add_argument(
|
|
360
|
-
"--min-size", type=str, help="Minimum image size as WIDTHxHEIGHT"
|
|
361
|
-
)
|
|
362
|
-
|
|
363
|
-
args = parser.parse_args()
|
|
364
|
-
|
|
365
|
-
if args.mode == "urls":
|
|
366
|
-
urls = get_urls(args.url, pattern=args.pattern, same_domain=args.same_domain)
|
|
367
|
-
for url in urls:
|
|
368
|
-
print(url)
|
|
369
|
-
elif args.mode == "images":
|
|
370
|
-
min_size = None
|
|
371
|
-
if args.min_size:
|
|
372
|
-
width, height = map(int, args.min_size.split("x"))
|
|
373
|
-
min_size = (width, height)
|
|
374
|
-
|
|
375
|
-
paths = download_images(
|
|
376
|
-
args.url,
|
|
377
|
-
output_dir=args.output,
|
|
378
|
-
pattern=args.pattern,
|
|
379
|
-
min_size=min_size,
|
|
380
|
-
same_domain=args.same_domain,
|
|
381
|
-
)
|
|
382
|
-
for path in paths:
|
|
383
|
-
print(path)
|
|
384
|
-
elif args.mode == "image_urls":
|
|
385
|
-
img_urls = get_image_urls(
|
|
386
|
-
args.url, pattern=args.pattern, same_domain=args.same_domain
|
|
387
|
-
)
|
|
388
|
-
for img_url in img_urls:
|
|
389
|
-
print(img_url)
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# File: ./src/scitex/web/download_images.py
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Image Downloader for SciTeX.
|
|
6
|
+
|
|
7
|
+
Downloads images from URLs with minimum size filtering.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python -m scitex.web.download_images https://example.com
|
|
11
|
+
python -m scitex.web.download_images https://example.com -o ./downloads
|
|
12
|
+
python -m scitex.web.download_images https://example.com --min-size 800x600
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import urllib.parse
|
|
18
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import List, Optional, Tuple
|
|
22
|
+
|
|
23
|
+
import requests
|
|
24
|
+
from bs4 import BeautifulSoup
|
|
25
|
+
from tqdm import tqdm
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
from io import BytesIO
|
|
29
|
+
|
|
30
|
+
from PIL import Image
|
|
31
|
+
|
|
32
|
+
PILLOW_AVAILABLE = True
|
|
33
|
+
except ImportError:
|
|
34
|
+
PILLOW_AVAILABLE = False
|
|
35
|
+
|
|
36
|
+
from scitex.logging import getLogger
|
|
37
|
+
|
|
38
|
+
logger = getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
# Configuration
|
|
41
|
+
DEFAULT_MIN_WIDTH = 400
|
|
42
|
+
DEFAULT_MIN_HEIGHT = 300
|
|
43
|
+
DEFAULT_TIMEOUT = 10
|
|
44
|
+
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _get_default_download_dir() -> str:
    """Return the default directory for downloaded web assets.

    The root directory is read from the ``SCITEX_DIR`` environment variable.
    When that variable is unset or empty, the user-expanded ``~/.scitex`` is
    used instead.

    Returns:
        The path ``<root>/web/downloads`` as a string.
    """
    # ``or`` instead of a ``get`` default: an exported-but-empty SCITEX_DIR
    # would otherwise produce the relative path "web/downloads".
    scitex_root = os.environ.get("SCITEX_DIR") or os.path.expanduser("~/.scitex")
    return os.path.join(scitex_root, "web", "downloads")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _normalize_url_for_directory(url: str) -> str:
    """Convert a URL into a string that is safe to use as a directory name.

    The result is built from the network location (without a leading
    ``www.``) and the path, joined with ``-``. Every character other than
    word characters, ``-`` and ``.`` is replaced by ``-``, runs of ``-`` are
    collapsed, and the result is truncated to 100 characters with leading and
    trailing ``-`` removed.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized name; may be empty if the URL has neither a network
        location nor a path.
    """
    parsed = urllib.parse.urlparse(url)

    domain = parsed.netloc
    # Strip only a *leading* "www."; a plain str.replace would also mangle
    # hosts that merely contain the substring (e.g. "awww.example.com").
    if domain.startswith("www."):
        domain = domain[len("www."):]
    path = parsed.path.strip("/").replace("/", "-")

    normalized = f"{domain}-{path}" if path else domain
    normalized = re.sub(r"[^\w\-.]", "-", normalized)
    normalized = re.sub(r"-+", "-", normalized)
    # Truncate first, then strip, so a cut that lands on "-" leaves no dangling dash.
    return normalized[:100].strip("-")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _is_direct_image_url(url: str) -> bool:
    """Tell whether *url* looks like a link straight to a raster image file.

    Only the path component of the lower-cased URL is inspected, so query
    strings and fragments do not affect the result.
    """
    image_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp")
    lowered_path = urllib.parse.urlparse(url.lower()).path
    # str.endswith accepts a tuple, which checks every suffix in one call.
    return lowered_path.endswith(image_suffixes)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
    """Extract downloadable image URLs from the ``<img>`` tags of a webpage.

    The page is fetched with ``DEFAULT_TIMEOUT`` and ``DEFAULT_USER_AGENT``.
    For each ``<img>`` tag the ``src`` attribute is tried first and
    ``data-src`` second; the first candidate that resolves to an ``http`` or
    ``https`` URL is used.

    Args:
        url: URL of the page to scan; also the base for relative image links.
        same_domain: If True, keep only images whose network location equals
            that of ``url``.

    Returns:
        A sorted list of unique absolute image URLs. SVG/SVGZ files and
        non-HTTP(S) URLs are excluded. An empty list is returned when the
        page cannot be fetched.
    """
    try:
        logger.info(f"Fetching page: {url}")
        response = requests.get(
            url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch page: {e}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    parsed_base = urllib.parse.urlparse(url)
    image_urls = set()

    for img in soup.find_all("img"):
        img_url = None
        parsed_img = None
        # Lazy-loading pages often put a "data:" placeholder in src and the
        # real address in data-src, so take the first fetchable candidate.
        for candidate in (img.get("src"), img.get("data-src")):
            if not candidate:
                continue
            absolute = urllib.parse.urljoin(url, candidate)
            parsed = urllib.parse.urlparse(absolute)
            if parsed.scheme in ("http", "https"):
                img_url, parsed_img = absolute, parsed
                break
        if img_url is None:
            continue

        # Test the path, not the whole URL, so "logo.svg?v=2" is skipped too.
        if parsed_img.path.lower().endswith((".svg", ".svgz")):
            continue

        if same_domain and parsed_img.netloc != parsed_base.netloc:
            continue

        image_urls.add(img_url)

    logger.info(f"Found {len(image_urls)} images on page")
    # Sorted so callers get a reproducible order instead of set iteration order.
    return sorted(image_urls)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _download_single_image(
    img_url: str,
    output_dir: Path,
    counter: int,
    min_size: Optional[Tuple[int, int]],
) -> Optional[str]:
    """Download one image and save it as ``<counter, zero-padded to 4>.<ext>``.

    Args:
        img_url: URL of the image to fetch.
        output_dir: Existing directory the file is written into.
        counter: Number used to build the file name (e.g. ``7`` -> ``0007``).
        min_size: Minimum ``(width, height)``; smaller images are skipped.
            Only enforced when Pillow is available and can read the image.
            A falsy value disables the check.

    Returns:
        The saved file path as a string, or None if the response is not an
        image, the image is too small, or any error occurs (errors are logged
        as warnings, never raised).
    """
    try:
        response = requests.get(
            img_url,
            timeout=DEFAULT_TIMEOUT,
            headers={"User-Agent": DEFAULT_USER_AGENT},
        )
        response.raise_for_status()

        # MIME types are case-insensitive, so normalise before matching.
        content_type = response.headers.get("content-type", "").lower()
        if not content_type.startswith("image/"):
            logger.debug(f"Skipping non-image: {content_type}")
            return None

        # Decode the payload once and reuse it for the size check and the extension.
        dimensions = None
        detected_format = None
        if PILLOW_AVAILABLE:
            try:
                with Image.open(BytesIO(response.content)) as img:
                    dimensions = img.size
                    detected_format = img.format
            except Exception as e:
                # Best effort: an unreadable payload is still saved, using
                # the content-type to pick the extension.
                logger.debug(f"Could not inspect {img_url} with Pillow: {e}")

        if min_size and dimensions is not None:
            width, height = dimensions
            if width < min_size[0] or height < min_size[1]:
                logger.debug(
                    f"Skipping small image: {width}x{height} "
                    f"(min: {min_size[0]}x{min_size[1]})"
                )
                return None

        # Prefer the format Pillow detected; otherwise fall back to the
        # content-type, also when Pillow is installed but failed to parse.
        ext = "jpg"
        if detected_format:
            fmt = detected_format.lower()
            ext = "jpg" if fmt == "jpeg" else fmt
        elif "png" in content_type:
            ext = "png"
        elif "gif" in content_type:
            ext = "gif"
        elif "webp" in content_type:
            ext = "webp"

        filename = f"{counter:04d}.{ext}"
        filepath = output_dir / filename

        with open(filepath, "wb") as f:
            f.write(response.content)

        logger.info(f"Downloaded: {filename}")
        return str(filepath)

    except Exception as e:
        logger.warning(f"Error downloading {img_url}: {e}")
        return None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def download_images(
    url: str,
    output_dir: Optional[str] = None,
    min_size: Optional[Tuple[int, int]] = None,
    max_workers: int = 5,
    same_domain: bool = False,
) -> List[str]:
    """
    Download images from a URL.

    Images are written to a new sub-directory named
    ``<timestamp>-<normalized url>-images`` and numbered ``0001``, ``0002``, ...

    Args:
        url: Webpage URL or direct image URL
        output_dir: Parent output directory. Priority:
            1. This parameter if specified
            2. $SCITEX_WEB_DOWNLOADS_DIR environment variable
            3. $SCITEX_DIR/web/downloads (default)
        min_size: Minimum (width, height) to filter small images. None means
            the default of 400x300; pass (0, 0) to keep images of any size.
            Ignored (with a warning) when Pillow is not available.
        max_workers: Concurrent download threads
        same_domain: Only download images from the same domain

    Returns:
        Sorted list of downloaded file paths

    Example:
        >>> paths = download_images("https://example.com")
        >>> paths = download_images("https://example.com/photo.jpg")
        >>> paths = download_images("https://example.com", min_size=(800, 600))
    """
    if not PILLOW_AVAILABLE:
        logger.warning("Pillow not available. Size filtering disabled.")
        min_size = None
    elif min_size is None:
        min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT)

    # Setup output directory
    if output_dir is None:
        output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
        if output_dir is None:
            output_dir = _get_default_download_dir()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    normalized = _normalize_url_for_directory(url)
    output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images"
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Output directory: {output_path}")

    # Get image URLs
    if _is_direct_image_url(url):
        image_urls = [url]
        logger.info("Direct image URL detected")
    else:
        image_urls = _extract_image_urls(url, same_domain=same_domain)

    if not image_urls:
        logger.warning("No images found")
        return []

    # Download concurrently. Each file index is fixed here, in the submitting
    # thread, so two workers can never get the same number and overwrite
    # each other's file.
    downloaded = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                _download_single_image, img_url, output_path, idx, min_size
            )
            for idx, img_url in enumerate(image_urls, start=1)
        ]

        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Downloading"
        ):
            result = future.result()
            if result:
                downloaded.append(result)

    # Completion order is arbitrary; sort so the result is reproducible.
    downloaded.sort()

    logger.info(f"Downloaded {len(downloaded)} images to {output_path}")
    return downloaded
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def main():
    """CLI entry point.

    Parses the command line, downloads the images with ``download_images``
    and prints the saved paths. A malformed ``--min-size`` ends the program
    through ``parser.error`` (usage message, exit status 2).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Download images from URL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m scitex.web.download_images https://example.com
  python -m scitex.web.download_images https://example.com -o ./downloads
  python -m scitex.web.download_images https://example.com --min-size 800x600
  python -m scitex.web.download_images https://example.com --no-min-size
""",
    )
    parser.add_argument("url", help="URL to download images from")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument(
        "--min-size",
        default="400x300",
        help="Minimum size WIDTHxHEIGHT (default: 400x300)",
    )
    parser.add_argument(
        "--no-min-size",
        action="store_true",
        help="Disable size filtering",
    )
    parser.add_argument(
        "--same-domain",
        action="store_true",
        help="Only download from same domain",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=5,
        help="Concurrent downloads (default: 5)",
    )

    args = parser.parse_args()

    min_size = None
    if args.no_min_size:
        # download_images() maps None to the 400x300 default, so filtering
        # is only switched off by an explicit zero minimum.
        min_size = (0, 0)
    elif args.min_size:
        try:
            # Unpacking also rejects a wrong number of "x"-separated parts.
            w, h = map(int, args.min_size.lower().split("x"))
        except ValueError:
            parser.error(
                "--min-size must look like WIDTHxHEIGHT (e.g. 800x600), "
                f"got {args.min_size!r}"
            )
        min_size = (w, h)

    paths = download_images(
        args.url,
        output_dir=args.output,
        min_size=min_size,
        max_workers=args.workers,
        same_domain=args.same_domain,
    )

    print(f"\nDownloaded {len(paths)} images:")
    for p in paths:
        print(f"  {p}")


if __name__ == "__main__":
    main()
|