scitex 2.7.0__py3-none-any.whl → 2.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scitex/__init__.py +6 -2
- scitex/__version__.py +1 -1
- scitex/audio/README.md +52 -0
- scitex/audio/__init__.py +384 -0
- scitex/audio/__main__.py +129 -0
- scitex/audio/_tts.py +334 -0
- scitex/audio/engines/__init__.py +44 -0
- scitex/audio/engines/base.py +275 -0
- scitex/audio/engines/elevenlabs_engine.py +143 -0
- scitex/audio/engines/gtts_engine.py +162 -0
- scitex/audio/engines/pyttsx3_engine.py +131 -0
- scitex/audio/mcp_server.py +757 -0
- scitex/bridge/_helpers.py +1 -1
- scitex/bridge/_plt_vis.py +1 -1
- scitex/bridge/_stats_vis.py +1 -1
- scitex/dev/plt/__init__.py +272 -0
- scitex/dev/plt/plot_mpl_axhline.py +28 -0
- scitex/dev/plt/plot_mpl_axhspan.py +28 -0
- scitex/dev/plt/plot_mpl_axvline.py +28 -0
- scitex/dev/plt/plot_mpl_axvspan.py +28 -0
- scitex/dev/plt/plot_mpl_bar.py +29 -0
- scitex/dev/plt/plot_mpl_barh.py +29 -0
- scitex/dev/plt/plot_mpl_boxplot.py +28 -0
- scitex/dev/plt/plot_mpl_contour.py +31 -0
- scitex/dev/plt/plot_mpl_contourf.py +31 -0
- scitex/dev/plt/plot_mpl_errorbar.py +30 -0
- scitex/dev/plt/plot_mpl_eventplot.py +28 -0
- scitex/dev/plt/plot_mpl_fill.py +30 -0
- scitex/dev/plt/plot_mpl_fill_between.py +31 -0
- scitex/dev/plt/plot_mpl_hexbin.py +28 -0
- scitex/dev/plt/plot_mpl_hist.py +28 -0
- scitex/dev/plt/plot_mpl_hist2d.py +28 -0
- scitex/dev/plt/plot_mpl_imshow.py +29 -0
- scitex/dev/plt/plot_mpl_pcolormesh.py +31 -0
- scitex/dev/plt/plot_mpl_pie.py +29 -0
- scitex/dev/plt/plot_mpl_plot.py +29 -0
- scitex/dev/plt/plot_mpl_quiver.py +31 -0
- scitex/dev/plt/plot_mpl_scatter.py +28 -0
- scitex/dev/plt/plot_mpl_stackplot.py +31 -0
- scitex/dev/plt/plot_mpl_stem.py +29 -0
- scitex/dev/plt/plot_mpl_step.py +29 -0
- scitex/dev/plt/plot_mpl_violinplot.py +28 -0
- scitex/dev/plt/plot_sns_barplot.py +29 -0
- scitex/dev/plt/plot_sns_boxplot.py +29 -0
- scitex/dev/plt/plot_sns_heatmap.py +28 -0
- scitex/dev/plt/plot_sns_histplot.py +29 -0
- scitex/dev/plt/plot_sns_kdeplot.py +29 -0
- scitex/dev/plt/plot_sns_lineplot.py +31 -0
- scitex/dev/plt/plot_sns_scatterplot.py +29 -0
- scitex/dev/plt/plot_sns_stripplot.py +29 -0
- scitex/dev/plt/plot_sns_swarmplot.py +29 -0
- scitex/dev/plt/plot_sns_violinplot.py +29 -0
- scitex/dev/plt/plot_stx_bar.py +29 -0
- scitex/dev/plt/plot_stx_barh.py +29 -0
- scitex/dev/plt/plot_stx_box.py +28 -0
- scitex/dev/plt/plot_stx_boxplot.py +28 -0
- scitex/dev/plt/plot_stx_conf_mat.py +28 -0
- scitex/dev/plt/plot_stx_contour.py +31 -0
- scitex/dev/plt/plot_stx_ecdf.py +28 -0
- scitex/dev/plt/plot_stx_errorbar.py +30 -0
- scitex/dev/plt/plot_stx_fill_between.py +31 -0
- scitex/dev/plt/plot_stx_fillv.py +28 -0
- scitex/dev/plt/plot_stx_heatmap.py +28 -0
- scitex/dev/plt/plot_stx_image.py +28 -0
- scitex/dev/plt/plot_stx_imshow.py +28 -0
- scitex/dev/plt/plot_stx_joyplot.py +28 -0
- scitex/dev/plt/plot_stx_kde.py +28 -0
- scitex/dev/plt/plot_stx_line.py +28 -0
- scitex/dev/plt/plot_stx_mean_ci.py +28 -0
- scitex/dev/plt/plot_stx_mean_std.py +28 -0
- scitex/dev/plt/plot_stx_median_iqr.py +28 -0
- scitex/dev/plt/plot_stx_raster.py +28 -0
- scitex/dev/plt/plot_stx_rectangle.py +28 -0
- scitex/dev/plt/plot_stx_scatter.py +29 -0
- scitex/dev/plt/plot_stx_shaded_line.py +29 -0
- scitex/dev/plt/plot_stx_violin.py +28 -0
- scitex/dev/plt/plot_stx_violinplot.py +28 -0
- scitex/diagram/README.md +197 -0
- scitex/diagram/__init__.py +48 -0
- scitex/diagram/_compile.py +312 -0
- scitex/diagram/_diagram.py +355 -0
- scitex/diagram/_presets.py +173 -0
- scitex/diagram/_schema.py +182 -0
- scitex/diagram/_split.py +278 -0
- scitex/fig/__init__.py +352 -0
- scitex/{vis → fig}/backend/_parser.py +1 -1
- scitex/{vis → fig}/canvas.py +1 -1
- scitex/{vis → fig}/editor/__init__.py +5 -2
- scitex/{vis → fig}/editor/_dearpygui_editor.py +1 -1
- scitex/{vis → fig}/editor/_defaults.py +70 -5
- scitex/{vis → fig}/editor/_mpl_editor.py +1 -1
- scitex/{vis → fig}/editor/_qt_editor.py +182 -2
- scitex/{vis → fig}/editor/_tkinter_editor.py +1 -1
- scitex/fig/editor/edit/__init__.py +50 -0
- scitex/fig/editor/edit/backend_detector.py +109 -0
- scitex/fig/editor/edit/bundle_resolver.py +240 -0
- scitex/fig/editor/edit/editor_launcher.py +239 -0
- scitex/fig/editor/edit/manual_handler.py +53 -0
- scitex/fig/editor/edit/panel_loader.py +232 -0
- scitex/fig/editor/edit/path_resolver.py +67 -0
- scitex/fig/editor/flask_editor/_bbox.py +1299 -0
- scitex/fig/editor/flask_editor/_core.py +1429 -0
- scitex/{vis → fig}/editor/flask_editor/_plotter.py +38 -4
- scitex/fig/editor/flask_editor/_renderer.py +813 -0
- scitex/fig/editor/flask_editor/static/css/base/reset.css +41 -0
- scitex/fig/editor/flask_editor/static/css/base/typography.css +16 -0
- scitex/fig/editor/flask_editor/static/css/base/variables.css +85 -0
- scitex/fig/editor/flask_editor/static/css/components/buttons.css +217 -0
- scitex/fig/editor/flask_editor/static/css/components/context-menu.css +93 -0
- scitex/fig/editor/flask_editor/static/css/components/dropdown.css +57 -0
- scitex/fig/editor/flask_editor/static/css/components/forms.css +112 -0
- scitex/fig/editor/flask_editor/static/css/components/modal.css +59 -0
- scitex/fig/editor/flask_editor/static/css/components/sections.css +212 -0
- scitex/fig/editor/flask_editor/static/css/features/canvas.css +176 -0
- scitex/fig/editor/flask_editor/static/css/features/element-inspector.css +190 -0
- scitex/fig/editor/flask_editor/static/css/features/loading.css +59 -0
- scitex/fig/editor/flask_editor/static/css/features/overlay.css +45 -0
- scitex/fig/editor/flask_editor/static/css/features/panel-grid.css +95 -0
- scitex/fig/editor/flask_editor/static/css/features/selection.css +101 -0
- scitex/fig/editor/flask_editor/static/css/features/statistics.css +138 -0
- scitex/fig/editor/flask_editor/static/css/index.css +31 -0
- scitex/fig/editor/flask_editor/static/css/layout/container.css +7 -0
- scitex/fig/editor/flask_editor/static/css/layout/controls.css +56 -0
- scitex/fig/editor/flask_editor/static/css/layout/preview.css +78 -0
- scitex/fig/editor/flask_editor/static/js/alignment/axis.js +314 -0
- scitex/fig/editor/flask_editor/static/js/alignment/basic.js +107 -0
- scitex/fig/editor/flask_editor/static/js/alignment/distribute.js +54 -0
- scitex/fig/editor/flask_editor/static/js/canvas/canvas.js +172 -0
- scitex/fig/editor/flask_editor/static/js/canvas/dragging.js +258 -0
- scitex/fig/editor/flask_editor/static/js/canvas/resize.js +48 -0
- scitex/fig/editor/flask_editor/static/js/canvas/selection.js +71 -0
- scitex/fig/editor/flask_editor/static/js/core/api.js +288 -0
- scitex/fig/editor/flask_editor/static/js/core/state.js +143 -0
- scitex/fig/editor/flask_editor/static/js/core/utils.js +245 -0
- scitex/fig/editor/flask_editor/static/js/dev/element-inspector.js +992 -0
- scitex/fig/editor/flask_editor/static/js/editor/bbox.js +339 -0
- scitex/fig/editor/flask_editor/static/js/editor/element-drag.js +286 -0
- scitex/fig/editor/flask_editor/static/js/editor/overlay.js +371 -0
- scitex/fig/editor/flask_editor/static/js/editor/preview.js +293 -0
- scitex/fig/editor/flask_editor/static/js/main.js +426 -0
- scitex/fig/editor/flask_editor/static/js/shortcuts/context-menu.js +152 -0
- scitex/fig/editor/flask_editor/static/js/shortcuts/keyboard.js +265 -0
- scitex/fig/editor/flask_editor/static/js/ui/controls.js +184 -0
- scitex/fig/editor/flask_editor/static/js/ui/download.js +57 -0
- scitex/fig/editor/flask_editor/static/js/ui/help.js +100 -0
- scitex/fig/editor/flask_editor/static/js/ui/theme.js +34 -0
- scitex/fig/editor/flask_editor/templates/__init__.py +123 -0
- scitex/fig/editor/flask_editor/templates/_html.py +852 -0
- scitex/fig/editor/flask_editor/templates/_scripts.py +4933 -0
- scitex/fig/editor/flask_editor/templates/_styles.py +1658 -0
- scitex/{vis → fig}/io/__init__.py +13 -1
- scitex/fig/io/_bundle.py +1058 -0
- scitex/{vis → fig}/io/_canvas.py +1 -1
- scitex/{vis → fig}/io/_data.py +1 -1
- scitex/{vis → fig}/io/_export.py +1 -1
- scitex/{vis → fig}/io/_load.py +1 -1
- scitex/{vis → fig}/io/_panel.py +1 -1
- scitex/{vis → fig}/io/_save.py +1 -1
- scitex/{vis → fig}/model/__init__.py +1 -1
- scitex/{vis → fig}/model/_annotations.py +1 -1
- scitex/{vis → fig}/model/_axes.py +1 -1
- scitex/{vis → fig}/model/_figure.py +1 -1
- scitex/{vis → fig}/model/_guides.py +1 -1
- scitex/{vis → fig}/model/_plot.py +1 -1
- scitex/{vis → fig}/model/_styles.py +1 -1
- scitex/{vis → fig}/utils/__init__.py +1 -1
- scitex/io/__init__.py +22 -26
- scitex/io/_bundle.py +493 -0
- scitex/io/_flush.py +5 -2
- scitex/io/_load.py +98 -0
- scitex/io/_load_modules/_H5Explorer.py +5 -2
- scitex/io/_load_modules/_canvas.py +2 -2
- scitex/io/_load_modules/_image.py +3 -4
- scitex/io/_load_modules/_txt.py +4 -2
- scitex/io/_metadata.py +34 -324
- scitex/io/_metadata_modules/__init__.py +46 -0
- scitex/io/_metadata_modules/_embed.py +70 -0
- scitex/io/_metadata_modules/_read.py +64 -0
- scitex/io/_metadata_modules/_utils.py +79 -0
- scitex/io/_metadata_modules/embed_metadata_jpeg.py +74 -0
- scitex/io/_metadata_modules/embed_metadata_pdf.py +53 -0
- scitex/io/_metadata_modules/embed_metadata_png.py +26 -0
- scitex/io/_metadata_modules/embed_metadata_svg.py +62 -0
- scitex/io/_metadata_modules/read_metadata_jpeg.py +57 -0
- scitex/io/_metadata_modules/read_metadata_pdf.py +51 -0
- scitex/io/_metadata_modules/read_metadata_png.py +39 -0
- scitex/io/_metadata_modules/read_metadata_svg.py +44 -0
- scitex/io/_qr_utils.py +5 -3
- scitex/io/_save.py +548 -30
- scitex/io/_save_modules/_canvas.py +3 -3
- scitex/io/_save_modules/_image.py +5 -9
- scitex/io/_save_modules/_tex.py +7 -4
- scitex/io/_zip_bundle.py +439 -0
- scitex/io/utils/h5_to_zarr.py +11 -9
- scitex/msword/__init__.py +255 -0
- scitex/msword/profiles.py +357 -0
- scitex/msword/reader.py +753 -0
- scitex/msword/utils.py +289 -0
- scitex/msword/writer.py +362 -0
- scitex/plt/__init__.py +5 -2
- scitex/plt/_subplots/_AxesWrapper.py +6 -6
- scitex/plt/_subplots/_AxisWrapper.py +15 -9
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin/__init__.py +36 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin/_labels.py +264 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin/_metadata.py +213 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin/_visual.py +128 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin/__init__.py +59 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin/_base.py +34 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin/_scientific.py +593 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin/_statistical.py +654 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin/_stx_aliases.py +527 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_RawMatplotlibMixin.py +321 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/__init__.py +33 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_base.py +152 -0
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin/_wrappers.py +600 -0
- scitex/plt/_subplots/_AxisWrapperMixins/__init__.py +79 -5
- scitex/plt/_subplots/_FigWrapper.py +6 -6
- scitex/plt/_subplots/_SubplotsWrapper.py +28 -18
- scitex/plt/_subplots/_export_as_csv.py +35 -5
- scitex/plt/_subplots/_export_as_csv_formatters/__init__.py +8 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_annotate.py +10 -21
- scitex/plt/_subplots/_export_as_csv_formatters/_format_eventplot.py +18 -7
- scitex/plt/_subplots/_export_as_csv_formatters/_format_imshow2d.py +28 -12
- scitex/plt/_subplots/_export_as_csv_formatters/_format_matshow.py +10 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_imshow.py +13 -1
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_kde.py +12 -2
- scitex/plt/_subplots/_export_as_csv_formatters/_format_plot_scatter.py +10 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_quiver.py +10 -4
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_jointplot.py +18 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_lineplot.py +44 -36
- scitex/plt/_subplots/_export_as_csv_formatters/_format_sns_pairplot.py +14 -2
- scitex/plt/_subplots/_export_as_csv_formatters/_format_streamplot.py +11 -5
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_bar.py +84 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_barh.py +85 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_conf_mat.py +14 -3
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_contour.py +54 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_ecdf.py +14 -2
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_errorbar.py +120 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_heatmap.py +16 -6
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_image.py +29 -19
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_imshow.py +63 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_joyplot.py +22 -5
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_mean_ci.py +18 -14
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_mean_std.py +18 -14
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_median_iqr.py +18 -14
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_raster.py +10 -2
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_scatter.py +51 -0
- scitex/plt/_subplots/_export_as_csv_formatters/_format_stx_scatter_hist.py +18 -9
- scitex/plt/ax/_plot/_stx_ecdf.py +4 -2
- scitex/plt/gallery/_generate.py +421 -14
- scitex/plt/io/__init__.py +53 -0
- scitex/plt/io/_bundle.py +490 -0
- scitex/plt/io/_layered_bundle.py +1343 -0
- scitex/plt/styles/SCITEX_STYLE.yaml +26 -0
- scitex/plt/styles/__init__.py +14 -0
- scitex/plt/styles/presets.py +78 -0
- scitex/plt/utils/__init__.py +13 -1
- scitex/plt/utils/_collect_figure_metadata.py +10 -14
- scitex/plt/utils/_configure_mpl.py +6 -18
- scitex/plt/utils/_crop.py +32 -14
- scitex/plt/utils/_csv_column_naming.py +54 -0
- scitex/plt/utils/_figure_mm.py +116 -1
- scitex/plt/utils/_hitmap.py +1643 -0
- scitex/plt/utils/metadata/__init__.py +25 -0
- scitex/plt/utils/metadata/_core.py +9 -10
- scitex/plt/utils/metadata/_dimensions.py +6 -3
- scitex/plt/utils/metadata/_editable_export.py +405 -0
- scitex/plt/utils/metadata/_geometry_extraction.py +570 -0
- scitex/schema/__init__.py +109 -16
- scitex/schema/_canvas.py +1 -1
- scitex/schema/_plot.py +1015 -0
- scitex/schema/_stats.py +2 -2
- scitex/stats/__init__.py +117 -0
- scitex/stats/io/__init__.py +29 -0
- scitex/stats/io/_bundle.py +156 -0
- scitex/tex/__init__.py +4 -0
- scitex/tex/_export.py +890 -0
- {scitex-2.7.0.dist-info → scitex-2.8.1.dist-info}/METADATA +11 -1
- {scitex-2.7.0.dist-info → scitex-2.8.1.dist-info}/RECORD +294 -170
- scitex/io/memo.md +0 -2827
- scitex/plt/REQUESTS.md +0 -191
- scitex/plt/_subplots/TODO.md +0 -53
- scitex/plt/_subplots/_AxisWrapperMixins/_AdjustmentMixin.py +0 -559
- scitex/plt/_subplots/_AxisWrapperMixins/_MatplotlibPlotMixin.py +0 -1609
- scitex/plt/_subplots/_AxisWrapperMixins/_SeabornMixin.py +0 -447
- scitex/plt/templates/research-master/scitex/vis/gallery/area/fill_between.json +0 -110
- scitex/plt/templates/research-master/scitex/vis/gallery/area/fill_betweenx.json +0 -88
- scitex/plt/templates/research-master/scitex/vis/gallery/area/stx_fill_between.json +0 -103
- scitex/plt/templates/research-master/scitex/vis/gallery/area/stx_fillv.json +0 -106
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/bar.json +0 -92
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/barh.json +0 -92
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/boxplot.json +0 -92
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_bar.json +0 -84
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_barh.json +0 -84
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_box.json +0 -83
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_boxplot.json +0 -93
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_violin.json +0 -91
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/stx_violinplot.json +0 -91
- scitex/plt/templates/research-master/scitex/vis/gallery/categorical/violinplot.json +0 -91
- scitex/plt/templates/research-master/scitex/vis/gallery/contour/contour.json +0 -97
- scitex/plt/templates/research-master/scitex/vis/gallery/contour/contourf.json +0 -98
- scitex/plt/templates/research-master/scitex/vis/gallery/contour/stx_contour.json +0 -84
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/hist.json +0 -101
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/hist2d.json +0 -96
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/stx_ecdf.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/stx_joyplot.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/distribution/stx_kde.json +0 -93
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/imshow.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/matshow.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_conf_mat.json +0 -83
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_heatmap.json +0 -92
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_image.json +0 -121
- scitex/plt/templates/research-master/scitex/vis/gallery/grid/stx_imshow.json +0 -84
- scitex/plt/templates/research-master/scitex/vis/gallery/line/plot.json +0 -110
- scitex/plt/templates/research-master/scitex/vis/gallery/line/step.json +0 -92
- scitex/plt/templates/research-master/scitex/vis/gallery/line/stx_line.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/line/stx_shaded_line.json +0 -96
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/hexbin.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/scatter.json +0 -95
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/stem.json +0 -92
- scitex/plt/templates/research-master/scitex/vis/gallery/scatter/stx_scatter.json +0 -84
- scitex/plt/templates/research-master/scitex/vis/gallery/special/pie.json +0 -94
- scitex/plt/templates/research-master/scitex/vis/gallery/special/stx_raster.json +0 -109
- scitex/plt/templates/research-master/scitex/vis/gallery/special/stx_rectangle.json +0 -108
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/errorbar.json +0 -93
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_errorbar.json +0 -84
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_mean_ci.json +0 -96
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_mean_std.json +0 -96
- scitex/plt/templates/research-master/scitex/vis/gallery/statistical/stx_median_iqr.json +0 -96
- scitex/plt/templates/research-master/scitex/vis/gallery/vector/quiver.json +0 -99
- scitex/plt/templates/research-master/scitex/vis/gallery/vector/streamplot.json +0 -100
- scitex/vis/__init__.py +0 -177
- scitex/vis/editor/_edit.py +0 -390
- scitex/vis/editor/flask_editor/_bbox.py +0 -529
- scitex/vis/editor/flask_editor/_core.py +0 -168
- scitex/vis/editor/flask_editor/_renderer.py +0 -393
- scitex/vis/editor/flask_editor/templates/__init__.py +0 -33
- scitex/vis/editor/flask_editor/templates/_html.py +0 -513
- scitex/vis/editor/flask_editor/templates/_scripts.py +0 -1261
- scitex/vis/editor/flask_editor/templates/_styles.py +0 -739
- /scitex/{vis → fig}/README.md +0 -0
- /scitex/{vis → fig}/backend/__init__.py +0 -0
- /scitex/{vis → fig}/backend/_export.py +0 -0
- /scitex/{vis → fig}/backend/_render.py +0 -0
- /scitex/{vis → fig}/docs/CANVAS_ARCHITECTURE.md +0 -0
- /scitex/{vis → fig}/editor/_flask_editor.py +0 -0
- /scitex/{vis → fig}/editor/flask_editor/__init__.py +0 -0
- /scitex/{vis → fig}/editor/flask_editor/_utils.py +0 -0
- /scitex/{vis → fig}/io/_directory.py +0 -0
- /scitex/{vis → fig}/model/_plot_types.py +0 -0
- /scitex/{vis → fig}/utils/_defaults.py +0 -0
- /scitex/{vis → fig}/utils/_validate.py +0 -0
- {scitex-2.7.0.dist-info → scitex-2.8.1.dist-info}/WHEEL +0 -0
- {scitex-2.7.0.dist-info → scitex-2.8.1.dist-info}/entry_points.txt +0 -0
- {scitex-2.7.0.dist-info → scitex-2.8.1.dist-info}/licenses/LICENSE +0 -0
scitex/msword/reader.py
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# Timestamp: 2025-12-11 15:15:00
|
|
4
|
+
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/reader.py
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
DOCX -> SciTeX writer document converter.
|
|
8
|
+
|
|
9
|
+
This module reads MS Word .docx files and converts them into
|
|
10
|
+
SciTeX's intermediate document format for further processing.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import hashlib
|
|
16
|
+
import re
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
|
|
21
|
+
from .profiles import BaseWordProfile
|
|
22
|
+
|
|
23
|
+
# Lazy import for python-docx
|
|
24
|
+
try:
|
|
25
|
+
import docx
|
|
26
|
+
from docx.document import Document as DocxDocument
|
|
27
|
+
from docx.oxml.ns import qn
|
|
28
|
+
from docx.shared import Inches, Pt
|
|
29
|
+
|
|
30
|
+
DOCX_AVAILABLE = True
|
|
31
|
+
_DOCX_IMPORT_ERROR = None
|
|
32
|
+
except ImportError as exc:
|
|
33
|
+
DOCX_AVAILABLE = False
|
|
34
|
+
_DOCX_IMPORT_ERROR = exc
|
|
35
|
+
DocxDocument = None
|
|
36
|
+
|
|
37
|
+
# Common academic section headings for heuristic detection
|
|
38
|
+
# Lower-case titles of sections that commonly appear in academic manuscripts.
# Heading detection compares lower-cased paragraph text against this set when
# a paragraph carries no usable heading style.
COMMON_SECTION_HEADINGS = {
    # Opening sections
    "abstract",
    "introduction",
    "background",
    "literature review",
    # Methods
    "methods",
    "methodology",
    "materials and methods",
    "experimental",
    # Results
    "results",
    "findings",
    "analysis",
    # Closing sections
    "discussion",
    "conclusions",
    "conclusion",
    "summary",
    # Acknowledgements (spelling variants)
    "acknowledgements",
    "acknowledgments",
    "acknowledgement",
    # Bibliography
    "references",
    "bibliography",
    "works cited",
    # Back matter
    "appendix",
    "appendices",
    "supplementary",
    "supplementary material",
}
|
|
47
|
+
|
|
48
|
+
# Caption patterns for robust detection
|
|
49
|
+
# Ordered list of ``(regex, caption_kind)`` pairs.
#
# Every regex has the same shape and differs only in the label keywords:
#   group 1 - the label keyword (e.g. "figure", "fig.")
#   group 2 - the caption number
#   group 3 - whatever text follows the number
# The keywords are written in lower case.
# NOTE(review): presumably matched case-insensitively by the caller - confirm.
CAPTION_PATTERNS = [
    (r"^(" + label_alternatives + r")\s*(\d+)[\.:\s]*(.*)$", caption_kind)
    for label_alternatives, caption_kind in (
        # Figure-like labels
        (r"figure|fig\.?", "figure"),
        (r"scheme", "scheme"),
        (r"chart", "chart"),
        (r"graph", "graph"),
        (r"plate", "plate"),
        (r"illustration", "illustration"),
        # Tables
        (r"table|tbl\.?", "table"),
        # Equations
        (r"equation|eq\.?", "equation"),
        # Listings / code
        (r"listing|code", "listing"),
        # Algorithms
        (r"algorithm|alg\.?", "algorithm"),
    )
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class WordReader:
|
|
69
|
+
"""
|
|
70
|
+
Read a DOCX file and convert it into a SciTeX writer document.
|
|
71
|
+
|
|
72
|
+
This reader focuses on:
|
|
73
|
+
- Sections (via heading styles)
|
|
74
|
+
- Plain paragraphs
|
|
75
|
+
- Figure/table captions (via caption style)
|
|
76
|
+
- Embedded images extraction
|
|
77
|
+
- References section boundary detection
|
|
78
|
+
- Basic formatting (bold, italic)
|
|
79
|
+
|
|
80
|
+
The output is a structured intermediate representation that can be
|
|
81
|
+
easily fed into `scitex.writer` or exported to LaTeX/other formats.
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
def __init__(
    self,
    profile: BaseWordProfile,
    extract_images: bool = True,
):
    """
    Create a reader bound to a Word style profile.

    Parameters
    ----------
    profile : BaseWordProfile
        Mapping between Word styles and SciTeX writer semantics.
    extract_images : bool
        Whether embedded images should be extracted from the document.

    Raises
    ------
    ImportError
        If python-docx could not be imported when this module was loaded.
        The original import failure is attached as the cause.
    """
    if DOCX_AVAILABLE:
        self.profile = profile
        self.extract_images = extract_images
        return

    # python-docx is missing: fail at construction time with an actionable
    # message rather than later, in the middle of a read.
    raise ImportError(
        "python-docx is required for scitex.msword.WordReader. "
        "Install it via `pip install python-docx`."
    ) from _DOCX_IMPORT_ERROR
|
|
104
|
+
|
|
105
|
+
def read(self, path: Path) -> Dict[str, Any]:
    """
    Load a DOCX file and convert it into a SciTeX writer document.

    Parameters
    ----------
    path : Path
        Location of the DOCX file.

    Returns
    -------
    dict
        SciTeX writer document with the keys:
        - blocks: list of document blocks
        - metadata: profile name, source path, import timestamp and any
          core document properties that could be read
        - images: extracted image data (filled only if extract_images=True)
        - references: parsed reference entries
        - warnings: list of conversion warnings
        Post-import hooks of the profile may replace this dict entirely.
    """
    doc = docx.Document(str(path))

    # Provenance first; document properties are merged in afterwards.
    metadata: Dict[str, Any] = {
        "profile": self.profile.name,
        "source_file": str(path),
        "import_timestamp": datetime.now().isoformat(),
    }
    result: Dict[str, Any] = {
        "blocks": [],
        "metadata": metadata,
        "images": [],
        "references": [],
        "warnings": [],
    }
    metadata.update(self._extract_metadata(doc))

    # Body content (paragraphs and tables), in document order.
    result["blocks"] = self._process_body(doc, result)

    if self.extract_images:
        result["images"] = self._extract_images(doc, path)

    # References are derived from the blocks produced above.
    result["references"] = self._parse_references(result["blocks"])

    # Each hook receives the current document and its return value becomes
    # the document handed to the next hook.
    for hook in self.profile.post_import_hooks:
        result = hook(result)

    return result
|
|
158
|
+
|
|
159
|
+
def _extract_metadata(self, doc: DocxDocument) -> Dict[str, Any]:
|
|
160
|
+
"""Extract document metadata (title, author, etc.)."""
|
|
161
|
+
metadata = {}
|
|
162
|
+
try:
|
|
163
|
+
core_props = doc.core_properties
|
|
164
|
+
if core_props.title:
|
|
165
|
+
metadata["title"] = core_props.title
|
|
166
|
+
if core_props.author:
|
|
167
|
+
metadata["author"] = core_props.author
|
|
168
|
+
if core_props.subject:
|
|
169
|
+
metadata["subject"] = core_props.subject
|
|
170
|
+
if core_props.keywords:
|
|
171
|
+
metadata["keywords"] = core_props.keywords
|
|
172
|
+
if core_props.created:
|
|
173
|
+
metadata["created"] = core_props.created.isoformat()
|
|
174
|
+
if core_props.modified:
|
|
175
|
+
metadata["modified"] = core_props.modified.isoformat()
|
|
176
|
+
except Exception:
|
|
177
|
+
pass # Metadata extraction is optional
|
|
178
|
+
return metadata
|
|
179
|
+
|
|
180
|
+
def _process_body(
|
|
181
|
+
self,
|
|
182
|
+
doc: DocxDocument,
|
|
183
|
+
result: Dict[str, Any],
|
|
184
|
+
) -> List[Dict[str, Any]]:
|
|
185
|
+
"""Process document body: paragraphs and tables."""
|
|
186
|
+
blocks: List[Dict[str, Any]] = []
|
|
187
|
+
in_reference_section = False
|
|
188
|
+
block_index = 0
|
|
189
|
+
|
|
190
|
+
# Build rel_id -> hash map for image detection
|
|
191
|
+
rel_to_hash = {}
|
|
192
|
+
if self.extract_images:
|
|
193
|
+
for rel_id, rel in doc.part.rels.items():
|
|
194
|
+
if "image" in rel.reltype:
|
|
195
|
+
image_bytes = rel.target_part.blob
|
|
196
|
+
image_hash = hashlib.md5(image_bytes).hexdigest()[:12]
|
|
197
|
+
rel_to_hash[rel_id] = image_hash
|
|
198
|
+
|
|
199
|
+
# Namespace for picture detection
|
|
200
|
+
pic_ns = {"pic": "http://schemas.openxmlformats.org/drawingml/2006/picture"}
|
|
201
|
+
a_ns = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
|
|
202
|
+
r_ns = {"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships"}
|
|
203
|
+
|
|
204
|
+
for element in doc.element.body:
|
|
205
|
+
tag = element.tag.split("}")[-1] # Remove namespace
|
|
206
|
+
|
|
207
|
+
if tag == "p":
|
|
208
|
+
# Process paragraph
|
|
209
|
+
para = docx.text.paragraph.Paragraph(element, doc)
|
|
210
|
+
|
|
211
|
+
# Detect inline images in this paragraph
|
|
212
|
+
if self.extract_images:
|
|
213
|
+
for run in para.runs:
|
|
214
|
+
# Check for drawing elements containing pictures
|
|
215
|
+
drawings = run.element.findall(".//a:blip", namespaces=a_ns)
|
|
216
|
+
for blip in drawings:
|
|
217
|
+
embed_attr = qn("r:embed")
|
|
218
|
+
rel_id = blip.get(embed_attr)
|
|
219
|
+
if rel_id and rel_id in rel_to_hash:
|
|
220
|
+
blocks.append({
|
|
221
|
+
"index": block_index,
|
|
222
|
+
"type": "image",
|
|
223
|
+
"image_hash": rel_to_hash[rel_id],
|
|
224
|
+
"rel_id": rel_id,
|
|
225
|
+
})
|
|
226
|
+
block_index += 1
|
|
227
|
+
|
|
228
|
+
block = self._process_paragraph(
|
|
229
|
+
para, in_reference_section, block_index
|
|
230
|
+
)
|
|
231
|
+
if block:
|
|
232
|
+
# Check if entering references section
|
|
233
|
+
if block["type"] == "heading" and block["text"] in (
|
|
234
|
+
self.profile.reference_section_titles
|
|
235
|
+
):
|
|
236
|
+
in_reference_section = True
|
|
237
|
+
block["is_reference_header"] = True
|
|
238
|
+
|
|
239
|
+
blocks.append(block)
|
|
240
|
+
block_index += 1
|
|
241
|
+
|
|
242
|
+
elif tag == "tbl":
|
|
243
|
+
# Process table
|
|
244
|
+
table = docx.table.Table(element, doc)
|
|
245
|
+
block = self._process_table(table, block_index)
|
|
246
|
+
blocks.append(block)
|
|
247
|
+
block_index += 1
|
|
248
|
+
|
|
249
|
+
return blocks
|
|
250
|
+
|
|
251
|
+
def _process_paragraph(
|
|
252
|
+
self,
|
|
253
|
+
para,
|
|
254
|
+
in_reference_section: bool,
|
|
255
|
+
block_index: int,
|
|
256
|
+
) -> Optional[Dict[str, Any]]:
|
|
257
|
+
"""Process a single paragraph."""
|
|
258
|
+
style_name = (para.style.name or "").strip() if para.style else ""
|
|
259
|
+
text = para.text.strip()
|
|
260
|
+
|
|
261
|
+
if not text:
|
|
262
|
+
return None
|
|
263
|
+
|
|
264
|
+
# Extract runs with formatting info
|
|
265
|
+
runs = self._extract_runs(para)
|
|
266
|
+
|
|
267
|
+
# Base block structure
|
|
268
|
+
block: Dict[str, Any] = {
|
|
269
|
+
"index": block_index,
|
|
270
|
+
"text": text,
|
|
271
|
+
"style": style_name,
|
|
272
|
+
"runs": runs,
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
# Check for equations (OMML)
|
|
276
|
+
equation_latex = self._extract_equation(para)
|
|
277
|
+
if equation_latex:
|
|
278
|
+
block["type"] = "equation"
|
|
279
|
+
block["latex"] = equation_latex
|
|
280
|
+
return block
|
|
281
|
+
|
|
282
|
+
# Detect heading (style-based first, then heuristic)
|
|
283
|
+
level = self._detect_heading(para, style_name, text, runs)
|
|
284
|
+
if level is not None:
|
|
285
|
+
block["type"] = "heading"
|
|
286
|
+
block["level"] = level
|
|
287
|
+
block["detection_method"] = "style" if self._heading_level_from_style(style_name) else "heuristic"
|
|
288
|
+
return block
|
|
289
|
+
|
|
290
|
+
# Detect caption (improved pattern matching)
|
|
291
|
+
caption_info = self._detect_caption(style_name, text)
|
|
292
|
+
if caption_info:
|
|
293
|
+
block["type"] = "caption"
|
|
294
|
+
block.update(caption_info)
|
|
295
|
+
return block
|
|
296
|
+
|
|
297
|
+
# Reference paragraph
|
|
298
|
+
if in_reference_section:
|
|
299
|
+
block["type"] = "reference-paragraph"
|
|
300
|
+
ref_info = self._parse_reference_entry(text)
|
|
301
|
+
block.update(ref_info)
|
|
302
|
+
return block
|
|
303
|
+
|
|
304
|
+
# List item detection
|
|
305
|
+
if self._is_list_item(para):
|
|
306
|
+
block["type"] = "list-item"
|
|
307
|
+
list_info = self._parse_list_item(para)
|
|
308
|
+
block.update(list_info)
|
|
309
|
+
return block
|
|
310
|
+
|
|
311
|
+
# Normal paragraph
|
|
312
|
+
block["type"] = "paragraph"
|
|
313
|
+
return block
|
|
314
|
+
|
|
315
|
+
def _detect_heading(
    self,
    para,
    style_name: str,
    text: str,
    runs: List[Dict[str, Any]],
) -> Optional[int]:
    """
    Detect whether a paragraph is a heading and return its level.

    Strategies are tried in order:
    1. Style-based: ``_heading_level_from_style`` maps the style to a level.
    2. Font-based: short (< 100 chars), fully bold text with an average
       font size of at least 12 that also passes ``_looks_like_heading``.
    3. Content-based: a numbered or bold/all-caps entry of
       ``COMMON_SECTION_HEADINGS``.

    Args:
        para: Source paragraph object (not inspected by this method).
        style_name: Name of the paragraph's style.
        text: Paragraph text.
        runs: Run dictionaries with ``text``/``bold``/``font_size`` keys.

    Returns:
        The heading level, or None when the paragraph is not a heading.
    """
    # Strategy 1: style-based detection
    level = self._heading_level_from_style(style_name)
    if level is not None:
        return level

    text_clean = text.strip()

    # A paragraph only counts as bold when it has at least one text-bearing
    # run and every such run is bold. all() over an empty sequence is
    # vacuously True, so the emptiness check must be explicit.
    text_runs = [r for r in runs if r.get("text", "").strip()]
    is_bold = bool(text_runs) and all(r.get("bold") for r in text_runs)

    # Strategy 2: font-based heuristics (headings are typically short)
    if is_bold and len(text_clean) < 100:
        avg_size = self._get_average_font_size(runs)
        if avg_size and avg_size >= 12 and self._looks_like_heading(text_clean):
            return 1 if avg_size >= 14 else 2

    # Strategy 3: content-based detection (common section titles)
    # Numbered sections: "1. Introduction", "2.1 Methods"
    numbered_match = re.match(r"^(\d+(?:\.\d+)*)[\.:\s]+(.+)$", text_clean)
    if numbered_match:
        section_text = numbered_match.group(2).lower().strip()
        if section_text in COMMON_SECTION_HEADINGS:
            # "2" -> level 1, "2.1" -> level 2, ...; capped at level 4
            depth = numbered_match.group(1).count(".")
            return min(depth + 1, 4)

    # Unnumbered common headings must be emphasised (bold or all caps)
    is_all_caps = text_clean.isupper() and len(text_clean) > 3
    if is_bold or is_all_caps:
        text_lower = text_clean.lower().rstrip(".:;")
        if text_lower in COMMON_SECTION_HEADINGS:
            return 1

    return None
|
|
364
|
+
|
|
365
|
+
def _looks_like_heading(self, text: str) -> bool:
    """
    Check if text looks like a heading based on content patterns.

    A text qualifies when it is a numbered section title, a short all-caps
    line, or an entry of ``COMMON_SECTION_HEADINGS`` (compared lower-cased
    with trailing ``.``, ``:`` and ``;`` removed).

    Args:
        text: Candidate heading text.

    Returns:
        True if any of the patterns matches, otherwise False.
    """
    # Numbered sections: "2 Methods", "2.1 Methods" and, with the optional
    # trailing "." or ":", "1. Introduction" / "1: Introduction".
    if re.match(r"^\d+(?:\.\d+)*[\.:]?\s+\w", text):
        return True

    # All caps short text
    if text.isupper() and 3 < len(text) < 50:
        return True

    # Common section headings
    return text.lower().rstrip(".:;") in COMMON_SECTION_HEADINGS
|
|
382
|
+
|
|
383
|
+
def _get_average_font_size(self, runs: List[Dict[str, Any]]) -> Optional[float]:
    """Return the mean ``font_size`` of the runs that carry one, else None."""
    known_sizes = []
    for run in runs:
        size = run.get("font_size")
        if size:
            known_sizes.append(size)
    if not known_sizes:
        return None
    return sum(known_sizes) / len(known_sizes)
|
|
387
|
+
|
|
388
|
+
def _detect_caption(self, style_name: str, text: str) -> Optional[Dict[str, Any]]:
    """
    Recognise a caption paragraph and parse it.

    Checks, in order: the profile's caption style, the module-level
    ``CAPTION_PATTERNS``, and finally the profile-specific prefixes.

    Returns:
        A caption info dict, or None when the paragraph is not a caption.
    """
    # A paragraph in the caption style is a caption regardless of its text
    if style_name == self.profile.caption_style:
        return self._parse_caption(text)

    # Generic patterns: group 2 holds the number, group 3 the caption body
    candidate = text.strip()
    for regex, kind in CAPTION_PATTERNS:
        hit = re.match(regex, candidate, re.IGNORECASE)
        if hit is None:
            continue
        return {
            "caption_type": kind,
            "number": int(hit.group(2)),
            "caption_text": hit.group(3).strip(),
        }

    # Fall back to the profile-specific prefixes
    if not self._is_caption(style_name, text):
        return None
    return self._parse_caption(text)
|
|
413
|
+
|
|
414
|
+
def _extract_equation(self, para) -> Optional[str]:
    """
    Return the LaTeX form of any OMML (Office Math Markup) in a paragraph.

    Every ``m:oMath`` element below the paragraph element is converted with
    ``_omml_to_latex``; non-empty results are joined with single spaces.
    Any exception raised along the way is swallowed and reported as None.

    Returns:
        The joined LaTeX string, or None if there is no math or no
        conversion produced output.
    """
    namespaces = {"m": "http://schemas.openxmlformats.org/officeDocument/2006/math"}
    try:
        converted = []
        for node in para._element.findall(".//m:oMath", namespaces=namespaces):
            piece = self._omml_to_latex(node)
            if piece:
                converted.append(piece)
        if not converted:
            return None
        return " ".join(converted)
    except Exception:
        return None
|
|
437
|
+
|
|
438
|
+
def _omml_to_latex(self, math_elem) -> str:
    """
    Convert an OMML element to a LaTeX string.

    This is a basic converter: it handles runs, fractions, radicals,
    sub/superscripts, n-ary operators and delimiters. Any other element is
    converted by concatenating the conversions of its children.

    Args:
        math_elem: Element to convert (callers pass ``m:oMath`` nodes).

    Returns:
        The LaTeX source for the element; may be an empty string.
    """
    math_uri = "http://schemas.openxmlformats.org/officeDocument/2006/math"
    omml_ns = {"m": math_uri}
    # Clark-notation name of the ``m:val`` attribute
    val_attr = f"{{{math_uri}}}val"

    nary_symbols = {
        "∑": "\\sum",
        "∏": "\\prod",
        "∫": "\\int",
        "∬": "\\iint",
        "∭": "\\iiint",
        "∮": "\\oint",
        "⋃": "\\bigcup",
        "⋂": "\\bigcap",
    }
    # Delimiter characters that cannot follow \left / \right verbatim
    delimiter_tex = {
        "{": "\\{",
        "}": "\\}",
        "⟨": "\\langle",
        "⟩": "\\rangle",
        "⌊": "\\lfloor",
        "⌋": "\\rfloor",
        "⌈": "\\lceil",
        "⌉": "\\rceil",
        "‖": "\\|",
    }

    def get_text(elem) -> str:
        """Return all text below *elem* in document order."""
        return "".join(elem.itertext())

    def first(elem, path: str):
        """Return the first element matching *path* relative to *elem*."""
        return elem.find(path, namespaces=omml_ns)

    def part(elem, path: str) -> str:
        """Convert the first match of *path*, or return "" if absent."""
        return convert_children(first(elem, path))

    def prop_val(elem, path: str, default: str) -> str:
        """Read ``m:val`` of the property at *path*, else *default*."""
        node = first(elem, path)
        if node is None:
            return default
        return node.get(val_attr, default)

    def fence(char: str) -> str:
        """Map a delimiter character to what \\left / \\right accept."""
        # An empty character means "no visible delimiter": LaTeX spells
        # that as a dot so \left / \right stay balanced.
        return delimiter_tex.get(char, char) or "."

    def convert_element(elem) -> str:
        """Convert a single OMML element to LaTeX."""
        tag = elem.tag.rsplit("}", 1)[-1]

        if tag == "r":  # Run (text)
            return get_text(elem)

        if tag == "f":  # Fraction
            num_tex = part(elem, "m:num")
            den_tex = part(elem, "m:den")
            return f"\\frac{{{num_tex}}}{{{den_tex}}}"

        if tag == "rad":  # Radical/root
            deg = first(elem, "m:deg")
            content_tex = part(elem, "m:e")
            if deg is not None and get_text(deg).strip():
                return f"\\sqrt[{convert_children(deg)}]{{{content_tex}}}"
            return f"\\sqrt{{{content_tex}}}"

        if tag == "sSup":  # Superscript
            return f"{part(elem, 'm:e')}^{{{part(elem, 'm:sup')}}}"

        if tag == "sSub":  # Subscript
            return f"{part(elem, 'm:e')}_{{{part(elem, 'm:sub')}}}"

        if tag == "sSubSup":  # Sub-superscript
            base_tex = part(elem, "m:e")
            sub_tex = part(elem, "m:sub")
            sup_tex = part(elem, "m:sup")
            return f"{base_tex}_{{{sub_tex}}}^{{{sup_tex}}}"

        if tag == "nary":  # N-ary (sum, product, integral)
            # Look only at this operator's own properties so a nested
            # operator cannot donate its character. When m:chr is absent
            # the operator is an integral.
            symbol = prop_val(elem, "m:naryPr/m:chr", "∫")
            result = nary_symbols.get(symbol, symbol)
            sub_tex = part(elem, "m:sub")
            sup_tex = part(elem, "m:sup")
            # Limits that are present but empty are left out entirely
            if sub_tex:
                result += f"_{{{sub_tex}}}"
            if sup_tex:
                result += f"^{{{sup_tex}}}"
            content = first(elem, "m:e")
            if content is not None:
                result += f" {convert_children(content)}"
            return result

        if tag == "d":  # Delimiter (parentheses, brackets)
            # Properties are read from this delimiter's own m:dPr only;
            # a descendant search would pick up nested delimiters.
            left = fence(prop_val(elem, "m:dPr/m:begChr", "("))
            right = fence(prop_val(elem, "m:dPr/m:endChr", ")"))
            separator = prop_val(elem, "m:dPr/m:sepChr", "|")
            separator = delimiter_tex.get(separator, separator)
            # A delimiter may hold several operands; keep all of them
            operands = [
                convert_children(operand)
                for operand in elem.findall("m:e", namespaces=omml_ns)
            ]
            return f"\\left{left}{separator.join(operands)}\\right{right}"

        # Containers (e, num, den, sub, sup, deg) and unknown elements:
        # just process the children.
        return convert_children(elem)

    def convert_children(elem) -> str:
        """Convert all children of an element."""
        if elem is None:
            return ""
        return "".join(convert_element(child) for child in elem)

    return convert_element(math_elem)
|
|
537
|
+
|
|
538
|
+
def _is_list_item(self, para) -> bool:
    """
    Tell whether a paragraph is a list item.

    A paragraph qualifies when its properties contain ``w:numPr`` or when
    its stripped text starts with a bullet character or an enumeration
    marker followed by whitespace. Any exception yields False.
    """
    try:
        # Numbering properties attached to the paragraph
        properties = para._element.find(qn("w:pPr"))
        if properties is not None and properties.find(qn("w:numPr")) is not None:
            return True

        # Otherwise look for a bullet or enumeration marker in the text
        stripped = para.text.strip()
        if re.match(r"^[\u2022\u2023\u25E6\u2043\u2219•‣◦⁃∙]\s", stripped):
            return True
        marker = re.match(
            r"^(\d+[\.\):]|\([a-z]\)|\([ivxlc]+\)|[a-z][\.\)])\s",
            stripped,
            re.IGNORECASE,
        )
        return marker is not None
    except Exception:
        return False
|
|
558
|
+
|
|
559
|
+
def _parse_list_item(self, para) -> Dict[str, Any]:
    """
    Parse a list item to extract its type and nesting level.

    The list type is derived from the text marker: any enumeration marker
    recognised by ``_is_list_item`` (``1.``, ``a)``, ``(a)``, ``(iv)``...)
    makes the item "ordered"; everything else is "unordered". The level is
    read from ``w:pPr/w:numPr/w:ilvl`` when present and defaults to 0.

    Args:
        para: Paragraph exposing ``text`` and ``_element``.

    Returns:
        Dict with keys ``list_type`` ("ordered"/"unordered") and ``level``.
    """
    info: Dict[str, Any] = {"list_type": "unordered", "level": 0}

    # Classify from the text first so a failure while reading the XML
    # below cannot discard the list type.
    text = (getattr(para, "text", "") or "").strip()
    if re.match(
        r"^(\d+[\.\):]|\([a-z]\)|\([ivxlc]+\)|[a-z][\.\)])\s",
        text,
        re.IGNORECASE,
    ):
        info["list_type"] = "ordered"

    try:
        pPr = para._element.find(qn("w:pPr"))
        if pPr is not None:
            numPr = pPr.find(qn("w:numPr"))
            if numPr is not None:
                ilvl = numPr.find(qn("w:ilvl"))
                if ilvl is not None:
                    info["level"] = int(ilvl.get(qn("w:val"), 0))
    except Exception:
        # Best effort: keep the default level when the XML is unreadable
        pass

    return info
|
|
580
|
+
|
|
581
|
+
def _extract_runs(self, para) -> List[Dict[str, Any]]:
    """
    Extract formatted runs from a paragraph.

    Runs without text are skipped. Each returned dict has the keys
    ``text``, ``bold``, ``italic`` and ``underline``; ``font_size`` (in
    points) and ``font_name`` are added only when set on the run.

    Args:
        para: Paragraph exposing ``runs``.

    Returns:
        One dict per non-empty run, in document order.
    """
    runs = []
    for run in para.runs:
        if not run.text:
            continue
        font = run.font
        run_data = {
            "text": run.text,
            "bold": run.bold,
            "italic": run.italic,
            # python-docx reports None (inherited), True/False, or an
            # underline-style enum member. Both None and an explicit False
            # mean "not underlined", so `is not None` is not enough.
            "underline": run.underline not in (None, False),
        }
        if font.size:
            run_data["font_size"] = font.size.pt
        if font.name:
            run_data["font_name"] = font.name
        runs.append(run_data)
    return runs
|
|
599
|
+
|
|
600
|
+
def _heading_level_from_style(self, style_name: str) -> Optional[int]:
    """Look up the heading level whose profile style equals *style_name*.

    Returns the first matching level of ``self.profile.heading_styles``,
    or None when no level uses that style.
    """
    matching_levels = (
        lvl
        for lvl, configured in self.profile.heading_styles.items()
        if configured == style_name
    )
    return next(matching_levels, None)
|
|
606
|
+
|
|
607
|
+
def _is_caption(self, style_name: str, text: str) -> bool:
    """Decide whether a paragraph is a caption.

    True when the style equals the profile's caption style, or when the
    text starts (case-insensitively) with one of the profile's figure or
    table caption prefixes.
    """
    if style_name == self.profile.caption_style:
        return True

    # Prefix comparison ignores case on both sides
    lowered = text.lower()
    known_prefixes = (
        self.profile.figure_caption_prefixes
        + self.profile.table_caption_prefixes
    )
    return any(lowered.startswith(candidate.lower()) for candidate in known_prefixes)
|
|
622
|
+
|
|
623
|
+
def _parse_caption(self, text: str) -> Dict[str, Any]:
    """
    Parse caption text to extract the figure/table number and body.

    Figure prefixes are tried before table prefixes. The match ignores
    case, tolerates an optional "." after the prefix and an optional "."
    or ":" after the number, and spans line breaks inside the caption.

    Args:
        text: Full caption text.

    Returns:
        ``caption_type`` ("figure"/"table"), ``number`` and
        ``caption_text`` on a match; otherwise ``caption_type`` "unknown"
        with the unmodified text as ``caption_text``.
    """
    prefix_groups = (
        ("figure", self.profile.figure_caption_prefixes),
        ("table", self.profile.table_caption_prefixes),
    )
    for caption_type, prefixes in prefix_groups:
        for prefix in prefixes:
            pattern = rf"^{re.escape(prefix)}\.?\s*(\d+)[\.:]?\s*(.*)$"
            # DOTALL lets (.*) cross line breaks; without it a caption with
            # a soft line break would not match and lose its number.
            match = re.match(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                return {
                    "caption_type": caption_type,
                    "number": int(match.group(1)),
                    "caption_text": match.group(2).strip(),
                }

    return {"caption_type": "unknown", "caption_text": text}
|
|
650
|
+
|
|
651
|
+
def _parse_reference_entry(self, text: str) -> Dict[str, Any]:
    """Split a reference entry into its citation number and remaining text.

    Recognised leading markers are ``[1]``, ``1.`` and ``(1)``. When one is
    found the result holds ``ref_number`` and ``ref_text`` (marker removed,
    whitespace stripped); otherwise only ``ref_text`` with the input as is.
    """
    numbering_styles = (
        r"^\[(\d+)\]",  # [1] Author...
        r"^(\d+)\.",  # 1. Author...
        r"^\((\d+)\)",  # (1) Author...
    )
    for style in numbering_styles:
        found = re.match(style, text)
        if found is None:
            continue
        return {
            "ref_number": int(found.group(1)),
            "ref_text": re.sub(style, "", text).strip(),
        }
    return {"ref_text": text}
|
|
671
|
+
|
|
672
|
+
def _process_table(
    self,
    table,
    block_index: int,
) -> Dict[str, Any]:
    """Turn a table into a block dict of stripped cell texts.

    The column count is taken from the first row (0 for an empty table).
    """
    grid = [[cell.text.strip() for cell in row.cells] for row in table.rows]
    column_count = len(grid[0]) if grid else 0
    return {
        "index": block_index,
        "type": "table",
        "rows": grid,
        "num_rows": len(grid),
        "num_cols": column_count,
    }
|
|
692
|
+
|
|
693
|
+
def _extract_images(
    self,
    doc: "DocxDocument",
    source_path: Path,
) -> List[Dict[str, Any]]:
    """
    Extract embedded images from the document.

    Image extraction is optional: a relationship whose image cannot be
    read is skipped, and the remaining images are still returned.

    Args:
        doc: Document whose ``part.rels`` relationships are scanned.
        source_path: Path of the source file (currently unused).

    Returns:
        One dict per readable image with the keys ``rel_id``, ``hash``
        (first 12 hex digits of the MD5 of the bytes), ``content_type``,
        ``extension``, ``size_bytes`` and ``data`` (raw bytes).
    """
    # File extension per content type; unknown types fall back to ".png"
    ext_map = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/bmp": ".bmp",
    }
    images: List[Dict[str, Any]] = []

    try:
        relationships = list(doc.part.rels.items())
    except Exception:
        # Best effort: no relationships available means no images
        return images

    for rel_id, rel in relationships:
        # Guard each relationship separately so one unreadable image
        # (e.g. one without an embedded target part) does not discard
        # every image that follows it.
        try:
            if "image" not in rel.reltype:
                continue
            image_part = rel.target_part
            image_bytes = image_part.blob
            content_type = image_part.content_type
            images.append(
                {
                    "rel_id": rel_id,
                    # Short content hash, usable to spot duplicate images
                    "hash": hashlib.md5(image_bytes).hexdigest()[:12],
                    "content_type": content_type,
                    "extension": ext_map.get(content_type, ".png"),
                    "size_bytes": len(image_bytes),
                    "data": image_bytes,  # Raw bytes
                }
            )
        except Exception:
            continue

    return images
|
|
735
|
+
|
|
736
|
+
def _parse_references(
    self,
    blocks: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Collect the reference entries found among the blocks.

    Each block of type "reference-paragraph" yields a dict with ``number``
    (its ``ref_number``, possibly None), ``text`` (its ``ref_text``,
    falling back to the block text) and ``raw`` (the block text).
    """
    return [
        {
            "number": entry.get("ref_number"),
            "text": entry.get("ref_text", entry.get("text", "")),
            "raw": entry.get("text", ""),
        }
        for entry in blocks
        if entry.get("type") == "reference-paragraph"
    ]
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
# Names exported by a star-import of this module.
__all__ = ["WordReader"]
|