yomitoku 0.7.3__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {yomitoku-0.7.3 → yomitoku-0.7.4}/PKG-INFO +1 -1
- {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-table-structure-recognizer-rtdtrv2-open-beta.yaml +1 -1
- {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-detector-dbnet-open-beta.yaml +1 -1
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/cli/main.py +91 -54
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/functions.py +33 -15
- yomitoku-0.7.4/src/yomitoku/export/__init__.py +19 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_csv.py +32 -6
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_html.py +38 -9
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_json.py +37 -18
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_markdown.py +33 -5
- yomitoku-0.7.4/tests/data/sampldoc.tif +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_data.py +6 -4
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_export.py +39 -46
- yomitoku-0.7.3/src/yomitoku/export/__init__.py +0 -15
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/release-drafter.yml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/build-and-publish-docs.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/build-and-publish.yml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/create-release.yml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/lint-and-test.yml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.gitignore +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.pre-commit-config.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/.python-version +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/README.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/README_EN.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-layout-parser-rtdtrv2-open-beta.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-recognizer-parseq-open-beta.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-recognizer-parseq-small-open-beta.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/sample.pdf +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/setting_document_anaysis.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/simple_document_analysis.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/simple_layout.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/simple_ocr.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/text_detector.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/dockerfile +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/assets/logo.svg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/cli.en.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/cli.ja.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/configuration.en.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/configuration.ja.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/index.en.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/index.ja.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/installation.en.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/installation.ja.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/module.en.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/module.ja.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/gallery.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/mkdocs.yml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/pyproject.toml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/pytest.ini +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/scripts/register_hugging_face_hub.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/base.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/cli/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_layout_parser_rtdtrv2.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_text_detector_dbnet.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_text_recognizer_parseq.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_text_recognizer_parseq_small.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/constants.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/dataset.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/document_analyzer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/layout_analyzer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/layout_parser.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/dbnet_plus.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/activate.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/dbnet_feature_attention.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/parseq_transformer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/rtdetr_backbone.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/rtdetr_hybrid_encoder.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/rtdetrv2_decoder.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/parseq.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/rtdetr.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/ocr.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/onnx/.gitkeep +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/dbnet_postporcessor.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/parseq_tokenizer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/rtdetr_postprocessor.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/reading_order.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/resource/MPLUS1p-Medium.ttf +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/resource/charset.txt +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/table_structure_recognizer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/text_detector.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/text_recognizer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/__init__.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/graph.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/logger.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/misc.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/visualizer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/demo.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery1.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery2.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery3.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery4.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery5.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery6.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/logo/horizontal.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/demo_html.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_demo_p1_figure_0.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_0.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_1.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_10.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_2.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_3.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_4.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_5.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_6.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_7.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_8.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_9.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery3_p1_figure_0.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery3_p1_figure_1.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery5_p1_figure_0.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery5_p1_figure_1.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery6_p1_figure_0.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery6_p1_figure_1.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1.html +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1.md +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1_layout.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1_ocr.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/invalid.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/invalid.pdf +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/rgba.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/small.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/subdir/test.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.bmp +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.pdf +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.png +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.tiff +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.txt +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test_gray.jpg +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_base.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_cli.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_document_analyzer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_layout_analyzer.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_ocr.py +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/layout_parser.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/table_structure_recognizer.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/text_detector.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/text_recognizer.yaml +0 -0
- {yomitoku-0.7.3 → yomitoku-0.7.4}/uv.lock +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: yomitoku
|
3
|
-
Version: 0.7.3
|
3
|
+
Version: 0.7.4
|
4
4
|
Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
|
5
5
|
Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
|
6
6
|
License: CC BY-NC-SA 4.0
|
@@ -12,6 +12,7 @@ from ..document_analyzer import DocumentAnalyzer
|
|
12
12
|
from ..utils.logger import set_logger
|
13
13
|
|
14
14
|
from ..export import save_csv, save_html, save_json, save_markdown
|
15
|
+
from ..export import convert_json, convert_csv, convert_html, convert_markdown
|
15
16
|
|
16
17
|
logger = set_logger(__name__, "INFO")
|
17
18
|
|
@@ -51,13 +52,13 @@ def merge_all_pages(results):
|
|
51
52
|
|
52
53
|
def save_merged_file(out_path, args, out):
|
53
54
|
if args.format == "json":
|
54
|
-
save_json(out_path, args.encoding, out)
|
55
|
+
save_json(out, out_path, args.encoding)
|
55
56
|
elif args.format == "csv":
|
56
|
-
save_csv(out_path, args.encoding, out)
|
57
|
+
save_csv(out, out_path, args.encoding)
|
57
58
|
elif args.format == "html":
|
58
|
-
save_html(out_path, args.encoding, out)
|
59
|
+
save_html(out, out_path, args.encoding)
|
59
60
|
elif args.format == "md":
|
60
|
-
save_markdown(out_path, args.encoding, out)
|
61
|
+
save_markdown(out, out_path, args.encoding)
|
61
62
|
|
62
63
|
|
63
64
|
def validate_encoding(encoding):
|
@@ -76,7 +77,7 @@ def process_single_file(args, analyzer, path, format):
|
|
76
77
|
if path.suffix[1:].lower() in ["pdf"]:
|
77
78
|
imgs = load_pdf(path)
|
78
79
|
else:
|
79
|
-
imgs =
|
80
|
+
imgs = load_image(path)
|
80
81
|
|
81
82
|
results = []
|
82
83
|
for page, img in enumerate(imgs):
|
@@ -84,6 +85,10 @@ def process_single_file(args, analyzer, path, format):
|
|
84
85
|
dirname = path.parent.name
|
85
86
|
filename = path.stem
|
86
87
|
|
88
|
+
# cv2.imwrite(
|
89
|
+
# os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
|
90
|
+
# )
|
91
|
+
|
87
92
|
if ocr is not None:
|
88
93
|
out_path = os.path.join(
|
89
94
|
args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
|
@@ -103,34 +108,51 @@ def process_single_file(args, analyzer, path, format):
|
|
103
108
|
out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")
|
104
109
|
|
105
110
|
if format == "json":
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
111
|
+
if args.combine:
|
112
|
+
json = convert_json(
|
113
|
+
result,
|
114
|
+
out_path,
|
115
|
+
args.ignore_line_break,
|
116
|
+
img,
|
117
|
+
args.figure,
|
118
|
+
args.figure_dir,
|
119
|
+
)
|
120
|
+
else:
|
121
|
+
json = result.to_json(
|
122
|
+
out_path,
|
123
|
+
ignore_line_break=args.ignore_line_break,
|
124
|
+
encoding=args.encoding,
|
125
|
+
img=img,
|
126
|
+
export_figure=args.figure,
|
127
|
+
figure_dir=args.figure_dir,
|
128
|
+
)
|
114
129
|
|
115
130
|
results.append(
|
116
131
|
{
|
117
132
|
"format": format,
|
118
|
-
"data": json,
|
133
|
+
"data": json.model_dump(),
|
119
134
|
}
|
120
135
|
)
|
121
136
|
|
122
|
-
if not args.combine:
|
123
|
-
save_json(out_path, args.encoding, json)
|
124
|
-
|
125
137
|
elif format == "csv":
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
138
|
+
if args.combine:
|
139
|
+
csv = convert_csv(
|
140
|
+
result,
|
141
|
+
out_path,
|
142
|
+
args.ignore_line_break,
|
143
|
+
img,
|
144
|
+
args.figure,
|
145
|
+
args.figure_dir,
|
146
|
+
)
|
147
|
+
else:
|
148
|
+
csv = result.to_csv(
|
149
|
+
out_path,
|
150
|
+
ignore_line_break=args.ignore_line_break,
|
151
|
+
encoding=args.encoding,
|
152
|
+
img=img,
|
153
|
+
export_figure=args.figure,
|
154
|
+
figure_dir=args.figure_dir,
|
155
|
+
)
|
134
156
|
|
135
157
|
results.append(
|
136
158
|
{
|
@@ -139,20 +161,29 @@ def process_single_file(args, analyzer, path, format):
|
|
139
161
|
}
|
140
162
|
)
|
141
163
|
|
142
|
-
if not args.combine:
|
143
|
-
save_csv(out_path, args.encoding, csv)
|
144
|
-
|
145
164
|
elif format == "html":
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
165
|
+
if args.combine:
|
166
|
+
html, _ = convert_html(
|
167
|
+
result,
|
168
|
+
out_path,
|
169
|
+
ignore_line_break=args.ignore_line_break,
|
170
|
+
img=img,
|
171
|
+
export_figure=args.figure,
|
172
|
+
export_figure_letter=args.figure_letter,
|
173
|
+
figure_width=args.figure_width,
|
174
|
+
figure_dir=args.figure_dir,
|
175
|
+
)
|
176
|
+
else:
|
177
|
+
html = result.to_html(
|
178
|
+
out_path,
|
179
|
+
ignore_line_break=args.ignore_line_break,
|
180
|
+
img=img,
|
181
|
+
export_figure=args.figure,
|
182
|
+
export_figure_letter=args.figure_letter,
|
183
|
+
figure_width=args.figure_width,
|
184
|
+
figure_dir=args.figure_dir,
|
185
|
+
encoding=args.encoding,
|
186
|
+
)
|
156
187
|
|
157
188
|
results.append(
|
158
189
|
{
|
@@ -161,20 +192,29 @@ def process_single_file(args, analyzer, path, format):
|
|
161
192
|
}
|
162
193
|
)
|
163
194
|
|
164
|
-
if not args.combine:
|
165
|
-
save_html(out_path, args.encoding, html)
|
166
|
-
|
167
195
|
elif format == "md":
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
196
|
+
if args.combine:
|
197
|
+
md, _ = convert_markdown(
|
198
|
+
result,
|
199
|
+
out_path,
|
200
|
+
ignore_line_break=args.ignore_line_break,
|
201
|
+
img=img,
|
202
|
+
export_figure=args.figure,
|
203
|
+
export_figure_letter=args.figure_letter,
|
204
|
+
figure_width=args.figure_width,
|
205
|
+
figure_dir=args.figure_dir,
|
206
|
+
)
|
207
|
+
else:
|
208
|
+
md = result.to_markdown(
|
209
|
+
out_path,
|
210
|
+
ignore_line_break=args.ignore_line_break,
|
211
|
+
img=img,
|
212
|
+
export_figure=args.figure,
|
213
|
+
export_figure_letter=args.figure_letter,
|
214
|
+
figure_width=args.figure_width,
|
215
|
+
figure_dir=args.figure_dir,
|
216
|
+
encoding=args.encoding,
|
217
|
+
)
|
178
218
|
|
179
219
|
results.append(
|
180
220
|
{
|
@@ -183,9 +223,6 @@ def process_single_file(args, analyzer, path, format):
|
|
183
223
|
}
|
184
224
|
)
|
185
225
|
|
186
|
-
if not args.combine:
|
187
|
-
save_markdown(out_path, args.encoding, md)
|
188
|
-
|
189
226
|
out = merge_all_pages(results)
|
190
227
|
if args.combine:
|
191
228
|
out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
|
3
3
|
import cv2
|
4
|
+
from PIL import Image
|
4
5
|
import numpy as np
|
5
6
|
import torch
|
6
7
|
import pypdfium2
|
@@ -15,6 +16,20 @@ from ..utils.logger import set_logger
|
|
15
16
|
logger = set_logger(__name__)
|
16
17
|
|
17
18
|
|
19
|
+
def validate_image(img: np.ndarray):
|
20
|
+
h, w = img.shape[:2]
|
21
|
+
if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
|
22
|
+
raise ValueError("Image size is too small.")
|
23
|
+
|
24
|
+
if min(h, w) < WARNING_IMAGE_SIZE:
|
25
|
+
logger.warning(
|
26
|
+
"""
|
27
|
+
The image size is small, which may result in reduced OCR accuracy.
|
28
|
+
The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
|
29
|
+
"""
|
30
|
+
)
|
31
|
+
|
32
|
+
|
18
33
|
def load_image(image_path: str) -> np.ndarray:
|
19
34
|
"""
|
20
35
|
Open an image file.
|
@@ -40,24 +55,27 @@ def load_image(image_path: str) -> np.ndarray:
|
|
40
55
|
"PDF file is not supported by load_image(). Use load_pdf() instead."
|
41
56
|
)
|
42
57
|
|
43
|
-
|
44
|
-
|
45
|
-
|
58
|
+
try:
|
59
|
+
img = Image.open(image_path)
|
60
|
+
except Exception:
|
46
61
|
raise ValueError("Invalid image data.")
|
47
62
|
|
48
|
-
|
49
|
-
if
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
63
|
+
pages = []
|
64
|
+
if ext in ["tif", "tiff"]:
|
65
|
+
try:
|
66
|
+
while True:
|
67
|
+
img_arr = np.array(img.copy().convert("RGB"))
|
68
|
+
validate_image(img_arr)
|
69
|
+
pages.append(img_arr[:, :, ::-1])
|
70
|
+
img.seek(img.tell() + 1)
|
71
|
+
except EOFError:
|
72
|
+
pass
|
73
|
+
else:
|
74
|
+
img_arr = np.array(img.convert("RGB"))
|
75
|
+
validate_image(img_arr)
|
76
|
+
pages.append(img_arr[:, :, ::-1])
|
59
77
|
|
60
|
-
return
|
78
|
+
return pages
|
61
79
|
|
62
80
|
|
63
81
|
def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
|
@@ -0,0 +1,19 @@
|
|
1
|
+
from .export_csv import export_csv, save_csv, convert_csv
|
2
|
+
from .export_html import export_html, save_html, convert_html
|
3
|
+
from .export_json import export_json, save_json, convert_json
|
4
|
+
from .export_markdown import export_markdown, save_markdown, convert_markdown
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"export_html",
|
8
|
+
"export_markdown",
|
9
|
+
"export_csv",
|
10
|
+
"export_json",
|
11
|
+
"save_html",
|
12
|
+
"save_markdown",
|
13
|
+
"save_csv",
|
14
|
+
"save_json",
|
15
|
+
"convert_html",
|
16
|
+
"convert_markdown",
|
17
|
+
"convert_csv",
|
18
|
+
"convert_json",
|
19
|
+
]
|
@@ -57,11 +57,10 @@ def save_figure(
|
|
57
57
|
cv2.imwrite(figure_path, figure_img)
|
58
58
|
|
59
59
|
|
60
|
-
def export_csv(
|
60
|
+
def convert_csv(
|
61
61
|
inputs,
|
62
|
-
out_path
|
63
|
-
ignore_line_break
|
64
|
-
encoding: str = "utf-8",
|
62
|
+
out_path,
|
63
|
+
ignore_line_break,
|
65
64
|
img=None,
|
66
65
|
export_figure: bool = True,
|
67
66
|
figure_dir="figures",
|
@@ -90,6 +89,8 @@ def export_csv(
|
|
90
89
|
}
|
91
90
|
)
|
92
91
|
|
92
|
+
elements = sorted(elements, key=lambda x: x["order"])
|
93
|
+
|
93
94
|
if export_figure:
|
94
95
|
save_figure(
|
95
96
|
inputs.figures,
|
@@ -98,11 +99,36 @@ def export_csv(
|
|
98
99
|
figure_dir=figure_dir,
|
99
100
|
)
|
100
101
|
|
101
|
-
elements = sorted(elements, key=lambda x: x["order"])
|
102
102
|
return elements
|
103
103
|
|
104
104
|
|
105
|
-
def
|
105
|
+
def export_csv(
|
106
|
+
inputs,
|
107
|
+
out_path: str,
|
108
|
+
ignore_line_break: bool = False,
|
109
|
+
encoding: str = "utf-8",
|
110
|
+
img=None,
|
111
|
+
export_figure: bool = True,
|
112
|
+
figure_dir="figures",
|
113
|
+
):
|
114
|
+
elements = convert_csv(
|
115
|
+
inputs,
|
116
|
+
out_path,
|
117
|
+
ignore_line_break,
|
118
|
+
img,
|
119
|
+
export_figure,
|
120
|
+
figure_dir,
|
121
|
+
)
|
122
|
+
|
123
|
+
save_csv(elements, out_path, encoding)
|
124
|
+
return elements
|
125
|
+
|
126
|
+
|
127
|
+
def save_csv(
|
128
|
+
elements,
|
129
|
+
out_path,
|
130
|
+
encoding,
|
131
|
+
):
|
106
132
|
with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
|
107
133
|
writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
108
134
|
for element in elements:
|
@@ -146,16 +146,15 @@ def figure_to_html(
|
|
146
146
|
return elements
|
147
147
|
|
148
148
|
|
149
|
-
def export_html(
|
149
|
+
def convert_html(
|
150
150
|
inputs,
|
151
|
-
out_path
|
152
|
-
ignore_line_break
|
153
|
-
export_figure
|
154
|
-
export_figure_letter
|
151
|
+
out_path,
|
152
|
+
ignore_line_break,
|
153
|
+
export_figure,
|
154
|
+
export_figure_letter,
|
155
155
|
img=None,
|
156
156
|
figure_width=200,
|
157
157
|
figure_dir="figures",
|
158
|
-
encoding: str = "utf-8",
|
159
158
|
):
|
160
159
|
html_string = ""
|
161
160
|
elements = []
|
@@ -181,13 +180,43 @@ def export_html(
|
|
181
180
|
elements = sorted(elements, key=lambda x: x["order"])
|
182
181
|
|
183
182
|
html_string = "".join([element["html"] for element in elements])
|
184
|
-
# html_string = add_html_tag(html_string)
|
185
|
-
|
186
183
|
parsed_html = html.fromstring(html_string)
|
187
184
|
formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")
|
185
|
+
|
186
|
+
return formatted_html, elements
|
187
|
+
|
188
|
+
|
189
|
+
def export_html(
|
190
|
+
inputs,
|
191
|
+
out_path: str,
|
192
|
+
ignore_line_break: bool = False,
|
193
|
+
export_figure: bool = True,
|
194
|
+
export_figure_letter: bool = False,
|
195
|
+
img=None,
|
196
|
+
figure_width=200,
|
197
|
+
figure_dir="figures",
|
198
|
+
encoding: str = "utf-8",
|
199
|
+
):
|
200
|
+
formatted_html, elements = convert_html(
|
201
|
+
inputs,
|
202
|
+
out_path,
|
203
|
+
ignore_line_break,
|
204
|
+
export_figure,
|
205
|
+
export_figure_letter,
|
206
|
+
img,
|
207
|
+
figure_width,
|
208
|
+
figure_dir,
|
209
|
+
)
|
210
|
+
|
211
|
+
save_html(formatted_html, out_path, encoding)
|
212
|
+
|
188
213
|
return formatted_html
|
189
214
|
|
190
215
|
|
191
|
-
def save_html(
|
216
|
+
def save_html(
|
217
|
+
html,
|
218
|
+
out_path,
|
219
|
+
encoding,
|
220
|
+
):
|
192
221
|
with open(out_path, "w", encoding=encoding, errors="ignore") as f:
|
193
222
|
f.write(html)
|
@@ -36,15 +36,7 @@ def save_figure(
|
|
36
36
|
cv2.imwrite(figure_path, figure_img)
|
37
37
|
|
38
38
|
|
39
|
-
def export_json(
|
40
|
-
inputs,
|
41
|
-
out_path,
|
42
|
-
ignore_line_break=False,
|
43
|
-
encoding: str = "utf-8",
|
44
|
-
img=None,
|
45
|
-
export_figure=False,
|
46
|
-
figure_dir="figures",
|
47
|
-
):
|
39
|
+
def convert_json(inputs, out_path, ignore_line_break, img, export_figure, figure_dir):
|
48
40
|
from yomitoku.document_analyzer import DocumentAnalyzerSchema
|
49
41
|
|
50
42
|
if isinstance(inputs, DocumentAnalyzerSchema):
|
@@ -55,18 +47,45 @@ def export_json(
|
|
55
47
|
for paragraph in inputs.paragraphs:
|
56
48
|
paragraph_to_json(paragraph, ignore_line_break)
|
57
49
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
50
|
+
if isinstance(inputs, DocumentAnalyzerSchema) and export_figure:
|
51
|
+
save_figure(
|
52
|
+
inputs.figures,
|
53
|
+
img,
|
54
|
+
out_path,
|
55
|
+
figure_dir=figure_dir,
|
56
|
+
)
|
57
|
+
|
58
|
+
return inputs
|
59
|
+
|
60
|
+
|
61
|
+
def export_json(
|
62
|
+
inputs,
|
63
|
+
out_path,
|
64
|
+
ignore_line_break=False,
|
65
|
+
encoding: str = "utf-8",
|
66
|
+
img=None,
|
67
|
+
export_figure=False,
|
68
|
+
figure_dir="figures",
|
69
|
+
):
|
70
|
+
inputs = convert_json(
|
71
|
+
inputs,
|
72
|
+
out_path,
|
73
|
+
ignore_line_break,
|
74
|
+
img,
|
75
|
+
export_figure,
|
76
|
+
figure_dir,
|
77
|
+
)
|
78
|
+
|
79
|
+
save_json(
|
80
|
+
inputs.model_dump(),
|
81
|
+
out_path,
|
82
|
+
encoding,
|
83
|
+
)
|
65
84
|
|
66
|
-
return inputs
|
85
|
+
return inputs
|
67
86
|
|
68
87
|
|
69
|
-
def save_json(out_path, encoding, data):
|
88
|
+
def save_json(data, out_path, encoding):
|
70
89
|
with open(out_path, "w", encoding=encoding, errors="ignore") as f:
|
71
90
|
json.dump(
|
72
91
|
data,
|
@@ -111,16 +111,15 @@ def figure_to_md(
|
|
111
111
|
return elements
|
112
112
|
|
113
113
|
|
114
|
-
def export_markdown(
|
114
|
+
def convert_markdown(
|
115
115
|
inputs,
|
116
|
-
out_path
|
116
|
+
out_path,
|
117
|
+
ignore_line_break=False,
|
117
118
|
img=None,
|
118
|
-
ignore_line_break: bool = False,
|
119
119
|
export_figure_letter=False,
|
120
120
|
export_figure=True,
|
121
121
|
figure_width=200,
|
122
122
|
figure_dir="figures",
|
123
|
-
encoding: str = "utf-8",
|
124
123
|
):
|
125
124
|
elements = []
|
126
125
|
for table in inputs.tables:
|
@@ -144,10 +143,39 @@ def export_markdown(
|
|
144
143
|
|
145
144
|
elements = sorted(elements, key=lambda x: x["order"])
|
146
145
|
markdown = "\n".join([element["md"] for element in elements])
|
146
|
+
return markdown, elements
|
147
|
+
|
147
148
|
|
149
|
+
def export_markdown(
|
150
|
+
inputs,
|
151
|
+
out_path: str,
|
152
|
+
ignore_line_break: bool = False,
|
153
|
+
img=None,
|
154
|
+
export_figure_letter=False,
|
155
|
+
export_figure=True,
|
156
|
+
figure_width=200,
|
157
|
+
figure_dir="figures",
|
158
|
+
encoding: str = "utf-8",
|
159
|
+
):
|
160
|
+
markdown, elements = convert_markdown(
|
161
|
+
inputs,
|
162
|
+
out_path,
|
163
|
+
ignore_line_break,
|
164
|
+
img,
|
165
|
+
export_figure_letter,
|
166
|
+
export_figure,
|
167
|
+
figure_width,
|
168
|
+
figure_dir,
|
169
|
+
)
|
170
|
+
|
171
|
+
save_markdown(markdown, out_path, encoding)
|
148
172
|
return markdown
|
149
173
|
|
150
174
|
|
151
|
-
def save_markdown(
|
175
|
+
def save_markdown(
|
176
|
+
markdown,
|
177
|
+
out_path,
|
178
|
+
encoding,
|
179
|
+
):
|
152
180
|
with open(out_path, "w", encoding=encoding, errors="ignore") as f:
|
153
181
|
f.write(markdown)
|
Binary file
|
@@ -36,14 +36,16 @@ def test_load_image():
|
|
36
36
|
"tests/data/test.bmp",
|
37
37
|
"tests/data/test_gray.jpg",
|
38
38
|
"tests/data/rgba.png",
|
39
|
+
"tests/data/sampldoc.tif",
|
39
40
|
]
|
40
41
|
|
41
42
|
for target in targets:
|
42
43
|
image = load_image(target)
|
43
|
-
assert image
|
44
|
-
assert image.shape[
|
45
|
-
assert image.shape[
|
46
|
-
assert image.
|
44
|
+
assert len(image) >= 1
|
45
|
+
assert image[0].shape[2] == 3
|
46
|
+
assert image[0].shape[0] > 32
|
47
|
+
assert image[0].shape[1] > 32
|
48
|
+
assert image[0].dtype == "uint8"
|
47
49
|
|
48
50
|
|
49
51
|
def test_load_pdf():
|