yomitoku 0.7.3__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. {yomitoku-0.7.3 → yomitoku-0.7.4}/PKG-INFO +1 -1
  2. {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-table-structure-recognizer-rtdtrv2-open-beta.yaml +1 -1
  3. {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-detector-dbnet-open-beta.yaml +1 -1
  4. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/cli/main.py +91 -54
  5. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/functions.py +33 -15
  6. yomitoku-0.7.4/src/yomitoku/export/__init__.py +19 -0
  7. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_csv.py +32 -6
  8. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_html.py +38 -9
  9. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_json.py +37 -18
  10. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_markdown.py +33 -5
  11. yomitoku-0.7.4/tests/data/sampldoc.tif +0 -0
  12. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_data.py +6 -4
  13. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_export.py +39 -46
  14. yomitoku-0.7.3/src/yomitoku/export/__init__.py +0 -15
  15. {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/release-drafter.yml +0 -0
  16. {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/build-and-publish-docs.yaml +0 -0
  17. {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/build-and-publish.yml +0 -0
  18. {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/create-release.yml +0 -0
  19. {yomitoku-0.7.3 → yomitoku-0.7.4}/.github/workflows/lint-and-test.yml +0 -0
  20. {yomitoku-0.7.3 → yomitoku-0.7.4}/.gitignore +0 -0
  21. {yomitoku-0.7.3 → yomitoku-0.7.4}/.pre-commit-config.yaml +0 -0
  22. {yomitoku-0.7.3 → yomitoku-0.7.4}/.python-version +0 -0
  23. {yomitoku-0.7.3 → yomitoku-0.7.4}/README.md +0 -0
  24. {yomitoku-0.7.3 → yomitoku-0.7.4}/README_EN.md +0 -0
  25. {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-layout-parser-rtdtrv2-open-beta.yaml +0 -0
  26. {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-recognizer-parseq-open-beta.yaml +0 -0
  27. {yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-recognizer-parseq-small-open-beta.yaml +0 -0
  28. {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/sample.pdf +0 -0
  29. {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/setting_document_anaysis.py +0 -0
  30. {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/simple_document_analysis.py +0 -0
  31. {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/simple_layout.py +0 -0
  32. {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/simple_ocr.py +0 -0
  33. {yomitoku-0.7.3 → yomitoku-0.7.4}/demo/text_detector.yaml +0 -0
  34. {yomitoku-0.7.3 → yomitoku-0.7.4}/dockerfile +0 -0
  35. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/assets/logo.svg +0 -0
  36. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/cli.en.md +0 -0
  37. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/cli.ja.md +0 -0
  38. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/configuration.en.md +0 -0
  39. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/configuration.ja.md +0 -0
  40. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/index.en.md +0 -0
  41. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/index.ja.md +0 -0
  42. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/installation.en.md +0 -0
  43. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/installation.ja.md +0 -0
  44. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/module.en.md +0 -0
  45. {yomitoku-0.7.3 → yomitoku-0.7.4}/docs/module.ja.md +0 -0
  46. {yomitoku-0.7.3 → yomitoku-0.7.4}/gallery.md +0 -0
  47. {yomitoku-0.7.3 → yomitoku-0.7.4}/mkdocs.yml +0 -0
  48. {yomitoku-0.7.3 → yomitoku-0.7.4}/pyproject.toml +0 -0
  49. {yomitoku-0.7.3 → yomitoku-0.7.4}/pytest.ini +0 -0
  50. {yomitoku-0.7.3 → yomitoku-0.7.4}/scripts/register_hugging_face_hub.py +0 -0
  51. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/__init__.py +0 -0
  52. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/base.py +0 -0
  53. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/cli/__init__.py +0 -0
  54. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/__init__.py +0 -0
  55. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_layout_parser_rtdtrv2.py +0 -0
  56. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py +0 -0
  57. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_text_detector_dbnet.py +0 -0
  58. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_text_recognizer_parseq.py +0 -0
  59. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/configs/cfg_text_recognizer_parseq_small.py +0 -0
  60. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/constants.py +0 -0
  61. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/__init__.py +0 -0
  62. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/dataset.py +0 -0
  63. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/document_analyzer.py +0 -0
  64. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/layout_analyzer.py +0 -0
  65. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/layout_parser.py +0 -0
  66. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/__init__.py +0 -0
  67. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/dbnet_plus.py +0 -0
  68. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/__init__.py +0 -0
  69. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/activate.py +0 -0
  70. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/dbnet_feature_attention.py +0 -0
  71. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/parseq_transformer.py +0 -0
  72. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/rtdetr_backbone.py +0 -0
  73. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/rtdetr_hybrid_encoder.py +0 -0
  74. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/layers/rtdetrv2_decoder.py +0 -0
  75. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/parseq.py +0 -0
  76. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/models/rtdetr.py +0 -0
  77. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/ocr.py +0 -0
  78. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/onnx/.gitkeep +0 -0
  79. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/__init__.py +0 -0
  80. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/dbnet_postporcessor.py +0 -0
  81. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/parseq_tokenizer.py +0 -0
  82. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/postprocessor/rtdetr_postprocessor.py +0 -0
  83. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/reading_order.py +0 -0
  84. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/resource/MPLUS1p-Medium.ttf +0 -0
  85. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/resource/charset.txt +0 -0
  86. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/table_structure_recognizer.py +0 -0
  87. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/text_detector.py +0 -0
  88. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/text_recognizer.py +0 -0
  89. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/__init__.py +0 -0
  90. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/graph.py +0 -0
  91. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/logger.py +0 -0
  92. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/misc.py +0 -0
  93. {yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/utils/visualizer.py +0 -0
  94. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/demo.jpg +0 -0
  95. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery1.jpg +0 -0
  96. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery2.jpg +0 -0
  97. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery3.jpg +0 -0
  98. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery4.jpg +0 -0
  99. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery5.jpg +0 -0
  100. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/in/gallery6.jpg +0 -0
  101. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/logo/horizontal.png +0 -0
  102. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/demo_html.png +0 -0
  103. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_demo_p1_figure_0.png +0 -0
  104. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_0.png +0 -0
  105. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_1.png +0 -0
  106. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_10.png +0 -0
  107. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_2.png +0 -0
  108. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_3.png +0 -0
  109. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_4.png +0 -0
  110. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_5.png +0 -0
  111. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_6.png +0 -0
  112. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_7.png +0 -0
  113. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_8.png +0 -0
  114. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery1_p1_figure_9.png +0 -0
  115. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery3_p1_figure_0.png +0 -0
  116. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery3_p1_figure_1.png +0 -0
  117. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery5_p1_figure_0.png +0 -0
  118. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery5_p1_figure_1.png +0 -0
  119. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery6_p1_figure_0.png +0 -0
  120. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/figures/in_gallery6_p1_figure_1.png +0 -0
  121. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1.html +0 -0
  122. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1.md +0 -0
  123. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1_layout.jpg +0 -0
  124. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_demo_p1_ocr.jpg +0 -0
  125. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1.html +0 -0
  126. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1.md +0 -0
  127. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1_layout.jpg +0 -0
  128. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery1_p1_ocr.jpg +0 -0
  129. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1.html +0 -0
  130. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1.md +0 -0
  131. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1_layout.jpg +0 -0
  132. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery2_p1_ocr.jpg +0 -0
  133. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1.html +0 -0
  134. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1.md +0 -0
  135. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1_layout.jpg +0 -0
  136. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery3_p1_ocr.jpg +0 -0
  137. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1.html +0 -0
  138. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1.md +0 -0
  139. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1_layout.jpg +0 -0
  140. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery4_p1_ocr.jpg +0 -0
  141. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1.html +0 -0
  142. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1.md +0 -0
  143. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1_layout.jpg +0 -0
  144. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery5_p1_ocr.jpg +0 -0
  145. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1.html +0 -0
  146. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1.md +0 -0
  147. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1_layout.jpg +0 -0
  148. {yomitoku-0.7.3 → yomitoku-0.7.4}/static/out/in_gallery6_p1_ocr.jpg +0 -0
  149. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/invalid.jpg +0 -0
  150. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/invalid.pdf +0 -0
  151. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/rgba.png +0 -0
  152. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/small.jpg +0 -0
  153. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/subdir/test.jpg +0 -0
  154. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.bmp +0 -0
  155. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.jpg +0 -0
  156. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.pdf +0 -0
  157. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.png +0 -0
  158. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.tiff +0 -0
  159. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test.txt +0 -0
  160. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/data/test_gray.jpg +0 -0
  161. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_base.py +0 -0
  162. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_cli.py +0 -0
  163. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_document_analyzer.py +0 -0
  164. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_layout_analyzer.py +0 -0
  165. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_ocr.py +0 -0
  166. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/layout_parser.yaml +0 -0
  167. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/table_structure_recognizer.yaml +0 -0
  168. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/text_detector.yaml +0 -0
  169. {yomitoku-0.7.3 → yomitoku-0.7.4}/tests/yaml/text_recognizer.yaml +0 -0
  170. {yomitoku-0.7.3 → yomitoku-0.7.4}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: yomitoku
3
- Version: 0.7.3
3
+ Version: 0.7.4
4
4
  Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
5
5
  Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
6
6
  License: CC BY-NC-SA 4.0
@@ -64,4 +64,4 @@ RTDETRTransformerv2:
64
64
  category:
65
65
  - row
66
66
  - col
67
- - span
67
+ - span
@@ -27,4 +27,4 @@ visualize:
27
27
  - 0
28
28
  - 255
29
29
  - 0
30
- heatmap: false
30
+ heatmap: false
@@ -12,6 +12,7 @@ from ..document_analyzer import DocumentAnalyzer
12
12
  from ..utils.logger import set_logger
13
13
 
14
14
  from ..export import save_csv, save_html, save_json, save_markdown
15
+ from ..export import convert_json, convert_csv, convert_html, convert_markdown
15
16
 
16
17
  logger = set_logger(__name__, "INFO")
17
18
 
@@ -51,13 +52,13 @@ def merge_all_pages(results):
51
52
 
52
53
  def save_merged_file(out_path, args, out):
53
54
  if args.format == "json":
54
- save_json(out_path, args.encoding, out)
55
+ save_json(out, out_path, args.encoding)
55
56
  elif args.format == "csv":
56
- save_csv(out_path, args.encoding, out)
57
+ save_csv(out, out_path, args.encoding)
57
58
  elif args.format == "html":
58
- save_html(out_path, args.encoding, out)
59
+ save_html(out, out_path, args.encoding)
59
60
  elif args.format == "md":
60
- save_markdown(out_path, args.encoding, out)
61
+ save_markdown(out, out_path, args.encoding)
61
62
 
62
63
 
63
64
  def validate_encoding(encoding):
@@ -76,7 +77,7 @@ def process_single_file(args, analyzer, path, format):
76
77
  if path.suffix[1:].lower() in ["pdf"]:
77
78
  imgs = load_pdf(path)
78
79
  else:
79
- imgs = [load_image(path)]
80
+ imgs = load_image(path)
80
81
 
81
82
  results = []
82
83
  for page, img in enumerate(imgs):
@@ -84,6 +85,10 @@ def process_single_file(args, analyzer, path, format):
84
85
  dirname = path.parent.name
85
86
  filename = path.stem
86
87
 
88
+ # cv2.imwrite(
89
+ # os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
90
+ # )
91
+
87
92
  if ocr is not None:
88
93
  out_path = os.path.join(
89
94
  args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
@@ -103,34 +108,51 @@ def process_single_file(args, analyzer, path, format):
103
108
  out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")
104
109
 
105
110
  if format == "json":
106
- json = result.to_json(
107
- out_path,
108
- ignore_line_break=args.ignore_line_break,
109
- encoding=args.encoding,
110
- img=img,
111
- export_figure=args.figure,
112
- figure_dir=args.figure_dir,
113
- )
111
+ if args.combine:
112
+ json = convert_json(
113
+ result,
114
+ out_path,
115
+ args.ignore_line_break,
116
+ img,
117
+ args.figure,
118
+ args.figure_dir,
119
+ )
120
+ else:
121
+ json = result.to_json(
122
+ out_path,
123
+ ignore_line_break=args.ignore_line_break,
124
+ encoding=args.encoding,
125
+ img=img,
126
+ export_figure=args.figure,
127
+ figure_dir=args.figure_dir,
128
+ )
114
129
 
115
130
  results.append(
116
131
  {
117
132
  "format": format,
118
- "data": json,
133
+ "data": json.model_dump(),
119
134
  }
120
135
  )
121
136
 
122
- if not args.combine:
123
- save_json(out_path, args.encoding, json)
124
-
125
137
  elif format == "csv":
126
- csv = result.to_csv(
127
- out_path,
128
- ignore_line_break=args.ignore_line_break,
129
- encoding=args.encoding,
130
- img=img,
131
- export_figure=args.figure,
132
- figure_dir=args.figure_dir,
133
- )
138
+ if args.combine:
139
+ csv = convert_csv(
140
+ result,
141
+ out_path,
142
+ args.ignore_line_break,
143
+ img,
144
+ args.figure,
145
+ args.figure_dir,
146
+ )
147
+ else:
148
+ csv = result.to_csv(
149
+ out_path,
150
+ ignore_line_break=args.ignore_line_break,
151
+ encoding=args.encoding,
152
+ img=img,
153
+ export_figure=args.figure,
154
+ figure_dir=args.figure_dir,
155
+ )
134
156
 
135
157
  results.append(
136
158
  {
@@ -139,20 +161,29 @@ def process_single_file(args, analyzer, path, format):
139
161
  }
140
162
  )
141
163
 
142
- if not args.combine:
143
- save_csv(out_path, args.encoding, csv)
144
-
145
164
  elif format == "html":
146
- html = result.to_html(
147
- out_path,
148
- ignore_line_break=args.ignore_line_break,
149
- img=img,
150
- export_figure=args.figure,
151
- export_figure_letter=args.figure_letter,
152
- figure_width=args.figure_width,
153
- figure_dir=args.figure_dir,
154
- encoding=args.encoding,
155
- )
165
+ if args.combine:
166
+ html, _ = convert_html(
167
+ result,
168
+ out_path,
169
+ ignore_line_break=args.ignore_line_break,
170
+ img=img,
171
+ export_figure=args.figure,
172
+ export_figure_letter=args.figure_letter,
173
+ figure_width=args.figure_width,
174
+ figure_dir=args.figure_dir,
175
+ )
176
+ else:
177
+ html = result.to_html(
178
+ out_path,
179
+ ignore_line_break=args.ignore_line_break,
180
+ img=img,
181
+ export_figure=args.figure,
182
+ export_figure_letter=args.figure_letter,
183
+ figure_width=args.figure_width,
184
+ figure_dir=args.figure_dir,
185
+ encoding=args.encoding,
186
+ )
156
187
 
157
188
  results.append(
158
189
  {
@@ -161,20 +192,29 @@ def process_single_file(args, analyzer, path, format):
161
192
  }
162
193
  )
163
194
 
164
- if not args.combine:
165
- save_html(out_path, args.encoding, html)
166
-
167
195
  elif format == "md":
168
- md = result.to_markdown(
169
- out_path,
170
- ignore_line_break=args.ignore_line_break,
171
- img=img,
172
- export_figure=args.figure,
173
- export_figure_letter=args.figure_letter,
174
- figure_width=args.figure_width,
175
- figure_dir=args.figure_dir,
176
- encoding=args.encoding,
177
- )
196
+ if args.combine:
197
+ md, _ = convert_markdown(
198
+ result,
199
+ out_path,
200
+ ignore_line_break=args.ignore_line_break,
201
+ img=img,
202
+ export_figure=args.figure,
203
+ export_figure_letter=args.figure_letter,
204
+ figure_width=args.figure_width,
205
+ figure_dir=args.figure_dir,
206
+ )
207
+ else:
208
+ md = result.to_markdown(
209
+ out_path,
210
+ ignore_line_break=args.ignore_line_break,
211
+ img=img,
212
+ export_figure=args.figure,
213
+ export_figure_letter=args.figure_letter,
214
+ figure_width=args.figure_width,
215
+ figure_dir=args.figure_dir,
216
+ encoding=args.encoding,
217
+ )
178
218
 
179
219
  results.append(
180
220
  {
@@ -183,9 +223,6 @@ def process_single_file(args, analyzer, path, format):
183
223
  }
184
224
  )
185
225
 
186
- if not args.combine:
187
- save_markdown(out_path, args.encoding, md)
188
-
189
226
  out = merge_all_pages(results)
190
227
  if args.combine:
191
228
  out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
@@ -1,6 +1,7 @@
1
1
  from pathlib import Path
2
2
 
3
3
  import cv2
4
+ from PIL import Image
4
5
  import numpy as np
5
6
  import torch
6
7
  import pypdfium2
@@ -15,6 +16,20 @@ from ..utils.logger import set_logger
15
16
  logger = set_logger(__name__)
16
17
 
17
18
 
19
+ def validate_image(img: np.ndarray):
20
+ h, w = img.shape[:2]
21
+ if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
22
+ raise ValueError("Image size is too small.")
23
+
24
+ if min(h, w) < WARNING_IMAGE_SIZE:
25
+ logger.warning(
26
+ """
27
+ The image size is small, which may result in reduced OCR accuracy.
28
+ The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
29
+ """
30
+ )
31
+
32
+
18
33
  def load_image(image_path: str) -> np.ndarray:
19
34
  """
20
35
  Open an image file.
@@ -40,24 +55,27 @@ def load_image(image_path: str) -> np.ndarray:
40
55
  "PDF file is not supported by load_image(). Use load_pdf() instead."
41
56
  )
42
57
 
43
- img = cv2.imread(image_path, cv2.IMREAD_COLOR)
44
-
45
- if img is None:
58
+ try:
59
+ img = Image.open(image_path)
60
+ except Exception:
46
61
  raise ValueError("Invalid image data.")
47
62
 
48
- h, w = img.shape[:2]
49
- if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
50
- raise ValueError("Image size is too small.")
51
-
52
- if min(h, w) < WARNING_IMAGE_SIZE:
53
- logger.warning(
54
- """
55
- The image size is small, which may result in reduced OCR accuracy.
56
- The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
57
- """
58
- )
63
+ pages = []
64
+ if ext in ["tif", "tiff"]:
65
+ try:
66
+ while True:
67
+ img_arr = np.array(img.copy().convert("RGB"))
68
+ validate_image(img_arr)
69
+ pages.append(img_arr[:, :, ::-1])
70
+ img.seek(img.tell() + 1)
71
+ except EOFError:
72
+ pass
73
+ else:
74
+ img_arr = np.array(img.convert("RGB"))
75
+ validate_image(img_arr)
76
+ pages.append(img_arr[:, :, ::-1])
59
77
 
60
- return img
78
+ return pages
61
79
 
62
80
 
63
81
  def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
@@ -0,0 +1,19 @@
1
+ from .export_csv import export_csv, save_csv, convert_csv
2
+ from .export_html import export_html, save_html, convert_html
3
+ from .export_json import export_json, save_json, convert_json
4
+ from .export_markdown import export_markdown, save_markdown, convert_markdown
5
+
6
+ __all__ = [
7
+ "export_html",
8
+ "export_markdown",
9
+ "export_csv",
10
+ "export_json",
11
+ "save_html",
12
+ "save_markdown",
13
+ "save_csv",
14
+ "save_json",
15
+ "convert_html",
16
+ "convert_markdown",
17
+ "convert_csv",
18
+ "convert_json",
19
+ ]
@@ -57,11 +57,10 @@ def save_figure(
57
57
  cv2.imwrite(figure_path, figure_img)
58
58
 
59
59
 
60
- def export_csv(
60
+ def convert_csv(
61
61
  inputs,
62
- out_path: str,
63
- ignore_line_break: bool = False,
64
- encoding: str = "utf-8",
62
+ out_path,
63
+ ignore_line_break,
65
64
  img=None,
66
65
  export_figure: bool = True,
67
66
  figure_dir="figures",
@@ -90,6 +89,8 @@ def export_csv(
90
89
  }
91
90
  )
92
91
 
92
+ elements = sorted(elements, key=lambda x: x["order"])
93
+
93
94
  if export_figure:
94
95
  save_figure(
95
96
  inputs.figures,
@@ -98,11 +99,36 @@ def export_csv(
98
99
  figure_dir=figure_dir,
99
100
  )
100
101
 
101
- elements = sorted(elements, key=lambda x: x["order"])
102
102
  return elements
103
103
 
104
104
 
105
- def save_csv(out_path, encoding, elements):
105
+ def export_csv(
106
+ inputs,
107
+ out_path: str,
108
+ ignore_line_break: bool = False,
109
+ encoding: str = "utf-8",
110
+ img=None,
111
+ export_figure: bool = True,
112
+ figure_dir="figures",
113
+ ):
114
+ elements = convert_csv(
115
+ inputs,
116
+ out_path,
117
+ ignore_line_break,
118
+ img,
119
+ export_figure,
120
+ figure_dir,
121
+ )
122
+
123
+ save_csv(elements, out_path, encoding)
124
+ return elements
125
+
126
+
127
+ def save_csv(
128
+ elements,
129
+ out_path,
130
+ encoding,
131
+ ):
106
132
  with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
107
133
  writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
108
134
  for element in elements:
@@ -146,16 +146,15 @@ def figure_to_html(
146
146
  return elements
147
147
 
148
148
 
149
- def export_html(
149
+ def convert_html(
150
150
  inputs,
151
- out_path: str,
152
- ignore_line_break: bool = False,
153
- export_figure: bool = True,
154
- export_figure_letter: bool = False,
151
+ out_path,
152
+ ignore_line_break,
153
+ export_figure,
154
+ export_figure_letter,
155
155
  img=None,
156
156
  figure_width=200,
157
157
  figure_dir="figures",
158
- encoding: str = "utf-8",
159
158
  ):
160
159
  html_string = ""
161
160
  elements = []
@@ -181,13 +180,43 @@ def export_html(
181
180
  elements = sorted(elements, key=lambda x: x["order"])
182
181
 
183
182
  html_string = "".join([element["html"] for element in elements])
184
- # html_string = add_html_tag(html_string)
185
-
186
183
  parsed_html = html.fromstring(html_string)
187
184
  formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")
185
+
186
+ return formatted_html, elements
187
+
188
+
189
+ def export_html(
190
+ inputs,
191
+ out_path: str,
192
+ ignore_line_break: bool = False,
193
+ export_figure: bool = True,
194
+ export_figure_letter: bool = False,
195
+ img=None,
196
+ figure_width=200,
197
+ figure_dir="figures",
198
+ encoding: str = "utf-8",
199
+ ):
200
+ formatted_html, elements = convert_html(
201
+ inputs,
202
+ out_path,
203
+ ignore_line_break,
204
+ export_figure,
205
+ export_figure_letter,
206
+ img,
207
+ figure_width,
208
+ figure_dir,
209
+ )
210
+
211
+ save_html(formatted_html, out_path, encoding)
212
+
188
213
  return formatted_html
189
214
 
190
215
 
191
- def save_html(out_path, encoding, html):
216
+ def save_html(
217
+ html,
218
+ out_path,
219
+ encoding,
220
+ ):
192
221
  with open(out_path, "w", encoding=encoding, errors="ignore") as f:
193
222
  f.write(html)
@@ -36,15 +36,7 @@ def save_figure(
36
36
  cv2.imwrite(figure_path, figure_img)
37
37
 
38
38
 
39
- def export_json(
40
- inputs,
41
- out_path,
42
- ignore_line_break=False,
43
- encoding: str = "utf-8",
44
- img=None,
45
- export_figure=False,
46
- figure_dir="figures",
47
- ):
39
+ def convert_json(inputs, out_path, ignore_line_break, img, export_figure, figure_dir):
48
40
  from yomitoku.document_analyzer import DocumentAnalyzerSchema
49
41
 
50
42
  if isinstance(inputs, DocumentAnalyzerSchema):
@@ -55,18 +47,45 @@ def export_json(
55
47
  for paragraph in inputs.paragraphs:
56
48
  paragraph_to_json(paragraph, ignore_line_break)
57
49
 
58
- if export_figure:
59
- save_figure(
60
- inputs.figures,
61
- img,
62
- out_path,
63
- figure_dir=figure_dir,
64
- )
50
+ if isinstance(inputs, DocumentAnalyzerSchema) and export_figure:
51
+ save_figure(
52
+ inputs.figures,
53
+ img,
54
+ out_path,
55
+ figure_dir=figure_dir,
56
+ )
57
+
58
+ return inputs
59
+
60
+
61
+ def export_json(
62
+ inputs,
63
+ out_path,
64
+ ignore_line_break=False,
65
+ encoding: str = "utf-8",
66
+ img=None,
67
+ export_figure=False,
68
+ figure_dir="figures",
69
+ ):
70
+ inputs = convert_json(
71
+ inputs,
72
+ out_path,
73
+ ignore_line_break,
74
+ img,
75
+ export_figure,
76
+ figure_dir,
77
+ )
78
+
79
+ save_json(
80
+ inputs.model_dump(),
81
+ out_path,
82
+ encoding,
83
+ )
65
84
 
66
- return inputs.model_dump()
85
+ return inputs
67
86
 
68
87
 
69
- def save_json(out_path, encoding, data):
88
+ def save_json(data, out_path, encoding):
70
89
  with open(out_path, "w", encoding=encoding, errors="ignore") as f:
71
90
  json.dump(
72
91
  data,
@@ -111,16 +111,15 @@ def figure_to_md(
111
111
  return elements
112
112
 
113
113
 
114
- def export_markdown(
114
+ def convert_markdown(
115
115
  inputs,
116
- out_path: str,
116
+ out_path,
117
+ ignore_line_break=False,
117
118
  img=None,
118
- ignore_line_break: bool = False,
119
119
  export_figure_letter=False,
120
120
  export_figure=True,
121
121
  figure_width=200,
122
122
  figure_dir="figures",
123
- encoding: str = "utf-8",
124
123
  ):
125
124
  elements = []
126
125
  for table in inputs.tables:
@@ -144,10 +143,39 @@ def export_markdown(
144
143
 
145
144
  elements = sorted(elements, key=lambda x: x["order"])
146
145
  markdown = "\n".join([element["md"] for element in elements])
146
+ return markdown, elements
147
+
147
148
 
149
+ def export_markdown(
150
+ inputs,
151
+ out_path: str,
152
+ ignore_line_break: bool = False,
153
+ img=None,
154
+ export_figure_letter=False,
155
+ export_figure=True,
156
+ figure_width=200,
157
+ figure_dir="figures",
158
+ encoding: str = "utf-8",
159
+ ):
160
+ markdown, elements = convert_markdown(
161
+ inputs,
162
+ out_path,
163
+ ignore_line_break,
164
+ img,
165
+ export_figure_letter,
166
+ export_figure,
167
+ figure_width,
168
+ figure_dir,
169
+ )
170
+
171
+ save_markdown(markdown, out_path, encoding)
148
172
  return markdown
149
173
 
150
174
 
151
- def save_markdown(out_path, encoding, markdown):
175
+ def save_markdown(
176
+ markdown,
177
+ out_path,
178
+ encoding,
179
+ ):
152
180
  with open(out_path, "w", encoding=encoding, errors="ignore") as f:
153
181
  f.write(markdown)
Binary file
@@ -36,14 +36,16 @@ def test_load_image():
36
36
  "tests/data/test.bmp",
37
37
  "tests/data/test_gray.jpg",
38
38
  "tests/data/rgba.png",
39
+ "tests/data/sampldoc.tif",
39
40
  ]
40
41
 
41
42
  for target in targets:
42
43
  image = load_image(target)
43
- assert image.shape[2] == 3
44
- assert image.shape[0] > 32
45
- assert image.shape[1] > 32
46
- assert image.dtype == "uint8"
44
+ assert len(image) >= 1
45
+ assert image[0].shape[2] == 3
46
+ assert image[0].shape[0] > 32
47
+ assert image[0].shape[1] > 32
48
+ assert image[0].dtype == "uint8"
47
49
 
48
50
 
49
51
  def test_load_pdf():