yomitoku 0.9.2__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yomitoku/cli/main.py CHANGED
@@ -49,10 +49,15 @@ def merge_all_pages(results):
49
49
  else:
50
50
  out += "\n" + data
51
51
 
52
+ elif format == "pdf":
53
+ if out is None:
54
+ out = [data]
55
+ else:
56
+ out.append(data)
52
57
  return out
53
58
 
54
59
 
55
- def save_merged_file(out_path, args, out):
60
+ def save_merged_file(out_path, args, out, imgs):
56
61
  if args.format == "json":
57
62
  save_json(out, out_path, args.encoding)
58
63
  elif args.format == "csv":
@@ -61,6 +66,13 @@ def save_merged_file(out_path, args, out):
61
66
  save_html(out, out_path, args.encoding)
62
67
  elif args.format == "md":
63
68
  save_markdown(out, out_path, args.encoding)
69
+ elif args.format == "pdf":
70
+ create_searchable_pdf(
71
+ imgs,
72
+ out,
73
+ output_path=out_path,
74
+ font_path=args.font_path,
75
+ )
64
76
 
65
77
 
66
78
  def validate_encoding(encoding):
@@ -82,12 +94,10 @@ def process_single_file(args, analyzer, path, format):
82
94
  imgs = load_image(path)
83
95
 
84
96
  format_results = []
85
- results = []
86
97
  for page, img in enumerate(imgs):
87
98
  result, ocr, layout = analyzer(img)
88
99
  dirname = path.parent.name
89
100
  filename = path.stem
90
- results.append(result)
91
101
 
92
102
  # cv2.imwrite(
93
103
  # os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
@@ -148,6 +158,7 @@ def process_single_file(args, analyzer, path, format):
148
158
  args.ignore_line_break,
149
159
  img,
150
160
  args.figure,
161
+ args.figure_letter,
151
162
  args.figure_dir,
152
163
  )
153
164
  else:
@@ -157,6 +168,7 @@ def process_single_file(args, analyzer, path, format):
157
168
  encoding=args.encoding,
158
169
  img=img,
159
170
  export_figure=args.figure,
171
+ export_figure_letter=args.figure_letter,
160
172
  figure_dir=args.figure_dir,
161
173
  )
162
174
 
@@ -228,6 +240,21 @@ def process_single_file(args, analyzer, path, format):
228
240
  "data": md,
229
241
  }
230
242
  )
243
+ elif format == "pdf":
244
+ if not args.combine:
245
+ create_searchable_pdf(
246
+ [img],
247
+ [result],
248
+ output_path=out_path,
249
+ font_path=args.font_path,
250
+ )
251
+
252
+ format_results.append(
253
+ {
254
+ "format": format,
255
+ "data": result,
256
+ }
257
+ )
231
258
 
232
259
  out = merge_all_pages(format_results)
233
260
  if args.combine:
@@ -236,16 +263,8 @@ def process_single_file(args, analyzer, path, format):
236
263
  out_path,
237
264
  args,
238
265
  out,
239
- )
240
-
241
- if args.searchable_pdf:
242
- pdf_path = os.path.join(args.outdir, f"{filename}.pdf")
243
- create_searchable_pdf(
244
266
  imgs,
245
- results,
246
- output_path=pdf_path,
247
267
  )
248
- logger.info(f"Output SearchablePDF: {pdf_path}")
249
268
 
250
269
 
251
270
  def main():
@@ -362,11 +381,11 @@ def main():
362
381
  choices=["auto", "left2right", "top2bottom", "right2left"],
363
382
  )
364
383
  parser.add_argument(
365
- "--searchable_pdf",
366
- action="store_true",
367
- help="if set, create searchable PDF",
384
+ "--font_path",
385
+ default=None,
386
+ type=str,
387
+ help="Path to the font file(.ttf) for PDF output",
368
388
  )
369
-
370
389
  args = parser.parse_args()
371
390
 
372
391
  path = Path(args.arg1)
@@ -379,6 +398,13 @@ def main():
379
398
  f"Invalid output format: {args.format}. Supported formats are {SUPPORT_OUTPUT_FORMAT}"
380
399
  )
381
400
 
401
+ if (
402
+ args.font_path is not None
403
+ and not os.path.exists(args.font_path)
404
+ and format == "pdf"
405
+ ):
406
+ raise FileNotFoundError(f"Font file not found: {args.font_path}")
407
+
382
408
  validate_encoding(args.encoding)
383
409
 
384
410
  if format == "markdown":
yomitoku/constants.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import os
2
2
 
3
3
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
4
- SUPPORT_OUTPUT_FORMAT = ["json", "csv", "html", "markdown", "md"]
4
+ SUPPORT_OUTPUT_FORMAT = ["json", "csv", "html", "markdown", "md", "pdf"]
5
5
  SUPPORT_INPUT_FORMAT = ["jpg", "jpeg", "png", "bmp", "tiff", "tif", "pdf"]
6
6
  MIN_IMAGE_SIZE = 32
7
7
  WARNING_IMAGE_SIZE = 720
@@ -63,6 +63,7 @@ def convert_csv(
63
63
  ignore_line_break,
64
64
  img=None,
65
65
  export_figure: bool = True,
66
+ export_figure_letter: bool = False,
66
67
  figure_dir="figures",
67
68
  ):
68
69
  elements = []
@@ -89,6 +90,20 @@ def convert_csv(
89
90
  }
90
91
  )
91
92
 
93
+ if export_figure_letter:
94
+ for figure in inputs.figures:
95
+ paragraphs = sorted(figure.paragraphs, key=lambda x: x.order)
96
+ for paragraph in paragraphs:
97
+ contents = paragraph_to_csv(paragraph, ignore_line_break)
98
+ elements.append(
99
+ {
100
+ "type": "paragraph",
101
+ "box": paragraph.box,
102
+ "element": contents,
103
+ "order": figure.order,
104
+ }
105
+ )
106
+
92
107
  elements = sorted(elements, key=lambda x: x["order"])
93
108
 
94
109
  if export_figure:
@@ -109,6 +124,7 @@ def export_csv(
109
124
  encoding: str = "utf-8",
110
125
  img=None,
111
126
  export_figure: bool = True,
127
+ export_figure_letter: bool = False,
112
128
  figure_dir="figures",
113
129
  ):
114
130
  elements = convert_csv(
@@ -117,6 +133,7 @@ def export_csv(
117
133
  ignore_line_break,
118
134
  img,
119
135
  export_figure,
136
+ export_figure_letter,
120
137
  figure_dir,
121
138
  )
122
139
 
@@ -14,7 +14,6 @@ import jaconv
14
14
  from ..constants import ROOT_DIR
15
15
 
16
16
  FONT_PATH = ROOT_DIR + "/resource/MPLUS1p-Medium.ttf"
17
- pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", FONT_PATH))
18
17
 
19
18
 
20
19
  def _poly2rect(points):
@@ -62,14 +61,17 @@ def to_full_width(text):
62
61
  return jaconv_text
63
62
 
64
63
 
65
- def create_searchable_pdf(images, ocr_results, output_path):
64
+ def create_searchable_pdf(images, ocr_results, output_path, font_path=None):
65
+ if font_path is None:
66
+ font_path = FONT_PATH
67
+
68
+ pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", font_path))
69
+
66
70
  packet = BytesIO()
67
71
  c = canvas.Canvas(packet)
68
72
 
69
73
  for i, (image, ocr_result) in enumerate(zip(images, ocr_results)):
70
74
  image = Image.fromarray(image[:, :, ::-1]) # Convert BGR to RGB
71
- pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", FONT_PATH))
72
-
73
75
  image_path = f"tmp_{i}.png"
74
76
  image.save(image_path)
75
77
  w, h = image.size
@@ -97,7 +99,6 @@ def create_searchable_pdf(images, ocr_results, output_path):
97
99
 
98
100
  c.setFont("MPLUS1p-Medium", font_size)
99
101
  c.setFillColorRGB(1, 1, 1, alpha=0) # 透明
100
- # c.setFillColorRGB(0, 0, 0)
101
102
  if direction == "vertical":
102
103
  base_y = h - y2 + (bbox_height - font_size)
103
104
  for j, ch in enumerate(text):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: yomitoku
3
- Version: 0.9.2
3
+ Version: 0.9.4
4
4
  Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
5
5
  Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
6
6
  License: CC BY-NC-SA 4.0
@@ -87,7 +87,7 @@ yomitoku ${path_data} -f md -o results -v --figure --lite
87
87
  ```
88
88
 
89
89
  - `${path_data}` 解析対象の画像が含まれたディレクトリか画像ファイルのパスを直接指定してください。ディレクトリを対象とした場合はディレクトリのサブディレクトリ内の画像も含めて処理を実行します。
90
- - `-f`, `--format` 出力形式のファイルフォーマットを指定します。(json, csv, html, md をサポート)
90
+ - `-f`, `--format` 出力形式のファイルフォーマットを指定します。(json, csv, html, md, pdf(searchable-pdf) をサポート)
91
91
  - `-o`, `--outdir` 出力先のディレクトリ名を指定します。存在しない場合は新規で作成されます。
92
92
  - `-v`, `--vis` を指定すると解析結果を可視化した画像を出力します。
93
93
  - `-l`, `--lite` を指定すると軽量モデルで推論を実行します。通常より高速に推論できますが、若干、精度が低下する可能性があります。
@@ -98,7 +98,6 @@ yomitoku ${path_data} -f md -o results -v --figure --lite
98
98
  - `--encoding` エクスポートする出力ファイルの文字エンコーディングを指定します。サポートされていない文字コードが含まれる場合は、その文字を無視します。(utf-8, utf-8-sig, shift-jis, euc-jp, cp932)
99
99
  - `--combine` PDFを入力に与えたときに、複数ページが含まれる場合に、それらの予測結果を一つのファイルに統合してエクスポートします。
100
100
  - `--ignore_meta` 文章のheader, footerなどの文字情報を出力ファイルに含めません。
101
- - `--searchable_pdf` 読み取った文字情報をPDFに埋め込み全文検索可能なPDFを出力します。
102
101
 
103
102
  その他のオプションに関しては、ヘルプを参照
104
103
 
@@ -1,6 +1,6 @@
1
1
  yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
2
2
  yomitoku/base.py,sha256=9U3sfe69O6vuO430JzzKQQNkgPsLM9WdLfOUUhp3Ljs,3878
3
- yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
3
+ yomitoku/constants.py,sha256=2jya14UflDkMdYWMKc-ZllkWbJW2qh59Cnt2brrgNb4,693
4
4
  yomitoku/document_analyzer.py,sha256=xliAelQdfsK64FtVuFvstDBr9uf2TwhqW31g2g91_CY,16888
5
5
  yomitoku/layout_analyzer.py,sha256=VhNf1ZQFoozj6WUGk5ll1p2p1jk5X3j-JPcDbTAoSl4,1856
6
6
  yomitoku/layout_parser.py,sha256=0MgbCsD90srQdsxkGEL0TgKm4rkmGzsQYx0sjKQ03yc,7718
@@ -10,7 +10,7 @@ yomitoku/table_structure_recognizer.py,sha256=tHjex6deT_FjRK5ePz9bUXA_QIhgv_vYtK
10
10
  yomitoku/text_detector.py,sha256=6IwEJJKp_F8YH0Oki0QV-Mqi--P2LGbNKo-_kxBB_eo,4383
11
11
  yomitoku/text_recognizer.py,sha256=eaxozNu-Ms6iv8efbKZzn8pJNW1Wo4f86bGhzSMtv3s,5992
12
12
  yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- yomitoku/cli/main.py,sha256=7AaaFzMf33ER__XPDBNkrJkKwclne7QyVFWeBvpUYBY,12849
13
+ yomitoku/cli/main.py,sha256=v1UYsnQdnylhLvDURuxLODU3IU-ssVGqOJT9r-TCVns,13623
14
14
  yomitoku/cli/mcp_server.py,sha256=WnWzxd13HaemC3b-5i9B9NVBGc3WGfum2nYhoBolEnk,5641
15
15
  yomitoku/configs/__init__.py,sha256=x5-ccjGiP6xxRtDPT7f1Enl7SsE0hSk0G8f7eF9V85I,886
16
16
  yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
@@ -25,7 +25,7 @@ yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
25
25
  yomitoku/data/dataset.py,sha256=lpBcpkMuQzRIyLJ4_mqtuhR9s2ZmzgBgc-XYuE_b2Sc,1326
26
26
  yomitoku/data/functions.py,sha256=RExCUxI3-gccIMw-H0ribX2jeGKkrJWhS4fNn_12c3Y,7878
27
27
  yomitoku/export/__init__.py,sha256=gmlikMHRXfzfJ_8q4fyDlnpGms-x1oggQOwJEWHMgBU,508
28
- yomitoku/export/export_csv.py,sha256=VY8mntUCPDbDco_dyvq5O0_Q4wga9_GTyjHCS-y4UiQ,3399
28
+ yomitoku/export/export_csv.py,sha256=4U4KQ2RcBQmyUZ9O7a4uLoB6RUw80HPL1EEJUDwQlcI,4044
29
29
  yomitoku/export/export_html.py,sha256=LQDyZgbzmI0qJ0-FEK-54r9816H3L9hD10ChMcw0KyA,5620
30
30
  yomitoku/export/export_json.py,sha256=iNG37tdIuYG2x3NiiZemKaB6-X45WrhVPZhbX7RUzRI,2410
31
31
  yomitoku/export/export_markdown.py,sha256=KrdxDmKzVP_LbTKuDNGGsT31QOPKVsNNlb6wtLEW-1Q,4705
@@ -51,9 +51,9 @@ yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
52
52
  yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
53
53
  yomitoku/utils/misc.py,sha256=r92x45kQR8lC5jO1MZaHBDtcCWBkQXg_WS9H4RXJzSY,4127
54
- yomitoku/utils/searchable_pdf.py,sha256=40JbcxWrHzYTtzvI9MPYHMrWqLWKiLWo4mWDNRFXwHY,3530
54
+ yomitoku/utils/searchable_pdf.py,sha256=taZ-XtXN4RItePMDv4q0fRVlryusdkexA3TCXzwlXRo,3497
55
55
  yomitoku/utils/visualizer.py,sha256=DjDwHiAu1iFRKh96H3Egq4vuI2s_-9dLCDeykhKi8jo,5251
56
- yomitoku-0.9.2.dist-info/METADATA,sha256=vDEaaXAimCBfVwMeWmfyJBqzb7sXtZk4-ia3PXrtk7c,8966
57
- yomitoku-0.9.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
58
- yomitoku-0.9.2.dist-info/entry_points.txt,sha256=n3c8bQSj5Be5GHAOv_NZ8cldJFmWeigQxSmteFTmu_k,96
59
- yomitoku-0.9.2.dist-info/RECORD,,
56
+ yomitoku-0.9.4.dist-info/METADATA,sha256=oDIp-lxMIQjIfVtrzQXBcY2PJFHlRwktVGFXndQRJZo,8872
57
+ yomitoku-0.9.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
58
+ yomitoku-0.9.4.dist-info/entry_points.txt,sha256=n3c8bQSj5Be5GHAOv_NZ8cldJFmWeigQxSmteFTmu_k,96
59
+ yomitoku-0.9.4.dist-info/RECORD,,