yomitoku 0.7.3__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yomitoku/cli/main.py CHANGED
@@ -12,6 +12,7 @@ from ..document_analyzer import DocumentAnalyzer
12
12
  from ..utils.logger import set_logger
13
13
 
14
14
  from ..export import save_csv, save_html, save_json, save_markdown
15
+ from ..export import convert_json, convert_csv, convert_html, convert_markdown
15
16
 
16
17
  logger = set_logger(__name__, "INFO")
17
18
 
@@ -51,13 +52,13 @@ def merge_all_pages(results):
51
52
 
52
53
  def save_merged_file(out_path, args, out):
53
54
  if args.format == "json":
54
- save_json(out_path, args.encoding, out)
55
+ save_json(out, out_path, args.encoding)
55
56
  elif args.format == "csv":
56
- save_csv(out_path, args.encoding, out)
57
+ save_csv(out, out_path, args.encoding)
57
58
  elif args.format == "html":
58
- save_html(out_path, args.encoding, out)
59
+ save_html(out, out_path, args.encoding)
59
60
  elif args.format == "md":
60
- save_markdown(out_path, args.encoding, out)
61
+ save_markdown(out, out_path, args.encoding)
61
62
 
62
63
 
63
64
  def validate_encoding(encoding):
@@ -76,7 +77,7 @@ def process_single_file(args, analyzer, path, format):
76
77
  if path.suffix[1:].lower() in ["pdf"]:
77
78
  imgs = load_pdf(path)
78
79
  else:
79
- imgs = [load_image(path)]
80
+ imgs = load_image(path)
80
81
 
81
82
  results = []
82
83
  for page, img in enumerate(imgs):
@@ -84,6 +85,10 @@ def process_single_file(args, analyzer, path, format):
84
85
  dirname = path.parent.name
85
86
  filename = path.stem
86
87
 
88
+ # cv2.imwrite(
89
+ # os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
90
+ # )
91
+
87
92
  if ocr is not None:
88
93
  out_path = os.path.join(
89
94
  args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
@@ -103,34 +108,51 @@ def process_single_file(args, analyzer, path, format):
103
108
  out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")
104
109
 
105
110
  if format == "json":
106
- json = result.to_json(
107
- out_path,
108
- ignore_line_break=args.ignore_line_break,
109
- encoding=args.encoding,
110
- img=img,
111
- export_figure=args.figure,
112
- figure_dir=args.figure_dir,
113
- )
111
+ if args.combine:
112
+ json = convert_json(
113
+ result,
114
+ out_path,
115
+ args.ignore_line_break,
116
+ img,
117
+ args.figure,
118
+ args.figure_dir,
119
+ )
120
+ else:
121
+ json = result.to_json(
122
+ out_path,
123
+ ignore_line_break=args.ignore_line_break,
124
+ encoding=args.encoding,
125
+ img=img,
126
+ export_figure=args.figure,
127
+ figure_dir=args.figure_dir,
128
+ )
114
129
 
115
130
  results.append(
116
131
  {
117
132
  "format": format,
118
- "data": json,
133
+ "data": json.model_dump(),
119
134
  }
120
135
  )
121
136
 
122
- if not args.combine:
123
- save_json(out_path, args.encoding, json)
124
-
125
137
  elif format == "csv":
126
- csv = result.to_csv(
127
- out_path,
128
- ignore_line_break=args.ignore_line_break,
129
- encoding=args.encoding,
130
- img=img,
131
- export_figure=args.figure,
132
- figure_dir=args.figure_dir,
133
- )
138
+ if args.combine:
139
+ csv = convert_csv(
140
+ result,
141
+ out_path,
142
+ args.ignore_line_break,
143
+ img,
144
+ args.figure,
145
+ args.figure_dir,
146
+ )
147
+ else:
148
+ csv = result.to_csv(
149
+ out_path,
150
+ ignore_line_break=args.ignore_line_break,
151
+ encoding=args.encoding,
152
+ img=img,
153
+ export_figure=args.figure,
154
+ figure_dir=args.figure_dir,
155
+ )
134
156
 
135
157
  results.append(
136
158
  {
@@ -139,20 +161,29 @@ def process_single_file(args, analyzer, path, format):
139
161
  }
140
162
  )
141
163
 
142
- if not args.combine:
143
- save_csv(out_path, args.encoding, csv)
144
-
145
164
  elif format == "html":
146
- html = result.to_html(
147
- out_path,
148
- ignore_line_break=args.ignore_line_break,
149
- img=img,
150
- export_figure=args.figure,
151
- export_figure_letter=args.figure_letter,
152
- figure_width=args.figure_width,
153
- figure_dir=args.figure_dir,
154
- encoding=args.encoding,
155
- )
165
+ if args.combine:
166
+ html, _ = convert_html(
167
+ result,
168
+ out_path,
169
+ ignore_line_break=args.ignore_line_break,
170
+ img=img,
171
+ export_figure=args.figure,
172
+ export_figure_letter=args.figure_letter,
173
+ figure_width=args.figure_width,
174
+ figure_dir=args.figure_dir,
175
+ )
176
+ else:
177
+ html = result.to_html(
178
+ out_path,
179
+ ignore_line_break=args.ignore_line_break,
180
+ img=img,
181
+ export_figure=args.figure,
182
+ export_figure_letter=args.figure_letter,
183
+ figure_width=args.figure_width,
184
+ figure_dir=args.figure_dir,
185
+ encoding=args.encoding,
186
+ )
156
187
 
157
188
  results.append(
158
189
  {
@@ -161,20 +192,29 @@ def process_single_file(args, analyzer, path, format):
161
192
  }
162
193
  )
163
194
 
164
- if not args.combine:
165
- save_html(out_path, args.encoding, html)
166
-
167
195
  elif format == "md":
168
- md = result.to_markdown(
169
- out_path,
170
- ignore_line_break=args.ignore_line_break,
171
- img=img,
172
- export_figure=args.figure,
173
- export_figure_letter=args.figure_letter,
174
- figure_width=args.figure_width,
175
- figure_dir=args.figure_dir,
176
- encoding=args.encoding,
177
- )
196
+ if args.combine:
197
+ md, _ = convert_markdown(
198
+ result,
199
+ out_path,
200
+ ignore_line_break=args.ignore_line_break,
201
+ img=img,
202
+ export_figure=args.figure,
203
+ export_figure_letter=args.figure_letter,
204
+ figure_width=args.figure_width,
205
+ figure_dir=args.figure_dir,
206
+ )
207
+ else:
208
+ md = result.to_markdown(
209
+ out_path,
210
+ ignore_line_break=args.ignore_line_break,
211
+ img=img,
212
+ export_figure=args.figure,
213
+ export_figure_letter=args.figure_letter,
214
+ figure_width=args.figure_width,
215
+ figure_dir=args.figure_dir,
216
+ encoding=args.encoding,
217
+ )
178
218
 
179
219
  results.append(
180
220
  {
@@ -183,9 +223,6 @@ def process_single_file(args, analyzer, path, format):
183
223
  }
184
224
  )
185
225
 
186
- if not args.combine:
187
- save_markdown(out_path, args.encoding, md)
188
-
189
226
  out = merge_all_pages(results)
190
227
  if args.combine:
191
228
  out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
@@ -1,10 +1,14 @@
1
1
  from .cfg_layout_parser_rtdtrv2 import LayoutParserRTDETRv2Config
2
+ from .cfg_layout_parser_rtdtrv2_v2 import LayoutParserRTDETRv2V2Config
2
3
  from .cfg_table_structure_recognizer_rtdtrv2 import (
3
4
  TableStructureRecognizerRTDETRv2Config,
4
5
  )
5
6
  from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
7
+ from .cfg_text_detector_dbnet_v2 import TextDetectorDBNetV2Config
6
8
  from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
7
9
  from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
10
+ from .cfg_text_recognizer_parseq_v2 import TextRecognizerPARSeqV2Config
11
+
8
12
 
9
13
  __all__ = [
10
14
  "TextDetectorDBNetConfig",
@@ -12,4 +16,7 @@ __all__ = [
12
16
  "LayoutParserRTDETRv2Config",
13
17
  "TableStructureRecognizerRTDETRv2Config",
14
18
  "TextRecognizerPARSeqSmallConfig",
19
+ "LayoutParserRTDETRv2V2Config",
20
+ "TextDetectorDBNetV2Config",
21
+ "TextRecognizerPARSeqV2Config",
15
22
  ]
@@ -0,0 +1,89 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+
5
+ @dataclass
6
+ class Data:
7
+ img_size: List[int] = field(default_factory=lambda: [640, 640])
8
+
9
+
10
+ @dataclass
11
+ class BackBone:
12
+ depth: int = 50
13
+ variant: str = "d"
14
+ freeze_at: int = 0
15
+ return_idx: List[int] = field(default_factory=lambda: [1, 2, 3])
16
+ num_stages: int = 4
17
+ freeze_norm: bool = True
18
+
19
+
20
+ @dataclass
21
+ class Encoder:
22
+ in_channels: List[int] = field(default_factory=lambda: [512, 1024, 2048])
23
+ feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])
24
+
25
+ # intra
26
+ hidden_dim: int = 256
27
+ use_encoder_idx: List[int] = field(default_factory=lambda: [2])
28
+ num_encoder_layers: int = 1
29
+ nhead: int = 8
30
+ dim_feedforward: int = 1024
31
+ dropout: float = 0.0
32
+ enc_act: str = "gelu"
33
+
34
+ # cross
35
+ expansion: float = 1.0
36
+ depth_mult: int = 1
37
+ act: str = "silu"
38
+
39
+
40
+ @dataclass
41
+ class Decoder:
42
+ num_classes: int = 6
43
+ feat_channels: List[int] = field(default_factory=lambda: [256, 256, 256])
44
+ feat_strides: List[int] = field(default_factory=lambda: [8, 16, 32])
45
+ hidden_dim: int = 256
46
+ num_levels: int = 3
47
+
48
+ num_layers: int = 6
49
+ num_queries: int = 300
50
+
51
+ num_denoising: int = 100
52
+ label_noise_ratio: float = 0.5
53
+ box_noise_scale: float = 1.0
54
+ eval_spatial_size: List[int] = field(default_factory=lambda: [640, 640])
55
+
56
+ eval_idx: int = -1
57
+
58
+ num_points: List[int] = field(default_factory=lambda: [4, 4, 4])
59
+ cross_attn_method: str = "default"
60
+ query_select_method: str = "default"
61
+
62
+
63
+ @dataclass
64
+ class LayoutParserRTDETRv2V2Config:
65
+ hf_hub_repo: str = "KotaroKinoshita/yomitoku-layout-parser-rtdtrv2-v2"
66
+ thresh_score: float = 0.5
67
+ data: Data = field(default_factory=Data)
68
+ PResNet: BackBone = field(default_factory=BackBone)
69
+ HybridEncoder: Encoder = field(default_factory=Encoder)
70
+ RTDETRTransformerv2: Decoder = field(default_factory=Decoder)
71
+
72
+ category: List[str] = field(
73
+ default_factory=lambda: [
74
+ "tables",
75
+ "figures",
76
+ "paragraphs",
77
+ "section_headings",
78
+ "page_header",
79
+ "page_footer",
80
+ ]
81
+ )
82
+
83
+ role: List[str] = field(
84
+ default_factory=lambda: [
85
+ "section_headings",
86
+ "page_header",
87
+ "page_footer",
88
+ ]
89
+ )
@@ -0,0 +1,49 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+
5
+ @dataclass
6
+ class BackBone:
7
+ name: str = "resnet50"
8
+ dilation: bool = True
9
+
10
+
11
+ @dataclass
12
+ class Decoder:
13
+ in_channels: list[int] = field(default_factory=lambda: [256, 512, 1024, 2048])
14
+ hidden_dim: int = 256
15
+ adaptive: bool = True
16
+ serial: bool = True
17
+ smooth: bool = False
18
+ k: int = 50
19
+
20
+
21
+ @dataclass
22
+ class Data:
23
+ shortest_size: int = 1280
24
+ limit_size: int = 1600
25
+
26
+
27
+ @dataclass
28
+ class PostProcess:
29
+ min_size: int = 2
30
+ thresh: float = 0.4
31
+ box_thresh: float = 0.5
32
+ max_candidates: int = 1500
33
+ unclip_ratio: float = 6.0
34
+
35
+
36
+ @dataclass
37
+ class Visualize:
38
+ color: List[int] = field(default_factory=lambda: [0, 255, 0])
39
+ heatmap: bool = False
40
+
41
+
42
+ @dataclass
43
+ class TextDetectorDBNetV2Config:
44
+ hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-detector-dbnet-v2"
45
+ backbone: BackBone = field(default_factory=BackBone)
46
+ decoder: Decoder = field(default_factory=Decoder)
47
+ data: Data = field(default_factory=Data)
48
+ post_process: PostProcess = field(default_factory=PostProcess)
49
+ visualize: Visualize = field(default_factory=Visualize)
@@ -0,0 +1,51 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+ from ..constants import ROOT_DIR
5
+
6
+
7
+ @dataclass
8
+ class Data:
9
+ num_workers: int = 4
10
+ batch_size: int = 128
11
+ img_size: List[int] = field(default_factory=lambda: [32, 800])
12
+
13
+
14
+ @dataclass
15
+ class Encoder:
16
+ patch_size: List[int] = field(default_factory=lambda: [8, 8])
17
+ num_heads: int = 8
18
+ embed_dim: int = 512
19
+ mlp_ratio: int = 4
20
+ depth: int = 12
21
+
22
+
23
+ @dataclass
24
+ class Decoder:
25
+ embed_dim: int = 512
26
+ num_heads: int = 8
27
+ mlp_ratio: int = 4
28
+ depth: int = 1
29
+
30
+
31
+ @dataclass
32
+ class Visualize:
33
+ font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
34
+ color: List[int] = field(default_factory=lambda: [0, 0, 255]) # RGB
35
+ font_size: int = 18
36
+
37
+
38
+ @dataclass
39
+ class TextRecognizerPARSeqV2Config:
40
+ hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-middle-v2"
41
+ charset: str = str(ROOT_DIR + "/resource/charset.txt")
42
+ num_tokens: int = 7312
43
+ max_label_length: int = 100
44
+ decode_ar: int = 1
45
+ refine_iters: int = 1
46
+
47
+ data: Data = field(default_factory=Data)
48
+ encoder: Encoder = field(default_factory=Encoder)
49
+ decoder: Decoder = field(default_factory=Decoder)
50
+
51
+ visualize: Visualize = field(default_factory=Visualize)
@@ -1,6 +1,7 @@
1
1
  from pathlib import Path
2
2
 
3
3
  import cv2
4
+ from PIL import Image
4
5
  import numpy as np
5
6
  import torch
6
7
  import pypdfium2
@@ -15,6 +16,20 @@ from ..utils.logger import set_logger
15
16
  logger = set_logger(__name__)
16
17
 
17
18
 
19
+ def validate_image(img: np.ndarray):
20
+ h, w = img.shape[:2]
21
+ if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
22
+ raise ValueError("Image size is too small.")
23
+
24
+ if min(h, w) < WARNING_IMAGE_SIZE:
25
+ logger.warning(
26
+ """
27
+ The image size is small, which may result in reduced OCR accuracy.
28
+ The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
29
+ """
30
+ )
31
+
32
+
18
33
  def load_image(image_path: str) -> np.ndarray:
19
34
  """
20
35
  Open an image file.
@@ -40,24 +55,27 @@ def load_image(image_path: str) -> np.ndarray:
40
55
  "PDF file is not supported by load_image(). Use load_pdf() instead."
41
56
  )
42
57
 
43
- img = cv2.imread(image_path, cv2.IMREAD_COLOR)
44
-
45
- if img is None:
58
+ try:
59
+ img = Image.open(image_path)
60
+ except Exception:
46
61
  raise ValueError("Invalid image data.")
47
62
 
48
- h, w = img.shape[:2]
49
- if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
50
- raise ValueError("Image size is too small.")
51
-
52
- if min(h, w) < WARNING_IMAGE_SIZE:
53
- logger.warning(
54
- """
55
- The image size is small, which may result in reduced OCR accuracy.
56
- The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
57
- """
58
- )
63
+ pages = []
64
+ if ext in ["tif", "tiff"]:
65
+ try:
66
+ while True:
67
+ img_arr = np.array(img.copy().convert("RGB"))
68
+ validate_image(img_arr)
69
+ pages.append(img_arr[:, :, ::-1])
70
+ img.seek(img.tell() + 1)
71
+ except EOFError:
72
+ pass
73
+ else:
74
+ img_arr = np.array(img.convert("RGB"))
75
+ validate_image(img_arr)
76
+ pages.append(img_arr[:, :, ::-1])
59
77
 
60
- return img
78
+ return pages
61
79
 
62
80
 
63
81
  def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
@@ -1,7 +1,7 @@
1
- from .export_csv import export_csv, save_csv
2
- from .export_html import export_html, save_html
3
- from .export_json import export_json, save_json
4
- from .export_markdown import export_markdown, save_markdown
1
+ from .export_csv import export_csv, save_csv, convert_csv
2
+ from .export_html import export_html, save_html, convert_html
3
+ from .export_json import export_json, save_json, convert_json
4
+ from .export_markdown import export_markdown, save_markdown, convert_markdown
5
5
 
6
6
  __all__ = [
7
7
  "export_html",
@@ -12,4 +12,8 @@ __all__ = [
12
12
  "save_markdown",
13
13
  "save_csv",
14
14
  "save_json",
15
+ "convert_html",
16
+ "convert_markdown",
17
+ "convert_csv",
18
+ "convert_json",
15
19
  ]
@@ -57,11 +57,10 @@ def save_figure(
57
57
  cv2.imwrite(figure_path, figure_img)
58
58
 
59
59
 
60
- def export_csv(
60
+ def convert_csv(
61
61
  inputs,
62
- out_path: str,
63
- ignore_line_break: bool = False,
64
- encoding: str = "utf-8",
62
+ out_path,
63
+ ignore_line_break,
65
64
  img=None,
66
65
  export_figure: bool = True,
67
66
  figure_dir="figures",
@@ -90,6 +89,8 @@ def export_csv(
90
89
  }
91
90
  )
92
91
 
92
+ elements = sorted(elements, key=lambda x: x["order"])
93
+
93
94
  if export_figure:
94
95
  save_figure(
95
96
  inputs.figures,
@@ -98,11 +99,36 @@ def export_csv(
98
99
  figure_dir=figure_dir,
99
100
  )
100
101
 
101
- elements = sorted(elements, key=lambda x: x["order"])
102
102
  return elements
103
103
 
104
104
 
105
- def save_csv(out_path, encoding, elements):
105
+ def export_csv(
106
+ inputs,
107
+ out_path: str,
108
+ ignore_line_break: bool = False,
109
+ encoding: str = "utf-8",
110
+ img=None,
111
+ export_figure: bool = True,
112
+ figure_dir="figures",
113
+ ):
114
+ elements = convert_csv(
115
+ inputs,
116
+ out_path,
117
+ ignore_line_break,
118
+ img,
119
+ export_figure,
120
+ figure_dir,
121
+ )
122
+
123
+ save_csv(elements, out_path, encoding)
124
+ return elements
125
+
126
+
127
+ def save_csv(
128
+ elements,
129
+ out_path,
130
+ encoding,
131
+ ):
106
132
  with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
107
133
  writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
108
134
  for element in elements:
@@ -146,16 +146,15 @@ def figure_to_html(
146
146
  return elements
147
147
 
148
148
 
149
- def export_html(
149
+ def convert_html(
150
150
  inputs,
151
- out_path: str,
152
- ignore_line_break: bool = False,
153
- export_figure: bool = True,
154
- export_figure_letter: bool = False,
151
+ out_path,
152
+ ignore_line_break,
153
+ export_figure,
154
+ export_figure_letter,
155
155
  img=None,
156
156
  figure_width=200,
157
157
  figure_dir="figures",
158
- encoding: str = "utf-8",
159
158
  ):
160
159
  html_string = ""
161
160
  elements = []
@@ -181,13 +180,43 @@ def export_html(
181
180
  elements = sorted(elements, key=lambda x: x["order"])
182
181
 
183
182
  html_string = "".join([element["html"] for element in elements])
184
- # html_string = add_html_tag(html_string)
185
-
186
183
  parsed_html = html.fromstring(html_string)
187
184
  formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")
185
+
186
+ return formatted_html, elements
187
+
188
+
189
+ def export_html(
190
+ inputs,
191
+ out_path: str,
192
+ ignore_line_break: bool = False,
193
+ export_figure: bool = True,
194
+ export_figure_letter: bool = False,
195
+ img=None,
196
+ figure_width=200,
197
+ figure_dir="figures",
198
+ encoding: str = "utf-8",
199
+ ):
200
+ formatted_html, elements = convert_html(
201
+ inputs,
202
+ out_path,
203
+ ignore_line_break,
204
+ export_figure,
205
+ export_figure_letter,
206
+ img,
207
+ figure_width,
208
+ figure_dir,
209
+ )
210
+
211
+ save_html(formatted_html, out_path, encoding)
212
+
188
213
  return formatted_html
189
214
 
190
215
 
191
- def save_html(out_path, encoding, html):
216
+ def save_html(
217
+ html,
218
+ out_path,
219
+ encoding,
220
+ ):
192
221
  with open(out_path, "w", encoding=encoding, errors="ignore") as f:
193
222
  f.write(html)
@@ -36,15 +36,7 @@ def save_figure(
36
36
  cv2.imwrite(figure_path, figure_img)
37
37
 
38
38
 
39
- def export_json(
40
- inputs,
41
- out_path,
42
- ignore_line_break=False,
43
- encoding: str = "utf-8",
44
- img=None,
45
- export_figure=False,
46
- figure_dir="figures",
47
- ):
39
+ def convert_json(inputs, out_path, ignore_line_break, img, export_figure, figure_dir):
48
40
  from yomitoku.document_analyzer import DocumentAnalyzerSchema
49
41
 
50
42
  if isinstance(inputs, DocumentAnalyzerSchema):
@@ -55,18 +47,45 @@ def export_json(
55
47
  for paragraph in inputs.paragraphs:
56
48
  paragraph_to_json(paragraph, ignore_line_break)
57
49
 
58
- if export_figure:
59
- save_figure(
60
- inputs.figures,
61
- img,
62
- out_path,
63
- figure_dir=figure_dir,
64
- )
50
+ if isinstance(inputs, DocumentAnalyzerSchema) and export_figure:
51
+ save_figure(
52
+ inputs.figures,
53
+ img,
54
+ out_path,
55
+ figure_dir=figure_dir,
56
+ )
57
+
58
+ return inputs
59
+
60
+
61
+ def export_json(
62
+ inputs,
63
+ out_path,
64
+ ignore_line_break=False,
65
+ encoding: str = "utf-8",
66
+ img=None,
67
+ export_figure=False,
68
+ figure_dir="figures",
69
+ ):
70
+ inputs = convert_json(
71
+ inputs,
72
+ out_path,
73
+ ignore_line_break,
74
+ img,
75
+ export_figure,
76
+ figure_dir,
77
+ )
78
+
79
+ save_json(
80
+ inputs.model_dump(),
81
+ out_path,
82
+ encoding,
83
+ )
65
84
 
66
- return inputs.model_dump()
85
+ return inputs
67
86
 
68
87
 
69
- def save_json(out_path, encoding, data):
88
+ def save_json(data, out_path, encoding):
70
89
  with open(out_path, "w", encoding=encoding, errors="ignore") as f:
71
90
  json.dump(
72
91
  data,
@@ -111,16 +111,15 @@ def figure_to_md(
111
111
  return elements
112
112
 
113
113
 
114
- def export_markdown(
114
+ def convert_markdown(
115
115
  inputs,
116
- out_path: str,
116
+ out_path,
117
+ ignore_line_break=False,
117
118
  img=None,
118
- ignore_line_break: bool = False,
119
119
  export_figure_letter=False,
120
120
  export_figure=True,
121
121
  figure_width=200,
122
122
  figure_dir="figures",
123
- encoding: str = "utf-8",
124
123
  ):
125
124
  elements = []
126
125
  for table in inputs.tables:
@@ -144,10 +143,39 @@ def export_markdown(
144
143
 
145
144
  elements = sorted(elements, key=lambda x: x["order"])
146
145
  markdown = "\n".join([element["md"] for element in elements])
146
+ return markdown, elements
147
+
147
148
 
149
+ def export_markdown(
150
+ inputs,
151
+ out_path: str,
152
+ ignore_line_break: bool = False,
153
+ img=None,
154
+ export_figure_letter=False,
155
+ export_figure=True,
156
+ figure_width=200,
157
+ figure_dir="figures",
158
+ encoding: str = "utf-8",
159
+ ):
160
+ markdown, elements = convert_markdown(
161
+ inputs,
162
+ out_path,
163
+ ignore_line_break,
164
+ img,
165
+ export_figure_letter,
166
+ export_figure,
167
+ figure_width,
168
+ figure_dir,
169
+ )
170
+
171
+ save_markdown(markdown, out_path, encoding)
148
172
  return markdown
149
173
 
150
174
 
151
- def save_markdown(out_path, encoding, markdown):
175
+ def save_markdown(
176
+ markdown,
177
+ out_path,
178
+ encoding,
179
+ ):
152
180
  with open(out_path, "w", encoding=encoding, errors="ignore") as f:
153
181
  f.write(markdown)
yomitoku/layout_parser.py CHANGED
@@ -12,7 +12,7 @@ from pydantic import conlist
12
12
  from .constants import ROOT_DIR
13
13
 
14
14
  from .base import BaseModelCatalog, BaseModule, BaseSchema
15
- from .configs import LayoutParserRTDETRv2Config
15
+ from .configs import LayoutParserRTDETRv2Config, LayoutParserRTDETRv2V2Config
16
16
  from .models import RTDETRv2
17
17
  from .postprocessor import RTDETRPostProcessor
18
18
  from .utils.misc import filter_by_flag, is_contained
@@ -35,6 +35,7 @@ class LayoutParserModelCatalog(BaseModelCatalog):
35
35
  def __init__(self):
36
36
  super().__init__()
37
37
  self.register("rtdetrv2", LayoutParserRTDETRv2Config, RTDETRv2)
38
+ self.register("rtdetrv2v2", LayoutParserRTDETRv2V2Config, RTDETRv2)
38
39
 
39
40
 
40
41
  def filter_contained_rectangles_within_category(category_elements):
@@ -91,7 +92,7 @@ class LayoutParser(BaseModule):
91
92
 
92
93
  def __init__(
93
94
  self,
94
- model_name="rtdetrv2",
95
+ model_name="rtdetrv2v2",
95
96
  path_cfg=None,
96
97
  device="cuda",
97
98
  visualize=False,
@@ -49,6 +49,13 @@ class RTDETRPostProcessor(nn.Module):
49
49
  def extra_repr(self) -> str:
50
50
  return f"use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}"
51
51
 
52
+ def clamp(self, boxes, h, w):
53
+ boxes[:, 0] = torch.clamp(boxes[:, 0], min=torch.Tensor([0]), max=None)
54
+ boxes[:, 1] = torch.clamp(boxes[:, 1], min=torch.Tensor([0]), max=None)
55
+ boxes[:, 2] = torch.clamp(boxes[:, 2], min=torch.Tensor([0]), max=w)
56
+ boxes[:, 3] = torch.clamp(boxes[:, 3], min=torch.Tensor([0]), max=h)
57
+ return boxes
58
+
52
59
  # def forward(self, outputs, orig_target_sizes):
53
60
  def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
54
61
  logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
@@ -57,6 +64,8 @@ class RTDETRPostProcessor(nn.Module):
57
64
  bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
58
65
  bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
59
66
 
67
+ w, h = orig_target_sizes.unbind(1)
68
+
60
69
  if self.use_focal_loss:
61
70
  scores = F.sigmoid(logits)
62
71
  scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
@@ -104,9 +113,10 @@ class RTDETRPostProcessor(nn.Module):
104
113
  sco = sco[sco > threshold]
105
114
 
106
115
  lab = lab.cpu().numpy()
107
- box = box.cpu().numpy()
108
116
  sco = sco.cpu().numpy()
109
117
 
118
+ box = self.clamp(box.cpu(), h.cpu(), w.cpu()).numpy()
119
+
110
120
  result = dict(labels=lab, boxes=box, scores=sco)
111
121
  results.append(result)
112
122
 
yomitoku/text_detector.py CHANGED
@@ -6,7 +6,10 @@ import os
6
6
  from pydantic import conlist
7
7
 
8
8
  from .base import BaseModelCatalog, BaseModule, BaseSchema
9
- from .configs import TextDetectorDBNetConfig
9
+ from .configs import (
10
+ TextDetectorDBNetConfig,
11
+ TextDetectorDBNetV2Config,
12
+ )
10
13
  from .data.functions import (
11
14
  array_to_tensor,
12
15
  resize_shortest_edge,
@@ -25,6 +28,7 @@ class TextDetectorModelCatalog(BaseModelCatalog):
25
28
  def __init__(self):
26
29
  super().__init__()
27
30
  self.register("dbnet", TextDetectorDBNetConfig, DBNet)
31
+ self.register("dbnetv2", TextDetectorDBNetV2Config, DBNet)
28
32
 
29
33
 
30
34
  class TextDetectorSchema(BaseSchema):
@@ -43,7 +47,7 @@ class TextDetector(BaseModule):
43
47
 
44
48
  def __init__(
45
49
  self,
46
- model_name="dbnet",
50
+ model_name="dbnetv2",
47
51
  path_cfg=None,
48
52
  device="cuda",
49
53
  visualize=False,
@@ -7,7 +7,11 @@ import unicodedata
7
7
  from pydantic import conlist
8
8
 
9
9
  from .base import BaseModelCatalog, BaseModule, BaseSchema
10
- from .configs import TextRecognizerPARSeqConfig, TextRecognizerPARSeqSmallConfig
10
+ from .configs import (
11
+ TextRecognizerPARSeqConfig,
12
+ TextRecognizerPARSeqSmallConfig,
13
+ TextRecognizerPARSeqV2Config,
14
+ )
11
15
  from .data.dataset import ParseqDataset
12
16
  from .models import PARSeq
13
17
  from .postprocessor import ParseqTokenizer as Tokenizer
@@ -23,6 +27,7 @@ class TextRecognizerModelCatalog(BaseModelCatalog):
23
27
  def __init__(self):
24
28
  super().__init__()
25
29
  self.register("parseq", TextRecognizerPARSeqConfig, PARSeq)
30
+ self.register("parseqv2", TextRecognizerPARSeqV2Config, PARSeq)
26
31
  self.register("parseq-small", TextRecognizerPARSeqSmallConfig, PARSeq)
27
32
 
28
33
 
@@ -44,7 +49,7 @@ class TextRecognizer(BaseModule):
44
49
 
45
50
  def __init__(
46
51
  self,
47
- model_name="parseq",
52
+ model_name="parseqv2",
48
53
  path_cfg=None,
49
54
  device="cuda",
50
55
  visualize=False,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: yomitoku
3
- Version: 0.7.3
3
+ Version: 0.8.0
4
4
  Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
5
5
  Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
6
6
  License: CC BY-NC-SA 4.0
@@ -37,7 +37,7 @@ Description-Content-Type: text/markdown
37
37
  YomiToku は日本語に特化した AI 文章画像解析エンジン(Document AI)です。画像内の文字の全文 OCR およびレイアウト解析機能を有しており、画像内の文字情報や図表を認識、抽出、変換します。
38
38
 
39
39
  - 🤖 日本語データセットで学習した 4 種類(文字位置の検知、文字列認識、レイアウト解析、表の構造認識)の AI モデルを搭載しています。4 種類のモデルはすべて独自に学習されたモデルで日本語文書に対して、高精度に推論可能です。
40
- - 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000 文字を超える日本語文字の認識をサーポート、縦書きなど日本語特有のレイアウト構造の文書画像の解析も可能です。(日本語以外にも英語の文書に対しても対応しています)。
40
+ - 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000 文字を超える日本語文字の認識をサーポート、手書き文字、縦書きなど日本語特有のレイアウト構造の文書画像の解析も可能です。(日本語以外にも英語の文書に対しても対応しています)。
41
41
  - 📈 レイアウト解析、表の構造解析, 読み順推定機能により、文書画像のレイアウトの意味的構造を壊さずに情報を抽出することが可能です。
42
42
  - 📄 多様な出力形式をサポートしています。html やマークダウン、json、csv のいずれかのフォーマットに変換可能です。また、文書内に含まれる図表、画像の抽出の出力も可能です。
43
43
  - ⚡ GPU 環境で高速に動作し、効率的に文書の文字起こし解析が可能です。また、VRAM も 8GB 以内で動作し、ハイエンドな GPU を用意する必要はありません。
@@ -103,7 +103,6 @@ yomitoku --help
103
103
  **NOTE**
104
104
 
105
105
  - GPU での実行を推奨します。CPU を用いての推論向けに最適化されておらず、処理時間が長くなります。
106
- - 活字のみ識別をサポートしております。手書き文字に関しては、読み取れる場合もありますが、公式にはサポートしておりません。
107
106
  - Yomitoku は文書 OCR 向けに最適化されており、情景 OCR(看板など紙以外にプリントされた文字の読み取り)向けには最適化されていません。
108
107
  - AI-OCR の識別精度を高めるために、入力画像の解像度が重要です。低解像度画像では識別精度が低下します。最低でも画像の短辺を 720px 以上の画像で推論することをお勧めします。
109
108
 
@@ -3,28 +3,31 @@ yomitoku/base.py,sha256=9U3sfe69O6vuO430JzzKQQNkgPsLM9WdLfOUUhp3Ljs,3878
3
3
  yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
4
4
  yomitoku/document_analyzer.py,sha256=wQMmXACDsDmyaxg2OnG9Og5Nx53WPUkQdUmgYtljACQ,16412
5
5
  yomitoku/layout_analyzer.py,sha256=VhNf1ZQFoozj6WUGk5ll1p2p1jk5X3j-JPcDbTAoSl4,1856
6
- yomitoku/layout_parser.py,sha256=V_mAkZxke1gwHfnxBFMTOJ8hnz2X_kfZu2lLiMd8cAs,7610
6
+ yomitoku/layout_parser.py,sha256=0MgbCsD90srQdsxkGEL0TgKm4rkmGzsQYx0sjKQ03yc,7718
7
7
  yomitoku/ocr.py,sha256=JSTjkupcxHITQm6ERnzU7As0c3KWf8-oxc0AqNoWHXo,2272
8
8
  yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
9
9
  yomitoku/table_structure_recognizer.py,sha256=tHjex6deT_FjRK5ePz9bUXA_QIhgv_vYtK-ynm4ALxg,9625
10
- yomitoku/text_detector.py,sha256=XgqhtbNcJww2x3BrH8EFz45qC6kqPKCX9hsa-dzRoIA,4274
11
- yomitoku/text_recognizer.py,sha256=t95sbxve-E9VOCaU9CFGZIlk_a4my9KfFfr9tXws9As,5871
10
+ yomitoku/text_detector.py,sha256=6IwEJJKp_F8YH0Oki0QV-Mqi--P2LGbNKo-_kxBB_eo,4383
11
+ yomitoku/text_recognizer.py,sha256=eaxozNu-Ms6iv8efbKZzn8pJNW1Wo4f86bGhzSMtv3s,5992
12
12
  yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- yomitoku/cli/main.py,sha256=WvQO9V5HzxxvRAIsGkrDl9OGrmaKsAbDBrg4ApCSy_c,10527
14
- yomitoku/configs/__init__.py,sha256=e1Alss5QJLZSNfD6zLEG6xu5vDQDw-4Jayiqq8bq52s,571
13
+ yomitoku/cli/main.py,sha256=jQCSwHw4oOwLQjARvaIO1yoSjz-2Rdb9c3DNShLS5OE,12038
14
+ yomitoku/configs/__init__.py,sha256=x5-ccjGiP6xxRtDPT7f1Enl7SsE0hSk0G8f7eF9V85I,886
15
15
  yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
16
+ yomitoku/configs/cfg_layout_parser_rtdtrv2_v2.py,sha256=nMrL3uvoVmyzZ909Bz2zmfp9b6AEBLKhIprOvQ5yiQE,2324
16
17
  yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
17
18
  yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
19
+ yomitoku/configs/cfg_text_detector_dbnet_v2.py,sha256=PzdV6-f75ba-KBEBcPxyo9STWQ6m5-1Rl3MFBLl2TSc,1148
18
20
  yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
19
21
  yomitoku/configs/cfg_text_recognizer_parseq_small.py,sha256=uCm_VC_G79IbZpOiK8fgYzAJ4b98H5pf328wyQomtfo,1259
22
+ yomitoku/configs/cfg_text_recognizer_parseq_v2.py,sha256=GfHzbByOKjH21PRTxT8x_fU4r4Mda6F750Z8pjNeb8g,1249
20
23
  yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
21
24
  yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
22
- yomitoku/data/functions.py,sha256=7a_3xDKAQVdWfzQwFcdyJBojoyzUa3ePZOnG4pX1dpI,7532
23
- yomitoku/export/__init__.py,sha256=fkwOtqH0lh6eZQW5b4EMSjIH1FmWYLKKszahR-jQYSg,366
24
- yomitoku/export/export_csv.py,sha256=B234jlNeO4n5kQ_lwxxAZe_O2ipTbeDYlWU1zyyaVrw,3001
25
- yomitoku/export/export_html.py,sha256=pCLoxV10_SzRWmZlDnHuyfPFIuUGB3ZkqSdABVU7DTs,5038
26
- yomitoku/export/export_json.py,sha256=D6dD04gcPR5lmfHFVX-iGOYapsOVaJ_kH1Qhs6d2O0M,2035
27
- yomitoku/export/export_markdown.py,sha256=D1kX3X8odWa0pf4AFZ6gik5EKMKK7pgpQXaHHv6pWDI,4170
25
+ yomitoku/data/functions.py,sha256=HIrffs0zCJOq8IvQiI_z-b4MwTb-H2wmZjEE_5VpxFs,8040
26
+ yomitoku/export/__init__.py,sha256=gmlikMHRXfzfJ_8q4fyDlnpGms-x1oggQOwJEWHMgBU,508
27
+ yomitoku/export/export_csv.py,sha256=4DT5Nf4FdeGP0olIzv1ypBlswkZSdMB4MeQOgYWe8uk,3375
28
+ yomitoku/export/export_html.py,sha256=syzAapHcUHcUlabmZcQdWiNy2NrRs7LPzA_x39pFtfQ,5494
29
+ yomitoku/export/export_json.py,sha256=6cSXSsyEVJ5Rw2nKSUOcW8_XlGmSLWlWQWCBNmRKsps,2386
30
+ yomitoku/export/export_markdown.py,sha256=7Jib-YXOw70H46kvNc6z0_3LFwX9iwp1eXxsGeylF0I,4681
28
31
  yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
29
32
  yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
30
33
  yomitoku/models/parseq.py,sha256=psCPjP3eKjOFAUZJPQQhbD0nWEV5FeOZ0tTK27Rvvbw,8748
@@ -40,7 +43,7 @@ yomitoku/onnx/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
43
  yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
41
44
  yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
42
45
  yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
43
- yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=TCv1t1zCxg2rSirsLm4sXlaltGubH-roVdEqnUoRs-8,3905
46
+ yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=zp_PEAIl0-b7EJIWVZFrAaEUBSp9OgBVd1G-mP9R20E,4350
44
47
  yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
45
48
  yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
46
49
  yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,7 +51,7 @@ yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
48
51
  yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
49
52
  yomitoku/utils/misc.py,sha256=FbwPLeIYYBvNf9wQh2RoEonTM5BF7_IwaEqmRsYHKA8,2673
50
53
  yomitoku/utils/visualizer.py,sha256=DjDwHiAu1iFRKh96H3Egq4vuI2s_-9dLCDeykhKi8jo,5251
51
- yomitoku-0.7.3.dist-info/METADATA,sha256=pHT4lQyl9cN4KbsOTooiJAaEEJqXhmAl9SVZKVaPkR0,8717
52
- yomitoku-0.7.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
53
- yomitoku-0.7.3.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
54
- yomitoku-0.7.3.dist-info/RECORD,,
54
+ yomitoku-0.8.0.dist-info/METADATA,sha256=CH5KOT64Q8AMOaKkUbbd9rI1Zcd_dBk_OXd2GguC4f0,8555
55
+ yomitoku-0.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
+ yomitoku-0.8.0.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
57
+ yomitoku-0.8.0.dist-info/RECORD,,