yomitoku 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -30,7 +30,7 @@ class PostProcess:
  thresh: float = 0.2
  box_thresh: float = 0.5
  max_candidates: int = 1500
- unclip_ratio: float = 2.0
+ unclip_ratio: float = 7.0


  @dataclass
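The default `unclip_ratio` jumps from 2.0 to 7.0 because, as the `dbnet_postporcessor.py` hunk further down shows, the value is no longer applied directly: it is divided by the square root of the detected box's shorter side before the polygon offset is computed. A minimal sketch of the resulting effective ratios; the pixel sizes are hypothetical values chosen only for illustration:

```python
import math

unclip_ratio = 7.0  # new default in 0.5.1
for short_side_px in (16, 25, 64, 144):  # hypothetical text-box heights
    # Effective ratio actually applied by the new postprocessor.
    effective = unclip_ratio / math.sqrt(short_side_px)
    print(f"short side {short_side_px:>3}px -> effective ratio {effective:.2f}")
# short side  16px -> effective ratio 1.75
# short side  25px -> effective ratio 1.40
# short side  64px -> effective ratio 0.88
# short side 144px -> effective ratio 0.58
```

For small text the expansion stays comparable to the old fixed value, while large text gets proportionally less padding.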
@@ -3,7 +3,7 @@ from pathlib import Path
  import cv2
  import numpy as np
  import torch
- from pdf2image import convert_from_path
+ import pypdfium2

  from ..constants import (
  MIN_IMAGE_SIZE,
@@ -70,6 +70,7 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
  Returns:
  list[np.ndarray]: list of image data(BGR)
  """
+
  pdf_path = Path(pdf_path)
  if not pdf_path.exists():
  raise FileNotFoundError(f"File not found: {pdf_path}")
@@ -86,11 +87,19 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
  )

  try:
- images = convert_from_path(pdf_path, dpi=dpi)
+ doc = pypdfium2.PdfDocument(pdf_path)
+ renderer = doc.render(
+ pypdfium2.PdfBitmap.to_pil,
+ scale=dpi / 72,
+ )
+ images = list(renderer)
+ images = [np.array(image.convert("RGB"))[:, :, ::-1] for image in images]
+
+ doc.close()
  except Exception as e:
  raise ValueError(f"Failed to open the PDF file: {pdf_path}") from e

- return [np.array(img)[:, :, ::-1] for img in images]
+ return images


  def resize_shortest_edge(
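The PDF loader drops pdf2image (which shells out to the external poppler binaries) in favor of pypdfium2, which bundles PDFium and needs no system dependency; pages are rendered at a `dpi / 72` scale and converted to BGR arrays. As an illustration only, here is a hedged per-page equivalent of the batch `doc.render(...)` call shown above, using pypdfium2's page API (the function name is mine, not the package's):

```python
import numpy as np
import pypdfium2


def pdf_to_bgr_arrays(pdf_path: str, dpi: int = 200) -> list[np.ndarray]:
    """Rough per-page equivalent of load_pdf's pypdfium2 path (sketch, not the package code)."""
    doc = pypdfium2.PdfDocument(pdf_path)
    try:
        images = []
        for page in doc:
            # PDF user space is 72 dpi, so scale = dpi / 72.
            bitmap = page.render(scale=dpi / 72)
            rgb = np.asarray(bitmap.to_pil().convert("RGB"))
            images.append(rgb[:, :, ::-1])  # RGB -> BGR, matching load_pdf's output
        return images
    finally:
        doc.close()
```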
@@ -193,9 +202,7 @@ def validate_quads(img: np.ndarray, quads: list[list[list[int]]]):
  h, w = img.shape[:2]

  if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
- raise ValueError(
- f"The vertices are out of the image. {quad.tolist()}"
- )
+ raise ValueError(f"The vertices are out of the image. {quad.tolist()}")

  return True

@@ -109,8 +109,8 @@ def extract_words_within_element(pred_words, element):
  if len(contained_words) == 0:
  return None, None, check_list

- mean_width = word_sum_width / len(contained_words)
- mean_height = word_sum_height / len(contained_words)
+ # mean_width = word_sum_width / len(contained_words)
+ # mean_height = word_sum_height / len(contained_words)

  word_direction = [word.direction for word in contained_words]
  cnt_horizontal = word_direction.count("horizontal")
@@ -120,18 +120,12 @@ def extract_words_within_element(pred_words, element):
  if element_direction == "horizontal":
  contained_words = sorted(
  contained_words,
- key=lambda x: (
- x.points[0][1] // int(mean_height),
- x.points[0][0],
- ),
+ key=lambda x: (sum([p[1] for p in x.points]) / 4),
  )
  else:
  contained_words = sorted(
  contained_words,
- key=lambda x: (
- x.points[1][0] // int(mean_width),
- x.points[1][1],
- ),
+ key=lambda x: (sum([p[0] for p in x.points]) / 4),
  reverse=True,
  )
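The ordering key no longer buckets words by the (now commented-out) mean word height or width; words inside an element are simply sorted by the centroid coordinate of their quad: mean y for horizontal elements (top to bottom) and mean x with `reverse=True` for vertical ones (right to left). A small self-contained illustration; the `Word` container here is made up and only the sort keys mirror the diff:

```python
from dataclasses import dataclass


@dataclass
class Word:
    text: str
    points: list  # four [x, y] corners of the word quad


words = [
    Word("second line", [[10, 30], [90, 30], [90, 50], [10, 50]]),
    Word("first line", [[10, 5], [90, 5], [90, 25], [10, 25]]),
]

# Horizontal elements: order top-to-bottom by the mean y of the four corners.
by_mean_y = sorted(words, key=lambda w: sum(p[1] for p in w.points) / 4)
print([w.text for w in by_mean_y])  # ['first line', 'second line']

# Vertical elements: order right-to-left by the mean x, hence reverse=True.
by_mean_x = sorted(words, key=lambda w: sum(p[0] for p in w.points) / 4, reverse=True)
```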
@@ -27,8 +27,7 @@ class LayoutAnalyzer:

  if isinstance(configs, dict):
  assert (
- "layout_parser" in configs
- or "table_structure_recognizer" in configs
+ "layout_parser" in configs or "table_structure_recognizer" in configs
  ), "Invalid config key. Please check the config keys."

  if "layout_parser" in configs:
@@ -53,9 +52,7 @@ class LayoutAnalyzer:
  def __call__(self, img):
  layout_results, vis = self.layout_parser(img)
  table_boxes = [table.box for table in layout_results.tables]
- table_results, vis = self.table_structure_recognizer(
- img, table_boxes, vis=vis
- )
+ table_results, vis = self.table_structure_recognizer(img, table_boxes, vis=vis)

  results = LayoutAnalyzerSchema(
  paragraphs=layout_results.paragraphs,
@@ -20,9 +20,7 @@ class BackboneBase(nn.Module):
  "layer4": "layer4",
  }

- self.body = IntermediateLayerGetter(
- backbone, return_layers=return_layers
- )
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)

  def forward(self, tensor):
  xs = self.body(tensor)
@@ -57,18 +55,10 @@ class DBNetDecoder(nn.Module):
  self.training = True
  self.input_proj = nn.ModuleDict(
  {
- "layer1": nn.Conv2d(
- in_channels[0], self.d_model, 1, bias=False
- ),
- "layer2": nn.Conv2d(
- in_channels[1], self.d_model, 1, bias=False
- ),
- "layer3": nn.Conv2d(
- in_channels[2], self.d_model, 1, bias=False
- ),
- "layer4": nn.Conv2d(
- in_channels[3], self.d_model, 1, bias=False
- ),
+ "layer1": nn.Conv2d(in_channels[0], self.d_model, 1, bias=False),
+ "layer2": nn.Conv2d(in_channels[1], self.d_model, 1, bias=False),
+ "layer3": nn.Conv2d(in_channels[2], self.d_model, 1, bias=False),
+ "layer4": nn.Conv2d(in_channels[3], self.d_model, 1, bias=False),
  }
  )

@@ -89,9 +79,7 @@ class DBNetDecoder(nn.Module):
  padding=1,
  bias=False,
  ),
- nn.Upsample(
- scale_factor=2, mode="bilinear", align_corners=False
- ),
+ nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),
  ),
  "layer3": nn.Sequential(
  nn.Conv2d(
@@ -101,9 +89,7 @@ class DBNetDecoder(nn.Module):
  padding=1,
  bias=False,
  ),
- nn.Upsample(
- scale_factor=4, mode="bilinear", align_corners=False
- ),
+ nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
  ),
  "layer4": nn.Sequential(
  nn.Conv2d(
@@ -113,17 +99,13 @@ class DBNetDecoder(nn.Module):
  padding=1,
  bias=False,
  ),
- nn.Upsample(
- scale_factor=4, mode="bilinear", align_corners=False
- ),
+ nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
  ),
  }
  )

  self.binarize = nn.Sequential(
- nn.Conv2d(
- self.d_model, self.d_model // 4, 3, padding=1, bias=False
- ),
+ nn.Conv2d(self.d_model, self.d_model // 4, 3, padding=1, bias=False),
  nn.BatchNorm2d(self.d_model // 4),
  nn.ReLU(inplace=True),
  nn.ConvTranspose2d(self.d_model // 4, self.d_model // 4, 2, 2),
@@ -166,16 +148,12 @@ class DBNetDecoder(nn.Module):
  m.weight.data.fill_(1.0)
  m.bias.data.fill_(1e-4)

- def _init_thresh(
- self, inner_channels, serial=False, smooth=False, bias=False
- ):
+ def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
  in_channels = inner_channels
  if serial:
  in_channels += 1
  self.thresh = nn.Sequential(
- nn.Conv2d(
- in_channels, inner_channels // 4, 3, padding=1, bias=bias
- ),
+ nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
  nn.BatchNorm2d(inner_channels // 4),
  nn.ReLU(inplace=True),
  self._init_upsample(
@@ -186,16 +164,12 @@ class DBNetDecoder(nn.Module):
  ),
  nn.BatchNorm2d(inner_channels // 4),
  nn.ReLU(inplace=True),
- self._init_upsample(
- inner_channels // 4, 1, smooth=smooth, bias=bias
- ),
+ self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
  nn.Sigmoid(),
  )
  return self.thresh

- def _init_upsample(
- self, in_channels, out_channels, smooth=False, bias=False
- ):
+ def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
  if smooth:
  inter_out_channels = out_channels
  if out_channels == 1:
@@ -1,5 +1,4 @@
- """Copyright(c) 2023 lyuwenyu. All Rights Reserved.
- """
+ """Copyright(c) 2023 lyuwenyu. All Rights Reserved."""

  from collections import OrderedDict

@@ -48,9 +47,7 @@ class ConvNormLayer(nn.Module):
  class BasicBlock(nn.Module):
  expansion = 1

- def __init__(
- self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
- ):
+ def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
  super().__init__()

  self.shortcut = shortcut
@@ -89,9 +86,7 @@ class BasicBlock(nn.Module):
  class BottleNeck(nn.Module):
  expansion = 4

- def __init__(
- self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
- ):
+ def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
  super().__init__()

  if variant == "a":
@@ -114,17 +109,13 @@ class BottleNeck(nn.Module):
  ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
  (
  "conv",
- ConvNormLayer(
- ch_in, ch_out * self.expansion, 1, 1
- ),
+ ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
  ),
  ]
  )
  )
  else:
- self.short = ConvNormLayer(
- ch_in, ch_out * self.expansion, 1, stride
- )
+ self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)

  self.act = nn.Identity() if act is None else get_activation(act)

@@ -145,9 +136,7 @@ class BottleNeck(nn.Module):


  class Blocks(nn.Module):
- def __init__(
- self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
- ):
+ def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
  super().__init__()

  self.blocks = nn.ModuleList()
@@ -1,5 +1,4 @@
- """Copyright(c) 2023 lyuwenyu. All Rights Reserved.
- """
+ """Copyright(c) 2023 lyuwenyu. All Rights Reserved."""

  import copy
  from collections import OrderedDict
@@ -241,9 +240,7 @@ class HybridEncoder(nn.Module):
  for in_channel in in_channels:
  if version == "v1":
  proj = nn.Sequential(
- nn.Conv2d(
- in_channel, hidden_dim, kernel_size=1, bias=False
- ),
+ nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
  nn.BatchNorm2d(hidden_dim),
  )
  elif version == "v2":
@@ -279,9 +276,7 @@ class HybridEncoder(nn.Module):

  self.encoder = nn.ModuleList(
  [
- TransformerEncoder(
- copy.deepcopy(encoder_layer), num_encoder_layers
- )
+ TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
  for _ in range(len(use_encoder_idx))
  ]
  )
@@ -336,9 +331,7 @@ class HybridEncoder(nn.Module):
  # self.register_buffer(f'pos_embed{idx}', pos_embed)

  @staticmethod
- def build_2d_sincos_position_embedding(
- w, h, embed_dim=256, temperature=10000.0
- ):
+ def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
  """ """
  grid_w = torch.arange(int(w), dtype=torch.float32)
  grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -376,9 +369,7 @@ class HybridEncoder(nn.Module):
  src_flatten.device
  )

- memory: torch.Tensor = self.encoder[i](
- src_flatten, pos_embed=pos_embed
- )
+ memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
  proj_feats[enc_ind] = (
  memory.permute(0, 2, 1)
  .reshape(-1, self.hidden_dim, h, w)
@@ -390,13 +381,9 @@ class HybridEncoder(nn.Module):
  for idx in range(len(self.in_channels) - 1, 0, -1):
  feat_heigh = inner_outs[0]
  feat_low = proj_feats[idx - 1]
- feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
- feat_heigh
- )
+ feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
  inner_outs[0] = feat_heigh
- upsample_feat = F.interpolate(
- feat_heigh, scale_factor=2.0, mode="nearest"
- )
+ upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
  inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
  torch.concat([upsample_feat, feat_low], dim=1)
  )
yomitoku/models/parseq.py CHANGED
@@ -26,9 +26,7 @@ from ..postprocessor import ParseqTokenizer as Tokenizer
  from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding


- def init_weights(
- module: nn.Module, name: str = "", exclude: Sequence[str] = ()
- ):
+ def init_weights(module: nn.Module, name: str = "", exclude: Sequence[str] = ()):
  """Initialize the weights using the typical initialization schemes used in SOTA models."""
  if any(map(name.startswith, exclude)):
  return
@@ -41,9 +39,7 @@ def init_weights(
  if module.padding_idx is not None:
  module.weight.data[module.padding_idx].zero_()
  elif isinstance(module, nn.Conv2d):
- nn.init.kaiming_normal_(
- module.weight, mode="fan_out", nonlinearity="relu"
- )
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
  if module.bias is not None:
  nn.init.zeros_(module.bias)
  elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
@@ -93,9 +89,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
  @torch.jit.ignore
  def no_weight_decay(self):
  param_names = {"text_embed.embedding.weight", "pos_queries"}
- enc_param_names = {
- "encoder." + n for n in self.encoder.no_weight_decay()
- }
+ enc_param_names = {"encoder." + n for n in self.encoder.no_weight_decay()}
  return param_names.union(enc_param_names)

  def encode(self, img: torch.Tensor):
@@ -149,9 +143,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):

  # Special case for the forward permutation. Faster than using `generate_attn_masks()`
  tgt_mask = query_mask = torch.triu(
- torch.ones(
- (num_steps, num_steps), dtype=torch.bool, device=self._device
- ),
+ torch.ones((num_steps, num_steps), dtype=torch.bool, device=self._device),
  1,
  )

@@ -185,10 +177,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
  # greedy decode. add the next token index to the target input
  tgt_in[:, j] = p_i.squeeze().argmax(-1)
  # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
- if (
- testing
- and (tgt_in == tokenizer.eos_id).any(dim=-1).all()
- ):
+ if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
  break

  logits = torch.cat(logits, dim=1)
@@ -227,9 +216,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
  # Prior context is the previous output.
  tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
  # Mask tokens beyond the first EOS token.
- tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
- -1
- ) > 0
+ tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(-1) > 0
  tgt_out = self.decode(
  tgt_in,
  memory,
@@ -1,13 +1,12 @@
  import cv2
+ import math
  import numpy as np
  import pyclipper
  from shapely.geometry import Polygon


  class DBnetPostProcessor:
- def __init__(
- self, min_size, thresh, box_thresh, max_candidates, unclip_ratio
- ):
+ def __init__(self, min_size, thresh, box_thresh, max_candidates, unclip_ratio):
  self.min_size = min_size
  self.thresh = thresh
  self.box_thresh = box_thresh
@@ -24,9 +23,7 @@ class DBnetPostProcessor:
  pred = preds["binary"][0]
  segmentation = self.binarize(pred)[0]
  height, width = image_size
- quads, scores = self.boxes_from_bitmap(
- pred, segmentation, width, height
- )
+ quads, scores = self.boxes_from_bitmap(pred, segmentation, width, height)
  return quads, scores

  def binarize(self, pred):
@@ -65,9 +62,7 @@ class DBnetPostProcessor:
  if self.box_thresh > score:
  continue

- box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(
- -1, 1, 2
- )
+ box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
  box, sside = self.get_mini_boxes(box)
  if sside < self.min_size + 2:
  continue
@@ -76,9 +71,7 @@ class DBnetPostProcessor:
  dest_width = dest_width.item()
  dest_height = dest_height.item()

- box[:, 0] = np.clip(
- np.round(box[:, 0] / width * dest_width), 0, dest_width
- )
+ box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
  box[:, 1] = np.clip(
  np.round(box[:, 1] / height * dest_height), 0, dest_height
  )
@@ -88,9 +81,17 @@ class DBnetPostProcessor:

  return boxes, scores

- def unclip(self, box, unclip_ratio=1.5):
+ def unclip(self, box, unclip_ratio=7):
+ # Small characters tend to get clipped, while large characters get an overly large margin.
+ # To address this, the expansion parameter is scaled dynamically with the character size.
+ # Note: this rule is a heuristic with no theoretical justification.
  poly = Polygon(box)
- distance = poly.area * unclip_ratio / poly.length
+ width = box[:, 0].max() - box[:, 0].min()
+ height = box[:, 1].max() - box[:, 1].min()
+ box_dist = min(width, height)
+ ratio = unclip_ratio / math.sqrt(box_dist)
+
+ distance = poly.area * ratio / poly.length
  offset = pyclipper.PyclipperOffset()
  offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
  expanded = np.array(offset.Execute(distance))
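Taken together with the config change above, `unclip` now expands each detected quad by a distance that shrinks as the text box grows: per the (translated) comments, small characters were being clipped and large ones over-padded, and the scaling rule is an acknowledged heuristic. A self-contained sketch of the same formula, with a worked example box of my own:

```python
import math

import numpy as np
import pyclipper
from shapely.geometry import Polygon


def unclip(box: np.ndarray, unclip_ratio: float = 7.0) -> np.ndarray:
    """Same formula as the 0.5.1 postprocessor; the wrapper and example are illustrative."""
    poly = Polygon(box)
    # Scale the expansion by the box's shorter side so small characters are not
    # clipped and large characters do not receive an oversized margin.
    box_dist = min(box[:, 0].max() - box[:, 0].min(), box[:, 1].max() - box[:, 1].min())
    ratio = unclip_ratio / math.sqrt(box_dist)
    distance = poly.area * ratio / poly.length
    offset = pyclipper.PyclipperOffset()
    offset.AddPath(box.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    return np.array(offset.Execute(distance))


quad = np.array([[0, 0], [100, 0], [100, 20], [0, 20]])  # a 100x20 px text box
expanded = unclip(quad)  # distance = 2000 * (7 / sqrt(20)) / 240 ≈ 13 px outward
```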
@@ -122,7 +122,5 @@ class ParseqTokenizer(BaseTokenizer):
  eos_idx = len(ids) # Nothing to truncate.
  # Truncate after EOS
  ids = ids[:eos_idx]
- probs = probs[
- : eos_idx + 1
- ] # but include prob. for EOS (if it exists)
+ probs = probs[: eos_idx + 1] # but include prob. for EOS (if it exists)
  return probs, ids
yomitoku/utils/misc.py CHANGED
@@ -1,5 +1,5 @@
  def load_charset(charset_path):
- with open(charset_path, "r") as f:
+ with open(charset_path, "r", encoding="utf-8") as f:
  charset = f.read()
  return charset

@@ -1,8 +1,10 @@
  import cv2
  import numpy as np
- from PIL import Image, ImageDraw, ImageFont
-
+ from PIL import Image, ImageDraw, ImageFont, features
  from ..constants import PALETTE
+ from .logger import set_logger
+
+ logger = set_logger(__name__, "INFO")


  def _reading_order_visualizer(img, elements, line_color, tip_size):
@@ -148,13 +150,18 @@ def rec_visualizer(
  out = img.copy()
  pillow_img = Image.fromarray(out)
  draw = ImageDraw.Draw(pillow_img)
+ has_raqm = features.check_feature(feature="raqm")
+ if not has_raqm:
+ logger.warning(
+ "libraqm is not installed. Vertical text rendering is not supported. Rendering horizontally instead."
+ )

  for pred, quad, direction in zip(
  outputs.contents, outputs.points, outputs.directions
  ):
  quad = np.array(quad).astype(np.int32)
  font = ImageFont.truetype(font_path, font_size)
- if direction == "horizontal":
+ if direction == "horizontal" or not has_raqm:
  x_offset = 0
  y_offset = -font_size
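The visualizer now checks for libraqm through `PIL.features` and, when it is missing, logs a single warning and places every string with the horizontal offsets, since Pillow's top-to-bottom (`"ttb"`) text direction requires libraqm. A minimal sketch of that branch decision; the helper name is mine and the `"ttb"` branch is inferred from the surrounding context lines:

```python
from typing import Optional

from PIL import features

# True only when Pillow was built with libraqm (complex text layout support).
has_raqm = features.check_feature(feature="raqm")


def text_direction(word_direction: str) -> Optional[str]:
    """Pick the `direction` argument for ImageDraw.Draw.text (sketch only)."""
    if word_direction == "vertical" and has_raqm:
        return "ttb"  # top-to-bottom rendering, only available with libraqm
    return None  # default left-to-right rendering, used as the fallback
```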
@@ -1,7 +1,7 @@
  Metadata-Version: 2.3
  Name: yomitoku
- Version: 0.4.1
- Summary: Yomitoku is a document image analysis package powered by AI technology for the Japanese language.
+ Version: 0.5.1
+ Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
  Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
  License: CC BY-NC-SA 4.0
  Keywords: Deep Learning,Japanese,OCR
@@ -10,24 +10,25 @@ Requires-Dist: huggingface-hub>=0.26.1
  Requires-Dist: lxml>=5.3.0
  Requires-Dist: omegaconf>=2.3.0
  Requires-Dist: opencv-python>=4.10.0.84
- Requires-Dist: pdf2image>=1.17.0
  Requires-Dist: pyclipper>=1.3.0.post6
  Requires-Dist: pydantic>=2.9.2
+ Requires-Dist: pypdfium2>=4.30.0
  Requires-Dist: shapely>=2.0.6
  Requires-Dist: timm>=1.0.11
- Requires-Dist: torch>=2.5.0
+ Requires-Dist: torch==2.5.0
  Requires-Dist: torchvision>=0.20.0
  Description-Content-Type: text/markdown

- # YomiToku
+ Japanese | [English](README_EN.md)
+
+ <img src="static/logo/horizontal.png" width="800px">

  ![Python](https://img.shields.io/badge/Python-3.9|3.10|3.11|3.12-F9DC3E.svg?logo=python&logoColor=&style=flat)
  ![Pytorch](https://img.shields.io/badge/Pytorch-2.5-EE4C2C.svg?logo=Pytorch&style=fla)
- ![OS](https://img.shields.io/badge/OS-Linux|MacOS-1793D1.svg?&style=fla)
+ ![CUDA](https://img.shields.io/badge/CUDA->=11.8-76B900.svg?logo=NVIDIA&style=fla)
+ ![OS](https://img.shields.io/badge/OS-Linux|Mac|Win-1793D1.svg?&style=fla)
  [![Document](https://img.shields.io/badge/docs-live-brightgreen)](https://kotaro-kinoshita.github.io/yomitoku-dev/)

- <img src="static/logo/horizontal.png" width="800px">
-
  ## 🌟 Overview

  YomiToku is an AI document image analysis engine (Document AI) specialized for Japanese. It provides full-text OCR of the characters in an image together with layout analysis, recognizing, extracting, and converting the text and the figures/tables the image contains.
@@ -60,31 +61,16 @@ For the results exported to Markdown, see the [s

  ## 📣 Release notes

- - 2024-12XX: Released YomiToku vX.X.X
+ - 2024-11-26: Released YomiToku v0.5.1 (beta)

  ## 💡 Installation

  ```
- pip install git+https://github.com/kotaro-kinoshita/yomitoku-dev.git@main
+ pip install yomitoku
  ```

- - Install a pytorch build that matches your GPU environment
-
- ### Dependencies
-
- To analyze PDF files, [poppler](https://poppler.freedesktop.org/) must be installed separately.
-
- **Mac**
-
- ```
- brew install poppler
- ```
-
- **Linux**
-
- ```
- apt install poppler-utils -y
- ```
+ - Install a pytorch build that matches your CUDA version. By default, a build targeting CUDA 12.4 or later is installed.
+ - pytorch 2.5 or later is supported, which in turn requires CUDA 11.8 or later. If your environment cannot satisfy this, use the Dockerfile in the repository.

  ## 🚀 Usage

@@ -98,8 +84,8 @@ yomitoku ${path_data} -f md -o results -v --figure
  - `-v`, `--vis` outputs images that visualize the analysis results.
  - `-d`, `--device` specifies the device on which to run the models. If no GPU is available, inference falls back to CPU. (default: cuda)
  - `--ignore_line_break` ignores the line-break positions in the image and returns the text of each paragraph joined together. (default: line breaks follow the image.)
- - `figure_letter` also exports the text contained in detected figures and tables to the output file.
- - `figure` exports the detected figures and images to the output file. (html and markdown only)
+ - `--figure_letter` also exports the text contained in detected figures and tables to the output file.
+ - `--figure` exports the detected figures and images to the output file. (html and markdown only)

  For the other options, see the help

@@ -107,11 +93,10 @@ yomitoku ${path_data} -f md -o results -v --figure
  yomitoku --help
  ```

- ### Note
-
- - The models are not optimized for CPU inference and processing takes longer, so running on a GPU is recommended.
+ **NOTE**
+ - Running on a GPU is recommended. The models are not optimized for CPU inference, and processing times will be long.
  - Only printed text is supported. Handwritten characters can sometimes be read, but they are not officially supported.
- - OCR is broadly divided into document OCR and scene OCR (text printed on things other than paper, such as signboards); Yomitoku is optimized for document OCR.
+ - Yomitoku is optimized for document OCR and is not optimized for scene OCR (reading text printed on things other than paper, such as signboards).
  - The resolution of the input image is important for AI-OCR recognition accuracy, which degrades on low-resolution images. We recommend running inference on images whose short side is at least 720px.

  ## 📝 Documentation
@@ -120,8 +105,8 @@ yomitoku --help

  ## LICENSE

- The resources stored in this repository are licensed under YomiToku CC BY-NC-SA 4.0.
- Non-commercial personal use and research use are freely permitted.
+ The source code stored in this repository and the model weight files on HuggingFaceHub associated with this project are licensed under CC BY-NC-SA 4.0.
+ Feel free to use them for non-commercial personal use and research purposes.
  For commercial use, a separate commercial license is available; please contact the developers.

- YomiToku © 2024 by MLism Inc. is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
+ YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
@@ -1,8 +1,8 @@
  yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
  yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
  yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
- yomitoku/document_analyzer.py,sha256=0dTH6YrCvp2EZXufPnSN4DdF95DZ0_z1TIDML744oX0,10029
- yomitoku/layout_analyzer.py,sha256=WIP8PjuayoM7VNtmrbb1b1r4joHYuSyIHg91GZ3F46s,2071
+ yomitoku/document_analyzer.py,sha256=HIg-nVzDhJIP-h-tn4uU86KakgHdlAhosEqK_i-SWe4,9906
+ yomitoku/layout_analyzer.py,sha256=QTeRcVd8aySz8u6dg2ikET77ar3sqlukRLBwYfTyMPM,2033
  yomitoku/layout_parser.py,sha256=V2jCNHE61jNp8ytYdKwPV34V5qEK7y-7-Mq7-AkoQhU,5898
  yomitoku/ocr.py,sha256=Rcojw0aGA6yDF2RjqfK23_rMw-xm61KGd8JmTCTOOVU,2516
  yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
@@ -14,39 +14,39 @@ yomitoku/cli/main.py,sha256=MBD0S4sXgquJ8P2egkZjJcglXvCke5Uw46C28SDtr8g,6252
  yomitoku/configs/__init__.py,sha256=KBhb9S7xt22HZaIcoWSgZHfscXXj9YlimOwLH5z9CRo,454
  yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
  yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
- yomitoku/configs/cfg_text_detector_dbnet.py,sha256=AUl9aStR6z7SEPldIDd7GNQVPQx0eUlyn6ui3B4RVjA,1153
+ yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
  yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
  yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
  yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
- yomitoku/data/functions.py,sha256=2rJz4Gfd3UzlTq2bzXyFhcwtxJoUjNsnnNMJfk5-i4o,7361
+ yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
  yomitoku/export/__init__.py,sha256=aANEfuovH2aevFjb2pGrBLFP-4iRzEzD9wcriCR-M7I,229
  yomitoku/export/export_csv.py,sha256=-n8eYPIzDQuiixeqpTbWaN9aQ5oFyl7XRfpv51oKPTI,1979
  yomitoku/export/export_html.py,sha256=X3H_orkS1BRlQo8Z1NzgrFwsIboDzRAx9etmqj90k2Y,4866
  yomitoku/export/export_json.py,sha256=1ChvCAHfCmMQvCfcAb1p3fSpr4elNAs3xBSIbpfn3bc,998
  yomitoku/export/export_markdown.py,sha256=mCcsXUWBLrYc1NcRSBFfBT28d6eCddAF1oHp0qdBEnE,3986
  yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
- yomitoku/models/dbnet_plus.py,sha256=VsE9anGOL1OzCivLilWpJ__32JHnSBEJOwdk_fpHE_o,8428
- yomitoku/models/parseq.py,sha256=OfN3ts1Z6f5T27amoRKvnL8qCma-wf0veIbWWoG4GuU,8801
+ yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
+ yomitoku/models/parseq.py,sha256=7QT-q5_oWqXTDXobRk1R6Lpap_AxdC4AzkSsOgXjOwM,8611
  yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
  yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  yomitoku/models/layers/activate.py,sha256=HUw0q-76RNjZF-o9O3fowfJcw0t1H5o0pbyioGdqUvU,668
  yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
  yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
- yomitoku/models/layers/rtdetr_backbone.py,sha256=8-57bh8IjUCL94qM5mTpOXUTYPih1Xek5E8xs5pMGBE,9537
- yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=i19sqZAIfPVotvpWBuxpxbepi9xhnGlRpaiL9XMp_Cc,13804
+ yomitoku/models/layers/rtdetr_backbone.py,sha256=QjfLW-3qn2My3Jbg6yLORX8A-D2sph9J9u3r5nNnDLo,9386
+ yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=D3dK37k7_0jPqV39-6Se8kBzF_SyZttNlbLleyNFiJU,13607
  yomitoku/models/layers/rtdetrv2_decoder.py,sha256=5bVYPLFYCy3PcjyHTPFHNLWqg3bctrk-dKVG4kayhaw,27517
  yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
- yomitoku/postprocessor/dbnet_postporcessor.py,sha256=iEPOWbGaJ8YIYCQJOpBadbf7uUGEAPjmeNDcNAvY8yc,4523
- yomitoku/postprocessor/parseq_tokenizer.py,sha256=eXIHIazEkByjyXKegYEzQ3CE0ReAJYIC2VpQJjnNQjU,4337
+ yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
+ yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
  yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=f52wfRKrxqSXy_LeidKDR9XAta_qPjto-oYEdO0XL8A,3386
  yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
  yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
  yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
  yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
- yomitoku/utils/misc.py,sha256=dC1w3DmsoU_ECqngbAs14vPOFCbcecZSGmbztgwx4XU,2479
- yomitoku/utils/visualizer.py,sha256=EEDo4bts61FX6mJecgJiHtzY2vLH6sJOQgOVr9yVsF0,4912
- yomitoku-0.4.1.dist-info/METADATA,sha256=mTZjZU6_zGTcnYgGR7bu5nGNOnys7DFH_NkAbq3FQrc,7553
- yomitoku-0.4.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- yomitoku-0.4.1.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
- yomitoku-0.4.1.dist-info/RECORD,,
+ yomitoku/utils/misc.py,sha256=2Eyy7-9K_h4Mal1zGXq6OlxubfNzhS0mEYwn_xt7xl8,2497
+ yomitoku/utils/visualizer.py,sha256=2pSmbhUPylzVVJ0bXtGDoNmMdArAByab4Py7Xavvs_A,5230
+ yomitoku-0.5.1.dist-info/METADATA,sha256=-8bUVnN26cxYlZO0ZQH3liki_xMfhUX47ruHLl-2BGM,7817
+ yomitoku-0.5.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+ yomitoku-0.5.1.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+ yomitoku-0.5.1.dist-info/RECORD,,