yomitoku 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/configs/cfg_text_detector_dbnet.py +1 -1
- yomitoku/data/functions.py +13 -6
- yomitoku/document_analyzer.py +4 -10
- yomitoku/layout_analyzer.py +2 -5
- yomitoku/models/dbnet_plus.py +13 -39
- yomitoku/models/layers/rtdetr_backbone.py +6 -17
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +7 -20
- yomitoku/models/parseq.py +6 -19
- yomitoku/postprocessor/dbnet_postporcessor.py +15 -14
- yomitoku/postprocessor/parseq_tokenizer.py +1 -3
- yomitoku/utils/misc.py +1 -1
- yomitoku/utils/visualizer.py +10 -3
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/METADATA +21 -36
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/RECORD +16 -16
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/WHEEL +0 -0
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/entry_points.txt +0 -0
yomitoku/data/functions.py
CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
 import cv2
 import numpy as np
 import torch
-from pdf2image import convert_from_path
+import pypdfium2
 
 from ..constants import (
     MIN_IMAGE_SIZE,
@@ -70,6 +70,7 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
     Returns:
         list[np.ndarray]: list of image data(BGR)
     """
+
     pdf_path = Path(pdf_path)
     if not pdf_path.exists():
         raise FileNotFoundError(f"File not found: {pdf_path}")
@@ -86,11 +87,19 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
         )
 
     try:
-        images = convert_from_path(pdf_path, dpi=dpi)
+        doc = pypdfium2.PdfDocument(pdf_path)
+        renderer = doc.render(
+            pypdfium2.PdfBitmap.to_pil,
+            scale=dpi / 72,
+        )
+        images = list(renderer)
+        images = [np.array(image.convert("RGB"))[:, :, ::-1] for image in images]
+
+        doc.close()
     except Exception as e:
         raise ValueError(f"Failed to open the PDF file: {pdf_path}") from e
 
-    return
+    return images
 
 
 def resize_shortest_edge(
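As context for the hunk above: 0.5.1 swaps the poppler-backed pdf2image dependency for pypdfium2, which needs no system package. A minimal standalone sketch of the same loading path, assuming pypdfium2>=4.30 as pinned in METADATA below (`sample.pdf` is a hypothetical input file):

```python
import numpy as np
import pypdfium2


def load_pdf_bgr(pdf_path: str, dpi: int = 200) -> list[np.ndarray]:
    # Render each page at the requested DPI (PDF user space is 72 units per inch).
    doc = pypdfium2.PdfDocument(pdf_path)
    renderer = doc.render(pypdfium2.PdfBitmap.to_pil, scale=dpi / 72)
    # PIL yields RGB; reversing the channel axis gives the OpenCV-style BGR
    # layout the rest of the pipeline expects.
    images = [np.array(img.convert("RGB"))[:, :, ::-1] for img in renderer]
    doc.close()
    return images


pages = load_pdf_bgr("sample.pdf")
print(len(pages), pages[0].shape)  # e.g. number of pages, (H, W, 3)
```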
@@ -193,9 +202,7 @@ def validate_quads(img: np.ndarray, quads: list[list[list[int]]]):
         h, w = img.shape[:2]
 
         if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
-            raise ValueError(
-                f"The vertices are out of the image. {quad.tolist()}"
-            )
+            raise ValueError(f"The vertices are out of the image. {quad.tolist()}")
 
     return True
yomitoku/document_analyzer.py
CHANGED
@@ -109,8 +109,8 @@ def extract_words_within_element(pred_words, element):
     if len(contained_words) == 0:
         return None, None, check_list
 
-    mean_width = word_sum_width / len(contained_words)
-    mean_height = word_sum_height / len(contained_words)
+    # mean_width = word_sum_width / len(contained_words)
+    # mean_height = word_sum_height / len(contained_words)
 
     word_direction = [word.direction for word in contained_words]
     cnt_horizontal = word_direction.count("horizontal")
@@ -120,18 +120,12 @@ def extract_words_within_element(pred_words, element):
     if element_direction == "horizontal":
         contained_words = sorted(
             contained_words,
-            key=lambda x: (
-                x.points[0][1] // int(mean_height),
-                x.points[0][0],
-            ),
+            key=lambda x: (sum([p[1] for p in x.points]) / 4),
         )
     else:
         contained_words = sorted(
             contained_words,
-            key=lambda x: (
-                x.points[1][0] // int(mean_width),
-                x.points[1][1],
-            ),
+            key=lambda x: (sum([p[0] for p in x.points]) / 4),
             reverse=True,
         )
 
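The replaced keys bucketed words by mean word size before ordering; the new keys simply order by the centroid coordinate of each quad: mean y for horizontal text (top to bottom) and mean x, reversed, for vertical text (right to left, the reading order of Japanese columns). A toy illustration with hypothetical quads:

```python
# Hypothetical word quads: four (x, y) corners per word, as in word.points.
words = [
    {"text": "line2", "points": [[0, 30], [40, 30], [40, 42], [0, 42]]},
    {"text": "line1", "points": [[0, 0], [40, 0], [40, 12], [0, 12]]},
]

# Horizontal element: top-to-bottom by the mean y of the four corners.
horizontal = sorted(words, key=lambda w: sum(p[1] for p in w["points"]) / 4)

# Vertical element: right-to-left by the mean x, hence reverse=True.
vertical = sorted(words, key=lambda w: sum(p[0] for p in w["points"]) / 4, reverse=True)

print([w["text"] for w in horizontal])  # ['line1', 'line2']
```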
yomitoku/layout_analyzer.py
CHANGED
@@ -27,8 +27,7 @@ class LayoutAnalyzer:
 
         if isinstance(configs, dict):
             assert (
-                "layout_parser" in configs
-                or "table_structure_recognizer" in configs
+                "layout_parser" in configs or "table_structure_recognizer" in configs
             ), "Invalid config key. Please check the config keys."
 
             if "layout_parser" in configs:
@@ -53,9 +52,7 @@ class LayoutAnalyzer:
     def __call__(self, img):
         layout_results, vis = self.layout_parser(img)
         table_boxes = [table.box for table in layout_results.tables]
-        table_results, vis = self.table_structure_recognizer(
-            img, table_boxes, vis=vis
-        )
+        table_results, vis = self.table_structure_recognizer(img, table_boxes, vis=vis)
 
         results = LayoutAnalyzerSchema(
             paragraphs=layout_results.paragraphs,
yomitoku/models/dbnet_plus.py
CHANGED
@@ -20,9 +20,7 @@ class BackboneBase(nn.Module):
             "layer4": "layer4",
         }
 
-        self.body = IntermediateLayerGetter(
-            backbone, return_layers=return_layers
-        )
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
 
     def forward(self, tensor):
         xs = self.body(tensor)
@@ -57,18 +55,10 @@ class DBNetDecoder(nn.Module):
         self.training = True
         self.input_proj = nn.ModuleDict(
             {
-                "layer1": nn.Conv2d(
-                    in_channels[0], self.d_model, 1, bias=False
-                ),
-                "layer2": nn.Conv2d(
-                    in_channels[1], self.d_model, 1, bias=False
-                ),
-                "layer3": nn.Conv2d(
-                    in_channels[2], self.d_model, 1, bias=False
-                ),
-                "layer4": nn.Conv2d(
-                    in_channels[3], self.d_model, 1, bias=False
-                ),
+                "layer1": nn.Conv2d(in_channels[0], self.d_model, 1, bias=False),
+                "layer2": nn.Conv2d(in_channels[1], self.d_model, 1, bias=False),
+                "layer3": nn.Conv2d(in_channels[2], self.d_model, 1, bias=False),
+                "layer4": nn.Conv2d(in_channels[3], self.d_model, 1, bias=False),
             }
         )
 
@@ -89,9 +79,7 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=2, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),
             ),
             "layer3": nn.Sequential(
                 nn.Conv2d(
@@ -101,9 +89,7 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=4, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
             ),
             "layer4": nn.Sequential(
                 nn.Conv2d(
@@ -113,17 +99,13 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=4, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
             ),
             }
         )
 
         self.binarize = nn.Sequential(
-            nn.Conv2d(
-                self.d_model, self.d_model // 4, 3, padding=1, bias=False
-            ),
+            nn.Conv2d(self.d_model, self.d_model // 4, 3, padding=1, bias=False),
             nn.BatchNorm2d(self.d_model // 4),
             nn.ReLU(inplace=True),
             nn.ConvTranspose2d(self.d_model // 4, self.d_model // 4, 2, 2),
@@ -166,16 +148,12 @@ class DBNetDecoder(nn.Module):
                 m.weight.data.fill_(1.0)
                 m.bias.data.fill_(1e-4)
 
-    def _init_thresh(
-        self, inner_channels, serial=False, smooth=False, bias=False
-    ):
+    def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
         in_channels = inner_channels
         if serial:
             in_channels += 1
         self.thresh = nn.Sequential(
-            nn.Conv2d(
-                in_channels, inner_channels // 4, 3, padding=1, bias=bias
-            ),
+            nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
             nn.BatchNorm2d(inner_channels // 4),
             nn.ReLU(inplace=True),
             self._init_upsample(
@@ -186,16 +164,12 @@ class DBNetDecoder(nn.Module):
             ),
             nn.BatchNorm2d(inner_channels // 4),
             nn.ReLU(inplace=True),
-            self._init_upsample(
-                inner_channels // 4, 1, smooth=smooth, bias=bias
-            ),
+            self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
             nn.Sigmoid(),
         )
         return self.thresh
 
-    def _init_upsample(
-        self, in_channels, out_channels, smooth=False, bias=False
-    ):
+    def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
         if smooth:
             inter_out_channels = out_channels
             if out_channels == 1:
yomitoku/models/layers/rtdetr_backbone.py
CHANGED
@@ -1,5 +1,4 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
-"""
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
 
 from collections import OrderedDict
 
@@ -48,9 +47,7 @@ class ConvNormLayer(nn.Module):
 class BasicBlock(nn.Module):
     expansion = 1
 
-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()
 
         self.shortcut = shortcut
@@ -89,9 +86,7 @@ class BasicBlock(nn.Module):
 class BottleNeck(nn.Module):
     expansion = 4
 
-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()
 
         if variant == "a":
@@ -114,17 +109,13 @@ class BottleNeck(nn.Module):
                         ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                         (
                             "conv",
-                            ConvNormLayer(
-                                ch_in, ch_out * self.expansion, 1, 1
-                            ),
+                            ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
                         ),
                     ]
                 )
             )
         else:
-            self.short = ConvNormLayer(
-                ch_in, ch_out * self.expansion, 1, stride
-            )
+            self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
 
         self.act = nn.Identity() if act is None else get_activation(act)
 
@@ -145,9 +136,7 @@ class BottleNeck(nn.Module):
 
 
 class Blocks(nn.Module):
-    def __init__(
-        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
-    ):
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
        super().__init__()
 
        self.blocks = nn.ModuleList()
yomitoku/models/layers/rtdetr_hybrid_encoder.py
CHANGED
@@ -1,5 +1,4 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
-"""
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
 
 import copy
 from collections import OrderedDict
@@ -241,9 +240,7 @@ class HybridEncoder(nn.Module):
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(
-                        in_channel, hidden_dim, kernel_size=1, bias=False
-                    ),
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":
@@ -279,9 +276,7 @@ class HybridEncoder(nn.Module):
 
         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(
-                    copy.deepcopy(encoder_layer), num_encoder_layers
-                )
+                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
                 for _ in range(len(use_encoder_idx))
             ]
         )
@@ -336,9 +331,7 @@ class HybridEncoder(nn.Module):
             # self.register_buffer(f'pos_embed{idx}', pos_embed)
 
     @staticmethod
-    def build_2d_sincos_position_embedding(
-        w, h, embed_dim=256, temperature=10000.0
-    ):
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
         """ """
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -376,9 +369,7 @@ class HybridEncoder(nn.Module):
                     src_flatten.device
                 )
 
-                memory: torch.Tensor = self.encoder[i](
-                    src_flatten, pos_embed=pos_embed
-                )
+                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
                 proj_feats[enc_ind] = (
                     memory.permute(0, 2, 1)
                     .reshape(-1, self.hidden_dim, h, w)
@@ -390,13 +381,9 @@ class HybridEncoder(nn.Module):
         for idx in range(len(self.in_channels) - 1, 0, -1):
             feat_heigh = inner_outs[0]
             feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
-                feat_heigh
-            )
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
             inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(
-                feat_heigh, scale_factor=2.0, mode="nearest"
-            )
+            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                 torch.concat([upsample_feat, feat_low], dim=1)
             )
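For reference, `build_2d_sincos_position_embedding`, whose signature is merely reflowed above, follows the standard RT-DETR-style 2D sin-cos construction; a self-contained sketch of that formula (not the package's exact code):

```python
import torch


def sincos_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0):
    # One frequency band per quarter of the embedding: sin/cos over w and over h.
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[:, None] * omega[None, :]
    out_h = grid_h.flatten()[:, None] * omega[None, :]
    # Result shape: (1, w*h, embed_dim), ready to add to the flattened feature map.
    return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]


print(sincos_2d(4, 3).shape)  # torch.Size([1, 12, 256])
```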
yomitoku/models/parseq.py
CHANGED
@@ -26,9 +26,7 @@ from ..postprocessor import ParseqTokenizer as Tokenizer
 from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding
 
 
-def init_weights(
-    module: nn.Module, name: str = "", exclude: Sequence[str] = ()
-):
+def init_weights(module: nn.Module, name: str = "", exclude: Sequence[str] = ()):
     """Initialize the weights using the typical initialization schemes used in SOTA models."""
     if any(map(name.startswith, exclude)):
         return
@@ -41,9 +39,7 @@ def init_weights(
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.Conv2d):
-        nn.init.kaiming_normal_(
-            module.weight, mode="fan_out", nonlinearity="relu"
-        )
+        nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
         if module.bias is not None:
             nn.init.zeros_(module.bias)
     elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
@@ -93,9 +89,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
     @torch.jit.ignore
     def no_weight_decay(self):
         param_names = {"text_embed.embedding.weight", "pos_queries"}
-        enc_param_names = {
-            "encoder." + n for n in self.encoder.no_weight_decay()
-        }
+        enc_param_names = {"encoder." + n for n in self.encoder.no_weight_decay()}
         return param_names.union(enc_param_names)
 
     def encode(self, img: torch.Tensor):
@@ -149,9 +143,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
 
         # Special case for the forward permutation. Faster than using `generate_attn_masks()`
         tgt_mask = query_mask = torch.triu(
-            torch.ones(
-                (num_steps, num_steps), dtype=torch.bool, device=self._device
-            ),
+            torch.ones((num_steps, num_steps), dtype=torch.bool, device=self._device),
             1,
         )
 
@@ -185,10 +177,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 # greedy decode. add the next token index to the target input
                 tgt_in[:, j] = p_i.squeeze().argmax(-1)
                 # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
-                if (
-                    testing
-                    and (tgt_in == tokenizer.eos_id).any(dim=-1).all()
-                ):
+                if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
                     break
 
         logits = torch.cat(logits, dim=1)
@@ -227,9 +216,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # Prior context is the previous output.
             tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
             # Mask tokens beyond the first EOS token.
-            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
-                -1
-            ) > 0
+            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(-1) > 0
             tgt_out = self.decode(
                 tgt_in,
                 memory,
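The `cumsum` one-liner above and the early-exit condition in the previous hunk are easy to sanity-check in isolation; a small sketch with a hypothetical `eos_id`:

```python
import torch

eos_id = 0  # hypothetical; the real id comes from the tokenizer

tgt_in = torch.tensor([[5, 3, 0, 7, 0],
                       [2, 0, 4, 4, 4]])

# Early-exit condition: every sequence in the batch contains at least one EOS.
print((tgt_in == eos_id).any(dim=-1).all())  # tensor(True)

# Padding mask: True at the first EOS and at every position after it.
mask = (tgt_in == eos_id).int().cumsum(-1) > 0
print(mask)
# tensor([[False, False,  True,  True,  True],
#         [False,  True,  True,  True,  True]])
```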
yomitoku/postprocessor/dbnet_postporcessor.py
CHANGED
@@ -1,13 +1,12 @@
 import cv2
+import math
 import numpy as np
 import pyclipper
 from shapely.geometry import Polygon
 
 
 class DBnetPostProcessor:
-    def __init__(
-        self, min_size, thresh, box_thresh, max_candidates, unclip_ratio
-    ):
+    def __init__(self, min_size, thresh, box_thresh, max_candidates, unclip_ratio):
         self.min_size = min_size
         self.thresh = thresh
         self.box_thresh = box_thresh
@@ -24,9 +23,7 @@ class DBnetPostProcessor:
         pred = preds["binary"][0]
         segmentation = self.binarize(pred)[0]
         height, width = image_size
-        quads, scores = self.boxes_from_bitmap(
-            pred, segmentation, width, height
-        )
+        quads, scores = self.boxes_from_bitmap(pred, segmentation, width, height)
         return quads, scores
 
     def binarize(self, pred):
@@ -65,9 +62,7 @@ class DBnetPostProcessor:
             if self.box_thresh > score:
                 continue
 
-            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(
-                -1, 1, 2
-            )
+            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
             box, sside = self.get_mini_boxes(box)
             if sside < self.min_size + 2:
                 continue
@@ -76,9 +71,7 @@ class DBnetPostProcessor:
             dest_width = dest_width.item()
             dest_height = dest_height.item()
 
-            box[:, 0] = np.clip(
-                np.round(box[:, 0] / width * dest_width), 0, dest_width
-            )
+            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
             box[:, 1] = np.clip(
                 np.round(box[:, 1] / height * dest_height), 0, dest_height
             )
@@ -88,9 +81,17 @@ class DBnetPostProcessor:
 
         return boxes, scores
 
-    def unclip(self, box, unclip_ratio=
+    def unclip(self, box, unclip_ratio=7):
+        # Small characters tend to get cut off while large characters get an overly
+        # large margin, so the expansion parameter is varied with the character size.
+        # Note: this rule is a heuristic with no theoretical grounding.
         poly = Polygon(box)
-        distance = poly.area * unclip_ratio / poly.length
+        width = box[:, 0].max() - box[:, 0].min()
+        height = box[:, 1].max() - box[:, 1].min()
+        box_dist = min(width, height)
+        ratio = unclip_ratio / math.sqrt(box_dist)
+
+        distance = poly.area * ratio / poly.length
         offset = pyclipper.PyclipperOffset()
         offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
         expanded = np.array(offset.Execute(distance))
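The substantive change in this file is `unclip`: the offset distance is now damped by the square root of the box's short side, so small text gets proportionally more margin and large text less. A standalone check of that arithmetic (hypothetical boxes):

```python
import math
import numpy as np
from shapely.geometry import Polygon


def unclip_distance(box: np.ndarray, unclip_ratio: float = 7) -> float:
    # Offset distance grows with area/perimeter, damped by sqrt of the short side.
    poly = Polygon(box)
    short_side = min(box[:, 0].max() - box[:, 0].min(),
                     box[:, 1].max() - box[:, 1].min())
    ratio = unclip_ratio / math.sqrt(short_side)
    return poly.area * ratio / poly.length


small = np.array([[0, 0], [10, 0], [10, 10], [0, 10]])
large = np.array([[0, 0], [100, 0], [100, 100], [0, 100]])
# ~5.53 for the 10px box (55% of its side) vs 17.5 for the 100px box (17.5%).
print(unclip_distance(small), unclip_distance(large))
```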
yomitoku/postprocessor/parseq_tokenizer.py
CHANGED
@@ -122,7 +122,5 @@ class ParseqTokenizer(BaseTokenizer):
             eos_idx = len(ids)  # Nothing to truncate.
         # Truncate after EOS
         ids = ids[:eos_idx]
-        probs = probs[
-            : eos_idx + 1
-        ]  # but include prob. for EOS (if it exists)
+        probs = probs[: eos_idx + 1]  # but include prob. for EOS (if it exists)
         return probs, ids
yomitoku/utils/misc.py
CHANGED
yomitoku/utils/visualizer.py
CHANGED
@@ -1,8 +1,10 @@
 import cv2
 import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-
+from PIL import Image, ImageDraw, ImageFont, features
 from ..constants import PALETTE
+from .logger import set_logger
+
+logger = set_logger(__name__, "INFO")
 
 
 def _reading_order_visualizer(img, elements, line_color, tip_size):
@@ -148,13 +150,18 @@ def rec_visualizer(
     out = img.copy()
     pillow_img = Image.fromarray(out)
     draw = ImageDraw.Draw(pillow_img)
+    has_raqm = features.check_feature(feature="raqm")
+    if not has_raqm:
+        logger.warning(
+            "libraqm is not installed. Vertical text rendering is not supported. Rendering horizontally instead."
+        )
 
     for pred, quad, direction in zip(
         outputs.contents, outputs.points, outputs.directions
     ):
         quad = np.array(quad).astype(np.int32)
         font = ImageFont.truetype(font_path, font_size)
-        if direction == "horizontal":
+        if direction == "horizontal" or not has_raqm:
             x_offset = 0
             y_offset = -font_size
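Pillow only supports vertical text layout (`direction="ttb"`) when it was built against libraqm, which is exactly what the new `features.check_feature` guard detects; a minimal sketch of the same fallback, with a hypothetical font file:

```python
from PIL import Image, ImageDraw, ImageFont, features

img = Image.new("RGB", (200, 200), "white")
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("NotoSansJP-Regular.ttf", 24)  # hypothetical font path

if features.check_feature(feature="raqm"):
    # Vertical (top-to-bottom) rendering needs the raqm layout engine.
    draw.text((100, 10), "縦書き", font=font, direction="ttb", fill="black")
else:
    # Same fallback as rec_visualizer above: render horizontally instead.
    draw.text((10, 10), "縦書き", font=font, fill="black")
```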
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: yomitoku
-Version: 0.4.1
-Summary: Yomitoku is
+Version: 0.5.1
+Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
 Keywords: Deep Learning,Japanese,OCR
@@ -10,24 +10,25 @@ Requires-Dist: huggingface-hub>=0.26.1
 Requires-Dist: lxml>=5.3.0
 Requires-Dist: omegaconf>=2.3.0
 Requires-Dist: opencv-python>=4.10.0.84
-Requires-Dist: pdf2image>=1.17.0
 Requires-Dist: pyclipper>=1.3.0.post6
 Requires-Dist: pydantic>=2.9.2
+Requires-Dist: pypdfium2>=4.30.0
 Requires-Dist: shapely>=2.0.6
 Requires-Dist: timm>=1.0.11
-Requires-Dist: torch
+Requires-Dist: torch==2.5.0
 Requires-Dist: torchvision>=0.20.0
 Description-Content-Type: text/markdown
 
-
+Japanese version | [English](README_EN.md)
+
+<img src="static/logo/horizontal.png" width="800px">
 
 
 
-
+
 [](https://kotaro-kinoshita.github.io/yomitoku-dev/)
 
-<img src="static/logo/horizontal.png" width="800px">
-
 ## 🌟 Overview
 
 YomiToku is an AI document-image analysis engine (Document AI) specialized for Japanese. It performs full-text OCR and layout analysis on images, recognizing, extracting, and converting the text and figures they contain.
@@ -60,31 +61,16 @@ For the results exported to Markdown, see the [s
 
 ## 📣 Release notes
 
-- 2024
+- November 26, 2024: YomiToku v0.5.1 (beta) released
 
 ## 💡 Installation
 
 ```
-pip install
+pip install yomitoku
 ```
 
-- pytorch
-
-### Dependencies
-
-Parsing PDF files additionally requires installing [poppler](https://poppler.freedesktop.org/).
-
-**Mac**
-
-```
-brew install poppler
-```
-
-**Linux**
-
-```
-apt install poppler-utils -y
-```
+- Install the pytorch build that matches your own CUDA version. By default, a build for CUDA 12.4 or later is installed.
+- pytorch 2.5 and later is supported, which in turn requires CUDA 11.8 or later. If you cannot meet this, use the Dockerfile in the repository.
 
 ## 🚀 Usage
 
@@ -98,8 +84,8 @@ yomitoku ${path_data} -f md -o results -v --figure
 - `-v`, `--vis`: outputs images visualizing the analysis results.
 - `-d`, `--device`: specifies the device the models run on. Inference falls back to CPU when no GPU is available. (default: cuda)
 - `--ignore_line_break`: ignores the line-break positions in the image and concatenates the text within each paragraph. (default: line breaks follow the image.)
--
--
+- `--figure_letter`: also exports the text contained in detected figures and tables to the output file.
+- `--figure`: exports detected figures and images to the output file. (html and markdown only)
 
 For other options, see the help:
 
@@ -107,11 +93,10 @@ yomitoku ${path_data} -f md -o results -v --figure
 yomitoku --help
 ```
 
-
-
-- Inference is not optimized for CPU and takes a long time, so running on a GPU is recommended.
+**NOTE**
+- Running on a GPU is recommended. Inference is not optimized for CPU, so processing takes a long time.
 - Only printed text is supported. Handwritten text can sometimes be read, but is not officially supported.
--
+- Yomitoku is optimized for document OCR, not for scene OCR (reading text printed on things other than paper, such as signs).
 - Input image resolution matters for AI-OCR accuracy; recognition degrades on low-resolution images. We recommend running inference on images whose short side is at least 720 px.
 
 ## 📝 Documentation
@@ -120,8 +105,8 @@ yomitoku --help
 
 ## LICENSE
 
-
-
+The source code in this repository and the weight files of the related models on HuggingFaceHub are licensed under CC BY-NC-SA 4.0.
+You are free to use them for non-commercial personal use and for research.
 For commercial use, a separate commercial license is available; please contact the developers.
 
-YomiToku © 2024 by
+YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
 yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
 yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
-yomitoku/document_analyzer.py,sha256=
-yomitoku/layout_analyzer.py,sha256=
+yomitoku/document_analyzer.py,sha256=HIg-nVzDhJIP-h-tn4uU86KakgHdlAhosEqK_i-SWe4,9906
+yomitoku/layout_analyzer.py,sha256=QTeRcVd8aySz8u6dg2ikET77ar3sqlukRLBwYfTyMPM,2033
 yomitoku/layout_parser.py,sha256=V2jCNHE61jNp8ytYdKwPV34V5qEK7y-7-Mq7-AkoQhU,5898
 yomitoku/ocr.py,sha256=Rcojw0aGA6yDF2RjqfK23_rMw-xm61KGd8JmTCTOOVU,2516
 yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
@@ -14,39 +14,39 @@ yomitoku/cli/main.py,sha256=MBD0S4sXgquJ8P2egkZjJcglXvCke5Uw46C28SDtr8g,6252
 yomitoku/configs/__init__.py,sha256=KBhb9S7xt22HZaIcoWSgZHfscXXj9YlimOwLH5z9CRo,454
 yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
 yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
-yomitoku/configs/cfg_text_detector_dbnet.py,sha256=
+yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
 yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
 yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
 yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
-yomitoku/data/functions.py,sha256=
+yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
 yomitoku/export/__init__.py,sha256=aANEfuovH2aevFjb2pGrBLFP-4iRzEzD9wcriCR-M7I,229
 yomitoku/export/export_csv.py,sha256=-n8eYPIzDQuiixeqpTbWaN9aQ5oFyl7XRfpv51oKPTI,1979
 yomitoku/export/export_html.py,sha256=X3H_orkS1BRlQo8Z1NzgrFwsIboDzRAx9etmqj90k2Y,4866
 yomitoku/export/export_json.py,sha256=1ChvCAHfCmMQvCfcAb1p3fSpr4elNAs3xBSIbpfn3bc,998
 yomitoku/export/export_markdown.py,sha256=mCcsXUWBLrYc1NcRSBFfBT28d6eCddAF1oHp0qdBEnE,3986
 yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
-yomitoku/models/dbnet_plus.py,sha256=
-yomitoku/models/parseq.py,sha256=
+yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
+yomitoku/models/parseq.py,sha256=7QT-q5_oWqXTDXobRk1R6Lpap_AxdC4AzkSsOgXjOwM,8611
 yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
 yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/models/layers/activate.py,sha256=HUw0q-76RNjZF-o9O3fowfJcw0t1H5o0pbyioGdqUvU,668
 yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
 yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
-yomitoku/models/layers/rtdetr_backbone.py,sha256=
-yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=
+yomitoku/models/layers/rtdetr_backbone.py,sha256=QjfLW-3qn2My3Jbg6yLORX8A-D2sph9J9u3r5nNnDLo,9386
+yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=D3dK37k7_0jPqV39-6Se8kBzF_SyZttNlbLleyNFiJU,13607
 yomitoku/models/layers/rtdetrv2_decoder.py,sha256=5bVYPLFYCy3PcjyHTPFHNLWqg3bctrk-dKVG4kayhaw,27517
 yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
-yomitoku/postprocessor/dbnet_postporcessor.py,sha256=
-yomitoku/postprocessor/parseq_tokenizer.py,sha256=
+yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
+yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
 yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=f52wfRKrxqSXy_LeidKDR9XAta_qPjto-oYEdO0XL8A,3386
 yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
 yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
-yomitoku/utils/misc.py,sha256=
-yomitoku/utils/visualizer.py,sha256=
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
+yomitoku/utils/misc.py,sha256=2Eyy7-9K_h4Mal1zGXq6OlxubfNzhS0mEYwn_xt7xl8,2497
+yomitoku/utils/visualizer.py,sha256=2pSmbhUPylzVVJ0bXtGDoNmMdArAByab4Py7Xavvs_A,5230
+yomitoku-0.5.1.dist-info/METADATA,sha256=-8bUVnN26cxYlZO0ZQH3liki_xMfhUX47ruHLl-2BGM,7817
+yomitoku-0.5.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+yomitoku-0.5.1.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.5.1.dist-info/RECORD,,
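The `sha256=` values in RECORD follow the wheel record format: an urlsafe-base64 SHA-256 digest with the padding stripped, followed by the file size in bytes. A sketch that reproduces an entry for any file in an unpacked wheel:

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    # "path,sha256=<urlsafe base64 digest, no padding>,<size in bytes>"
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


print(record_entry("yomitoku/utils/misc.py"))  # run from the wheel root
```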
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/WHEEL
File without changes
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/entry_points.txt
File without changes