openocr-python 0.0.9__py3-none-any.whl → 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openocr/__init__.py +35 -1
- openocr/configs/dataset/rec/evaluation.yaml +41 -0
- openocr/configs/dataset/rec/ltb.yaml +9 -0
- openocr/configs/dataset/rec/mjsynth.yaml +11 -0
- openocr/configs/dataset/rec/openvino.yaml +25 -0
- openocr/configs/dataset/rec/ost.yaml +17 -0
- openocr/configs/dataset/rec/synthtext.yaml +7 -0
- openocr/configs/dataset/rec/test.yaml +77 -0
- openocr/configs/dataset/rec/textocr.yaml +13 -0
- openocr/configs/dataset/rec/textocr_horizontal.yaml +13 -0
- openocr/configs/dataset/rec/union14m_b.yaml +47 -0
- openocr/configs/dataset/rec/union14m_l_filtered.yaml +35 -0
- openocr/configs/rec/cmer/cmer.yml +127 -0
- openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_base.yml +152 -0
- openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_small.yml +152 -0
- openocr/configs/rec/unirec/focalsvtr_ardecoder_unirec.yml +114 -0
- openocr/configs/rec/unirec/opendoc_pipeline.yml +105 -0
- openocr/demo_gradio.py +28 -8
- openocr/demo_opendoc.py +572 -0
- openocr/demo_unirec.py +392 -0
- openocr/opendet/losses/__init__.py +5 -7
- openocr/opendet/preprocess/crop_resize.py +2 -1
- openocr/openocr.py +685 -0
- openocr/openrec/losses/__init__.py +8 -3
- openocr/openrec/losses/cmer_loss.py +12 -0
- openocr/openrec/losses/mdiff_loss.py +11 -0
- openocr/openrec/losses/unirec_loss.py +12 -0
- openocr/openrec/metrics/__init__.py +4 -1
- openocr/openrec/metrics/rec_metric_cmer.py +328 -0
- openocr/openrec/modeling/cmer_modeling/modeling_cmer.py +643 -0
- openocr/openrec/modeling/decoders/__init__.py +1 -0
- openocr/openrec/modeling/decoders/ctc_decoder.py +1 -1
- openocr/openrec/modeling/decoders/dan_decoder.py +4 -4
- openocr/openrec/modeling/decoders/dptr_parseq_clip_b_decoder.py +1563 -1398
- openocr/openrec/modeling/decoders/mdiff_decoder.py +587 -0
- openocr/openrec/modeling/decoders/smtr_decoder.py +99 -48
- openocr/openrec/modeling/unirec_modeling/configuration_unirec.py +166 -0
- openocr/openrec/modeling/unirec_modeling/modeling_unirec.py +433 -0
- openocr/openrec/optimizer/__init__.py +4 -3
- openocr/openrec/optimizer/lr.py +49 -0
- openocr/openrec/postprocess/__init__.py +2 -0
- openocr/openrec/postprocess/abinet_postprocess.py +1 -1
- openocr/openrec/postprocess/ar_postprocess.py +1 -1
- openocr/openrec/postprocess/cmer_postprocess.py +86 -0
- openocr/openrec/postprocess/cppd_postprocess.py +1 -1
- openocr/openrec/postprocess/igtr_postprocess.py +1 -1
- openocr/openrec/postprocess/lister_postprocess.py +1 -1
- openocr/openrec/postprocess/mgp_postprocess.py +1 -1
- openocr/openrec/postprocess/nrtr_postprocess.py +2 -2
- openocr/openrec/postprocess/smtr_postprocess.py +1 -1
- openocr/openrec/postprocess/srn_postprocess.py +1 -1
- openocr/openrec/postprocess/unirec_postprocess.py +58 -0
- openocr/openrec/postprocess/visionlan_postprocess.py +1 -1
- openocr/openrec/preprocess/__init__.py +5 -0
- openocr/openrec/preprocess/ce_label_encode.py +1 -1
- openocr/openrec/preprocess/cmer_label_encode.py +1025 -0
- openocr/openrec/preprocess/ctc_label_encode.py +1 -1
- openocr/openrec/preprocess/dptr_label_encode.py +177 -157
- openocr/openrec/preprocess/igtr_label_encode.py +4 -2
- openocr/openrec/preprocess/mdiff_label_encode.py +312 -0
- openocr/openrec/preprocess/rec_aug.py +128 -2
- openocr/openrec/preprocess/resize.py +57 -0
- openocr/openrec/preprocess/unirec_label_encode.py +62 -0
- openocr/tools/data/__init__.py +78 -55
- openocr/tools/data/cmer_web_dataset.py +310 -0
- openocr/tools/data/native_size_dataset.py +753 -0
- openocr/tools/data/native_size_sampler.py +158 -0
- openocr/tools/data/ratio_dataset_tvresize.py +2 -0
- openocr/tools/data/ratio_sampler.py +2 -1
- openocr/tools/download/download_dataset.py +38 -0
- openocr/tools/download/utils.py +28 -0
- openocr/tools/download_example_images.py +236 -0
- openocr/tools/engine/trainer.py +155 -39
- openocr/tools/eval_rec_all_ch.py +2 -2
- openocr/tools/infer_det.py +20 -2
- openocr/tools/infer_doc.py +898 -0
- openocr/tools/infer_doc_onnx.py +1172 -0
- openocr/tools/infer_e2e.py +27 -10
- openocr/tools/infer_rec.py +64 -15
- openocr/tools/infer_unirec_onnx.py +730 -0
- openocr/tools/to_markdown.py +468 -0
- openocr/tools/utils/ckpt.py +17 -5
- openocr/tools/utils/opendoc_onnx_utils/utils.py +1052 -0
- openocr_python-0.1.0.dev0.dist-info/METADATA +324 -0
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/RECORD +89 -45
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/WHEEL +1 -1
- openocr_python-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- openocr_python-0.0.9.dist-info/METADATA +0 -149
- /openocr_python-0.0.9.dist-info/LICENCE → /openocr_python-0.1.0.dev0.dist-info/licenses/LICENSE +0 -0
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1052 @@
|
|
|
1
|
+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import html
|
|
16
|
+
import itertools
|
|
17
|
+
import math
|
|
18
|
+
import re
|
|
19
|
+
from collections import Counter
|
|
20
|
+
from copy import deepcopy
|
|
21
|
+
from typing import Any, Dict, List, Tuple, Union
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
from PIL import Image
|
|
25
|
+
from pydantic import BaseModel, computed_field, model_validator
|
|
26
|
+
|
|
27
|
+
def calculate_bbox_area(bbox):
    """
    Return the area of a box given as (x1, y1, x2, y2).

    The four values are converted to ``float`` before use, and the absolute
    value of the width/height product is returned.

    Args:
        bbox: Iterable of exactly four values convertible to float.

    Returns:
        float: Absolute area of the box.
    """
    left, top, right, bottom = (float(value) for value in bbox)
    width = right - left
    height = bottom - top
    return abs(width * height)
|
|
32
|
+
|
|
33
|
+
def calculate_overlap_ratio(
    bbox1: Union[np.ndarray, list, tuple],
    bbox2: Union[np.ndarray, list, tuple],
    mode="union",
) -> float:
    """
    Compute how much two axis-aligned boxes overlap, relative to a reference area.

    Args:
        bbox1 (np.ndarray, list or tuple): First box, [x_min, y_min, x_max, y_max].
        bbox2 (np.ndarray, list or tuple): Second box, [x_min, y_min, x_max, y_max].
        mode (str): Reference area used as the denominator:
            'union' - area of the union of both boxes (IoU),
            'small' - area of the smaller box,
            'large' - area of the larger box.

    Returns:
        float: Intersection area divided by the reference area, or 0.0 when
        the reference area is zero.

    Raises:
        ValueError: If ``mode`` is not 'union', 'small' or 'large'.
    """
    first = np.array(bbox1)
    second = np.array(bbox2)

    # Intersection rectangle: largest of the mins, smallest of the maxes,
    # with negative extents clamped to zero (no overlap).
    overlap_w = np.maximum(
        0, np.minimum(first[2], second[2]) - np.maximum(first[0], second[0])
    )
    overlap_h = np.maximum(
        0, np.minimum(first[3], second[3]) - np.maximum(first[1], second[1])
    )
    inter_area = overlap_w * overlap_h

    area_first = calculate_bbox_area(first)
    area_second = calculate_bbox_area(second)

    if mode == "union":
        ref_area = area_first + area_second - inter_area
    elif mode == "small":
        ref_area = np.minimum(area_first, area_second)
    elif mode == "large":
        ref_area = np.maximum(area_first, area_second)
    else:
        raise ValueError(
            f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
        )

    # Guard against division by zero for degenerate boxes.
    return 0.0 if ref_area == 0 else inter_area / ref_area
|
|
80
|
+
|
|
81
|
+
def calculate_projection_overlap_ratio(
    bbox1: List[float],
    bbox2: List[float],
    direction: str = "horizontal",
    mode="union",
) -> float:
    """
    Calculate the 1-D overlap ratio of two boxes projected onto one axis.

    Args:
        bbox1 (List[float]): First bounding box [x_min, y_min, x_max, y_max].
        bbox2 (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
        direction (str): "horizontal" projects onto the x axis (indices 0 and 2);
            any other value projects onto the y axis (indices 1 and 3).
        mode (str): Reference length used as the denominator:
            'union' - span covered by both projections,
            'small' - length of the shorter projection,
            'large' - length of the longer projection.

    Returns:
        float: Overlap length divided by the reference length. Returns 0.0 if
        the projections do not overlap or the reference length is not positive.

    Raises:
        ValueError: If ``mode`` is not 'union', 'small' or 'large'. The mode is
        validated up front so a bad value is reported even when the boxes do
        not overlap.
    """
    if mode not in ("union", "small", "large"):
        raise ValueError(
            f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
        )

    start_index, end_index = (0, 2) if direction == "horizontal" else (1, 3)

    start1, end1 = bbox1[start_index], bbox1[end_index]
    start2, end2 = bbox2[start_index], bbox2[end_index]

    overlap = min(end1, end2) - max(start1, start2)
    if overlap <= 0:
        return 0.0

    if mode == "union":
        ref_width = max(end1, end2) - min(start1, start2)
    elif mode == "small":
        ref_width = min(end1 - start1, end2 - start2)
    else:  # "large"
        ref_width = max(end1 - start1, end2 - start2)

    return overlap / ref_width if ref_width > 0 else 0.0
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def filter_overlap_boxes(
    layout_det_res: Dict[str, List[Dict]],
    overlap_threshold: float = 0.7,
) -> Dict[str, List[Dict]]:
    """
    Remove heavily overlapping boxes from a layout detection result.

    The input is deep-copied and never modified. Boxes labelled "reference"
    are removed from the result unconditionally. For every remaining pair whose
    overlap ratio (relative to the smaller box) exceeds ``overlap_threshold``,
    the box with the smaller area is dropped. A pair is left untouched when
    exactly one of the two is an "image" box.

    Args:
        layout_det_res (Dict[str, List[Dict]]): Layout detection result dict
            containing a 'boxes' list; each box needs 'label' and 'coordinate'.
        overlap_threshold (float): Overlap ratio above which the smaller box of
            a pair is dropped. Defaults to 0.7.

    Returns:
        Dict[str, List[Dict]]: Copy of the input with filtered 'boxes'.
    """
    layout_det_res_filtered = deepcopy(layout_det_res)
    boxes = [
        box for box in layout_det_res_filtered["boxes"] if box["label"] != "reference"
    ]
    dropped_indexes = set()

    for i, box_i in enumerate(boxes):
        for j in range(i + 1, len(boxes)):
            if i in dropped_indexes:
                # Box i has been dropped; none of its remaining pairs matter.
                break
            if j in dropped_indexes:
                continue
            box_j = boxes[j]
            overlap_ratio = calculate_overlap_ratio(
                box_i["coordinate"], box_j["coordinate"], "small"
            )
            if overlap_ratio > overlap_threshold:
                # Keep both when an image overlaps a box of a different label.
                if (
                    box_i["label"] == "image" or box_j["label"] == "image"
                ) and box_i["label"] != box_j["label"]:
                    continue
                # On a tie the earlier box (i) is kept.
                if calculate_bbox_area(box_i["coordinate"]) >= calculate_bbox_area(
                    box_j["coordinate"]
                ):
                    dropped_indexes.add(j)
                else:
                    dropped_indexes.add(i)

    layout_det_res_filtered["boxes"] = [
        box for idx, box in enumerate(boxes) if idx not in dropped_indexes
    ]
    return layout_det_res_filtered
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def to_pil_image(img):
    """
    Return ``img`` as a PIL Image.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        PIL.Image: ``img`` itself when it already is a PIL Image, otherwise the
        result of ``Image.fromarray(img)``.
    """
    already_pil = isinstance(img, Image.Image)
    return img if already_pil else Image.fromarray(img)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def to_np_array(img):
    """
    Return ``img`` as a numpy array.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        numpy.ndarray: ``np.array(img)`` when ``img`` is a PIL Image; any other
        input is returned unchanged.
    """
    if not isinstance(img, Image.Image):
        return img
    return np.array(img)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def calc_merged_wh(images):
    """
    Calculate the size of the image obtained by stacking ``images`` vertically.

    Each image is converted to PIL only once; only its (width, height) pair is
    kept, so the converted images are not all held in memory together.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images.

    Returns:
        Tuple[int, int]: (width, height), where width is the maximum width of
        all images and height is the sum of their heights.

    Raises:
        ValueError: If ``images`` is empty (raised by ``max``).
    """
    sizes = [to_pil_image(img).size for img in images]  # PIL size is (width, height)
    w = max(width for width, _ in sizes)
    h = sum(height for _, height in sizes)
    return w, h
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def merge_images(images, aligns="center"):
    """
    Stack images top-to-bottom, aligning each newly added image horizontally.

    Images are merged one at a time: the running result and the next image are
    pasted onto a fresh white RGB canvas that is as wide as the wider of the
    two and as tall as both together.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images to merge.
        aligns (str or List[str]): Alignment for each merge step ('center',
            'right', 'left'). A single string is applied to every step; any
            value other than 'center' or 'right' is treated as left alignment.

    Returns:
        np.ndarray: Merged image as numpy array, or None when ``images`` is
        empty. A single image is returned as a numpy array without merging.

    Raises:
        ValueError: If ``aligns`` is a sequence whose length is not
        ``len(images) - 1``.
    """
    if not images:
        return None
    if len(images) == 1:
        return to_np_array(images[0])

    step_count = len(images) - 1
    if isinstance(aligns, str):
        aligns = [aligns] * step_count
    if len(aligns) != step_count:
        raise ValueError("The length of aligns must be len(images) - 1")

    canvas = to_pil_image(images[0])
    for position in range(1, len(images)):
        addition = to_pil_image(images[position])
        align = aligns[position - 1]

        width = max(canvas.width, addition.width)
        height = canvas.height + addition.height
        combined = Image.new("RGB", (width, height), (255, 255, 255))

        if align == "center":
            top_x = (width - canvas.width) // 2
            bottom_x = (width - addition.width) // 2
        elif align == "right":
            top_x = width - canvas.width
            bottom_x = width - addition.width
        else:  # left
            top_x = bottom_x = 0

        combined.paste(canvas, (top_x, 0))
        combined.paste(addition, (bottom_x, canvas.height))
        canvas = combined
    return to_np_array(canvas)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def merge_blocks(blocks, non_merge_labels):
    """
    Group consecutive mergeable blocks and stitch each group's images together.

    Blocks whose label is in ``non_merge_labels`` are passed through untouched.
    The remaining blocks are scanned in order; a block joins the group of the
    previous mergeable block when both are "text" blocks and either

    * "cross": their horizontal projections do not overlap, the block starts to
      the right of the previous one, starts above the previous block's bottom,
      and the horizontal gap is below 30% of the wider of the two, or
    * "up-down": their horizontal projections overlap, the vertical gap is
      below 50% of the taller of the two, exactly one of the left/right edges
      is aligned (within 5 units), and the joint bounding box of the two
      overlaps some other block.

    For each group, the first block of the output receives the merged image and
    the list of alignments; the other members get ``img=None`` and
    ``merge_aligns=None``. If the stacked image would have height/width >= 3,
    the group is not merged and each member keeps its own image. Non-merge
    blocks whose index lies strictly between a group's first and last index are
    emitted right after that group.

    Args:
        blocks (List[Dict]): List of block dicts with 'label', 'box' and 'img'.
        non_merge_labels (List[str]): Block labels that should not be merged.

    Returns:
        List[Dict]: List of processed (and possibly merged) blocks. Group
        members are shallow copies; non-merge blocks are the original dicts.
    """
    blocks_to_merge = []
    non_merge_blocks = {}
    for idx, block in enumerate(blocks):
        if block["label"] in non_merge_labels:
            non_merge_blocks[idx] = block
        else:
            blocks_to_merge.append((idx, block))

    def is_aligned(a1, a2):
        # Two edge coordinates count as aligned when they differ by at most 5.
        return abs(a1 - a2) <= 5

    def get_alignment(block_bbox, prev_bbox):
        if is_aligned(block_bbox[0], prev_bbox[0]):
            return "left"
        if is_aligned(block_bbox[2], prev_bbox[2]):
            return "right"
        return "center"

    def overlapwith_other_box(block_idx, prev_idx, blocks):
        # True when the joint bounding box of the two blocks overlaps any
        # other block in the page.
        prev_bbox = blocks[prev_idx]["box"]
        block_bbox = blocks[block_idx]["box"]
        min_box = [
            min(prev_bbox[0], block_bbox[0]),
            min(prev_bbox[1], block_bbox[1]),
            max(prev_bbox[2], block_bbox[2]),
            max(prev_bbox[3], block_bbox[3]),
        ]
        return any(
            calculate_overlap_ratio(min_box, other_block["box"]) > 0
            for other_idx, other_block in enumerate(blocks)
            if other_idx not in (block_idx, prev_idx)
        )

    # Phase 1: split the mergeable blocks into runs of (indices, alignments).
    merged_groups = []
    current_indices = []
    current_aligns = []
    for i, (idx, block) in enumerate(blocks_to_merge):
        if not current_indices:
            current_indices = [idx]
            current_aligns = []
            continue

        prev_idx, prev_block = blocks_to_merge[i - 1]
        prev_bbox = prev_block["box"]
        prev_label = prev_block["label"]
        block_bbox = block["box"]
        block_label = block["label"]

        iou_h = calculate_projection_overlap_ratio(block_bbox, prev_bbox, "horizontal")
        is_cross = (
            iou_h == 0
            and block_label == "text"
            and block_label == prev_label
            and block_bbox[0] > prev_bbox[2]
            and block_bbox[1] < prev_bbox[3]
            and block_bbox[0] - prev_bbox[2]
            < max(prev_bbox[2] - prev_bbox[0], block_bbox[2] - block_bbox[0]) * 0.3
        )
        is_updown_align = (
            iou_h > 0
            and block_label in ["text"]
            and block_label == prev_label
            and block_bbox[3] >= prev_bbox[1]
            and abs(block_bbox[1] - prev_bbox[3])
            < max(prev_bbox[3] - prev_bbox[1], block_bbox[3] - block_bbox[1]) * 0.5
            and (
                is_aligned(block_bbox[0], prev_bbox[0])
                ^ is_aligned(block_bbox[2], prev_bbox[2])
            )
            and overlapwith_other_box(idx, prev_idx, blocks)
        )
        if is_cross:
            align_mode = "center"
        elif is_updown_align:
            align_mode = get_alignment(block_bbox, prev_bbox)
        else:
            align_mode = None

        if is_cross or is_updown_align:
            current_indices.append(idx)
            current_aligns.append(align_mode)
        else:
            merged_groups.append((current_indices, current_aligns))
            current_indices = [idx]
            current_aligns = []
    if current_indices:
        merged_groups.append((current_indices, current_aligns))

    # Groups are disjoint and their indices ascend, so the first index
    # identifies a group uniquely and the last index is its maximum.
    groups_by_start = {
        indices[0]: (indices, aligns) for indices, aligns in merged_groups
    }

    # Phase 2: emit blocks in index order. ``idx`` only moves forward, so no
    # block can be visited twice and no "already used" bookkeeping is needed.
    result_blocks = []
    idx = 0
    while idx < len(blocks):
        group = groups_by_start.get(idx)
        if group is None:
            if idx in non_merge_blocks:
                result_blocks.append(non_merge_blocks[idx])
            idx += 1
            continue

        group_indices, merge_aligns = group
        start, end = group_indices[0], group_indices[-1]
        imgs = [blocks[i]["img"] for i in group_indices]
        w, h = calc_merged_wh(imgs)
        aspect_ratio = h / w if w != 0 else float("inf")
        if aspect_ratio >= 3:
            # Stacked image would be too tall: keep every block separate.
            for block_idx in group_indices:
                block = blocks[block_idx].copy()
                block["merge_aligns"] = None
                result_blocks.append(block)
        else:
            merged_img = merge_images(imgs, merge_aligns)
            for j, block_idx in enumerate(group_indices):
                block = blocks[block_idx].copy()
                # Only the first member carries the merged image.
                block["img"] = merged_img if j == 0 else None
                block["merge_aligns"] = merge_aligns if j == 0 else None
                result_blocks.append(block)

        # Non-merge blocks interleaved with the group follow the group.
        for n_idx in range(start + 1, end):
            if n_idx in non_merge_blocks:
                result_blocks.append(non_merge_blocks[n_idx])
        idx = end + 1
    return result_blocks
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def paint_token(image, box, token_str):
    """
    Fill a rectangular area in the image with a white background and write the given token string.

    The input image is copied; the copy is modified and returned. The font
    scale is chosen by bisection so the text fits within 90% of the shorter
    side of the box, and the text is centred in the box.

    Args:
        image (np.ndarray): Image to paint on.
        box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
        token_str (str): Token string to write.

    Returns:
        np.ndarray: Modified copy of the image.
    """
    import cv2

    def get_optimal_font_scale(text, fontFace, square_size, fill_ratio=0.9):
        # the scale is greater than 0.2 and less than 10,
        # suitable for square_size is greater than 30 and less than 1000
        left, right = 0.2, 10
        optimal_scale = left
        limit = square_size * fill_ratio
        # bisect for the largest scale whose text still fits within the limit
        while right - left > 1e-2:
            mid = (left + right) / 2
            (w, h), _ = cv2.getTextSize(text, fontFace, mid, thickness=1)
            if w < limit and h < limit:
                optimal_scale = mid
                left = mid
            else:
                right = mid
        # Measure at the chosen scale: the last probed scale may be one that
        # did not fit, and its size would skew the centring below.
        (w, h), _ = cv2.getTextSize(text, fontFace, optimal_scale, thickness=1)
        return optimal_scale, w, h

    x1, y1, x2, y2 = [int(v) for v in box]
    box_w = x2 - x1
    box_h = y2 - y1

    img = image.copy()
    cv2.rectangle(img, (x1, y1), (x2, y2), color=(255, 255, 255), thickness=-1)

    # automatically set scale and thickness according to length of the shortest side
    font = cv2.FONT_HERSHEY_SIMPLEX
    thickness_scale_ratio = 4
    font_scale, text_w, text_h = get_optimal_font_scale(
        token_str, font, min(box_w, box_h), fill_ratio=0.9
    )
    font_thickness = max(1, math.floor(font_scale * thickness_scale_ratio))

    # calculate the origin (bottom-left corner) that centres the painted text
    text_x = x1 + (box_w - text_w) // 2
    text_y = y1 + (box_h + text_h) // 2

    cv2.putText(
        img,
        token_str,
        (text_x, text_y),
        font,
        font_scale,
        (0, 0, 0),
        font_thickness,
        lineType=cv2.LINE_AA,
    )
    return img
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def tokenize_figure_of_table(table_block_img, table_box, figures):
    """
    Replace figures in a table area with tokens, return new image and token map.

    Every figure that lies fully inside ``table_box`` is reported as dropped.
    Such a figure is additionally painted over with a "[F<n>]" token, unless
    its shorter side is below 25, in which case it is dropped without a token.
    Token numbers never contain the digits 0, 1 or 9 and are assigned through a
    shuffle with a fixed seed, so the mapping is deterministic. A private
    random generator is used, leaving the global ``random`` state untouched.

    Args:
        table_block_img (np.ndarray): Table image.
        table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
        figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').

    Returns:
        Tuple[np.ndarray, Dict[str, str], List[str]]:
            - New table image,
            - Token-to-img HTML map,
            - List of figure paths dropped.
    """
    import random

    def gen_random_map(num):
        # First ``num`` non-negative integers containing none of the digits 0, 1, 9.
        exclude_digits = {"0", "1", "9"}
        seq = []
        i = 0
        while len(seq) < num:
            if not (set(str(i)) & exclude_digits):
                seq.append(i)
            i += 1
        return seq

    # Same sequence as ``random.seed(1024)`` followed by ``random.shuffle``,
    # without reseeding the process-wide generator.
    rng = random.Random(1024)
    token_map = {}
    table_x_min, table_y_min, table_x_max, table_y_max = table_box
    drop_figures = []
    random_map = gen_random_map(len(figures))
    rng.shuffle(random_map)
    for figure_id, figure in enumerate(figures):
        figure_x_min, figure_y_min, figure_x_max, figure_y_max = figure["coordinate"]
        inside_table = (
            figure_x_min >= table_x_min
            and figure_y_min >= table_y_min
            and figure_x_max <= table_x_max
            and figure_y_max <= table_y_max
        )
        if not inside_table:
            continue
        drop_figures.append(figure["path"])
        # a figure whose shortest side is below 25 is too small to be
        # tokenized and recognized
        if min(figure_x_max - figure_x_min, figure_y_max - figure_y_min) < 25:
            continue
        # figure coordinates relative to the table crop
        draw_box = [
            figure_x_min - table_x_min,
            figure_y_min - table_y_min,
            figure_x_max - table_x_min,
            figure_y_max - table_y_min,
        ]
        token_str = "[F" + str(random_map[figure_id]) + "]"
        table_block_img = paint_token(table_block_img, draw_box, token_str)
        token_map[token_str] = f'<img src="{figure["path"]}" >'
    return table_block_img, token_map, drop_figures
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def untokenize_figure_of_table(table_res_str, figure_token_map):
    """
    Substitute "[F<digits>]" placeholders with their mapped HTML image tags.

    Args:
        table_res_str (str): Table string with tokens.
        figure_token_map (dict): Mapping from tokens to HTML img tags.

    Returns:
        str: Input string with every known token replaced; tokens missing from
        the map are left as they are.
    """

    def _lookup(found):
        # The whole match is exactly the "[F<digits>]" key used in the map.
        placeholder = found.group(0)
        return figure_token_map.get(placeholder, placeholder)

    return re.sub(r"\[F(\d+)\]", _lookup, table_res_str)
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
class TableCell(BaseModel):
    """
    A single table cell together with its position and span in the table grid.

    Attributes:
        row_span (int): Number of rows spanned.
        col_span (int): Number of columns spanned.
        start_row_offset_idx (int): Start row index.
        end_row_offset_idx (int): End row index (exclusive).
        start_col_offset_idx (int): Start column index.
        end_col_offset_idx (int): End column index (exclusive).
        text (str): Cell text content.
        column_header (bool): Whether this cell is a column header.
        row_header (bool): Whether this cell is a row header.
        row_section (bool): Whether this cell is a row section.
    """

    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """
        Fill in the 'text' key of a raw cell dict before validation.

        Non-dict input and dicts that already carry 'text' pass through
        unchanged. Otherwise the text is taken from ``data["bbox"]["token"]``;
        if that is empty, the 'token' values of ``data["text_cell_bboxes"]``
        (removed from ``data``) are joined with spaces and stripped. The dict
        is modified in place.

        Args:
            data (Any): Input data.

        Returns:
            Any: TableCell-compatible dict, or the input itself if not a dict.
        """
        if not isinstance(data, Dict) or "text" in data:
            return data

        cell_text = data["bbox"].get("token", "")
        if len(cell_text) == 0:
            fragments = data.pop("text_cell_bboxes", None)
            if fragments:
                cell_text = "".join(el["token"] + " " for el in fragments)
            cell_text = cell_text.strip()
        data["text"] = cell_text
        return data
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
class TableData(BaseModel):
    """
    Container for a table's cells and dimensions, exposing a 2D grid view.

    Attributes:
        table_cells (List[TableCell]): List of table cells.
        num_rows (int): Number of rows.
        num_cols (int): Number of columns.
    """

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field
    @property
    def grid(self) -> List[List[TableCell]]:
        """
        Build the table as a ``num_rows`` x ``num_cols`` matrix of cells.

        The matrix starts out filled with empty 1x1 cells; every cell from
        ``table_cells`` is then written into each position it spans, with its
        row/column offsets capped at the table dimensions.

        Returns:
            List[List[TableCell]]: Table as 2D grid.
        """
        row_count, col_count = self.num_rows, self.num_cols

        matrix = []
        for r in range(row_count):
            matrix.append(
                [
                    TableCell(
                        text="",
                        start_row_offset_idx=r,
                        end_row_offset_idx=r + 1,
                        start_col_offset_idx=c,
                        end_col_offset_idx=c + 1,
                    )
                    for c in range(col_count)
                ]
            )

        for cell in self.table_cells:
            row_lo = min(cell.start_row_offset_idx, row_count)
            row_hi = min(cell.end_row_offset_idx, row_count)
            col_lo = min(cell.start_col_offset_idx, col_count)
            col_hi = min(cell.end_col_offset_idx, col_count)
            for r in range(row_lo, row_hi):
                for c in range(col_lo, col_hi):
                    matrix[r][c] = cell
        return matrix
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
# OTSL structure tags.
OTSL_NL = "<nl>"
OTSL_FCEL = "<fcel>"
OTSL_ECEL = "<ecel>"
OTSL_LCEL = "<lcel>"
OTSL_UCEL = "<ucel>"
OTSL_XCEL = "<xcel>"

# Non-capturing alternation of all six tags.
NON_CAPTURING_TAG_GROUP = (
    "(?:"
    + "|".join((OTSL_FCEL, OTSL_ECEL, OTSL_NL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL))
    + ")"
)
# Matches one tag plus everything after it up to (not including) the next tag
# or the end of the string; DOTALL lets that text span newlines.
OTSL_FIND_PATTERN = re.compile(
    NON_CAPTURING_TAG_GROUP + ".*?(?=" + NON_CAPTURING_TAG_GROUP + "|$)",
    flags=re.DOTALL,
)
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def otsl_extract_tokens_and_text(s: str):
    """
    Split an OTSL string into its tags and its ordered tag/text pieces.

    Args:
        s (str): OTSL string.

    Returns:
        Tuple[List[str], List[str]]: (tokens, text_parts), where ``tokens`` is
        every OTSL tag in order of appearance and ``text_parts`` is the string
        split at the tags, with the tags themselves kept and empty or
        whitespace-only pieces removed.
    """
    tag_alternatives = r"|".join(
        (OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)
    )
    # The capturing group makes split() keep the tags in its output.
    tag_regex = r"(" + tag_alternatives + r")"

    tokens = re.findall(tag_regex, s)
    pieces = [piece for piece in re.split(tag_regex, s) if piece.strip()]
    return tokens, pieces
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def otsl_parse_texts(texts, tokens):
    """
    Parse OTSL text and tags into TableCell objects and tag structure.

    Rows are first padded with ``<ecel>`` so that every row has the same
    number of tags, and ``texts`` is rebuilt to follow that padded layout.
    Each ``<fcel>``/``<ecel>`` tag then yields one TableCell whose spans are
    derived from the ``<lcel>``/``<ucel>``/``<xcel>`` tags to its right and
    below it.

    Args:
        texts (List[str]): OTSL tags interleaved with the cell text, in order.
        tokens (List[str]): The OTSL tags alone, in order.

    Returns:
        Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
    """
    all_tags = (OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)
    split_word = OTSL_NL
    # One list of tags per row; the <nl> separators themselves are dropped.
    split_row_tokens = [
        list(y)
        for x, y in itertools.groupby(tokens, lambda z: z == split_word)
        if not x
    ]
    table_cells = []
    r_idx = 0
    c_idx = 0

    # Ensure matrix completeness
    if split_row_tokens:
        max_cols = max(len(row) for row in split_row_tokens)
        for row in split_row_tokens:
            row.extend([OTSL_ECEL] * (max_cols - len(row)))

        # Rebuild ``texts`` so it contains the padding tags as well.
        new_texts = []
        text_idx = 0
        for row in split_row_tokens:
            for token in row:
                new_texts.append(token)
                # Padding tags have no counterpart in the incoming texts, so
                # the cursor only advances when the tag is really there.
                if text_idx < len(texts) and texts[text_idx] == token:
                    text_idx += 1
                    if text_idx < len(texts) and texts[text_idx] not in all_tags:
                        new_texts.append(texts[text_idx])
                        text_idx += 1
            new_texts.append(OTSL_NL)
            if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
                text_idx += 1
        texts = new_texts

    def count_right(tokens, c_idx, r_idx, which_tokens):
        # Number of consecutive tags from (r_idx, c_idx) rightwards that are
        # in which_tokens; the caller guarantees the start position exists.
        span = 0
        c_idx_iter = c_idx
        while tokens[r_idx][c_idx_iter] in which_tokens:
            c_idx_iter += 1
            span += 1
            if c_idx_iter >= len(tokens[r_idx]):
                return span
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        # Same as count_right, but walking down the column.
        span = 0
        r_idx_iter = r_idx
        while tokens[r_idx_iter][c_idx] in which_tokens:
            r_idx_iter += 1
            span += 1
            if r_idx_iter >= len(tokens):
                return span
        return span

    for i, text in enumerate(texts):
        cell_text = ""
        if text in [OTSL_FCEL, OTSL_ECEL]:
            row_span = 1
            col_span = 1
            right_offset = 1
            # An <fcel> may arrive without any text (e.g. whitespace-only
            # content is filtered out upstream). Only consume the next entry
            # as cell text when it exists and is not itself a tag; otherwise
            # the following tag would leak into the cell and the right-hand
            # neighbour lookup below would be off by one.
            has_text = (
                text == OTSL_FCEL
                and i + 1 < len(texts)
                and texts[i + 1] not in all_tags
            )
            if has_text:
                cell_text = texts[i + 1]
                right_offset = 2

            next_right_cell = (
                texts[i + right_offset] if i + right_offset < len(texts) else ""
            )
            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens):
                if c_idx < len(split_row_tokens[r_idx + 1]):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

            if next_right_cell in [OTSL_LCEL, OTSL_XCEL]:
                col_span += count_right(
                    split_row_tokens, c_idx + 1, r_idx, [OTSL_LCEL, OTSL_XCEL]
                )
            if next_bottom_cell in [OTSL_UCEL, OTSL_XCEL]:
                row_span += count_down(
                    split_row_tokens, c_idx, r_idx + 1, [OTSL_UCEL, OTSL_XCEL]
                )

            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )
        # Every cell-like tag occupies one grid column; <nl> starts a new row.
        if text in [OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]:
            c_idx += 1
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0
    return table_cells, split_row_tokens
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
def export_to_html(table_data: TableData):
    """
    Render a TableData object as an HTML ``<table>`` string.

    Args:
        table_data (TableData): Table to render.

    Returns:
        str: The HTML markup, or an empty string when the table has no cells.
    """
    row_count = table_data.num_rows
    col_count = table_data.num_cols
    if len(table_data.table_cells) == 0:
        return ""

    grid = table_data.grid
    fragments = []
    for row_idx in range(row_count):
        fragments.append("<tr>")
        for col_idx in range(col_count):
            cell: TableCell = grid[row_idx][col_idx]
            row_span = cell.row_span
            col_span = cell.col_span
            # A spanning cell fills several grid slots; emit it only at the
            # slot where it starts.
            anchor = (cell.start_row_offset_idx, cell.start_col_offset_idx)
            if anchor != (row_idx, col_idx):
                continue
            tag = "th" if cell.column_header else "td"
            attributes = ""
            if row_span > 1:
                attributes += f' rowspan="{row_span}"'
            if col_span > 1:
                attributes += f' colspan="{col_span}"'
            escaped = html.escape(cell.text.strip())
            fragments.append(f"<{tag}{attributes}>{escaped}</{tag}>")
        fragments.append("</tr>")
    return "<table>" + "".join(fragments) + "</table>"
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
def otsl_pad_to_sqr_v2(otsl_str: str) -> str:
    """
    Make every row of an OTSL string the same width.

    A common width is chosen that never cuts off an ``<fcel>`` cell and that
    minimises the total number of cells added or removed across all rows
    (the smallest such width wins on ties). Longer rows are truncated to that
    width and shorter rows are padded with ``<ecel>``.

    Args:
        otsl_str (str): OTSL string.

    Returns:
        str: Padded OTSL string; every row is terminated by ``<nl>``.
    """
    assert isinstance(otsl_str, str)
    otsl_str = otsl_str.strip()
    if OTSL_NL not in otsl_str:
        return otsl_str + OTSL_NL

    # Per non-empty row: (its cells, index just past its last <fcel> cell).
    rows = []
    for segment in otsl_str.split(OTSL_NL):
        cells = OTSL_FIND_PATTERN.findall(segment) if segment else []
        if not cells:
            continue
        keep = max(
            (pos + 1 for pos, cell in enumerate(cells) if cell.startswith(OTSL_FCEL)),
            default=0,
        )
        rows.append((cells, keep))
    if not rows:
        return OTSL_NL

    narrowest = max(keep for _, keep in rows)
    widest = max(narrowest, max(len(cells) for cells, _ in rows))

    def edit_cost(width):
        # Cells that would have to be added or dropped to reach this width.
        return sum(abs(len(cells) - width) for cells, _ in rows)

    # min() keeps the first of equally cheap candidates, i.e. the smallest.
    target = min(range(narrowest, widest + 1), key=edit_cost)

    repaired = []
    for cells, _ in rows:
        if len(cells) > target:
            fitted = cells[:target]
        else:
            fitted = cells + [OTSL_ECEL] * (target - len(cells))
        repaired.append("".join(fitted))
    return OTSL_NL.join(repaired) + OTSL_NL
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def convert_otsl_to_html(otsl_content: str):
    """
    Convert an OTSL-v1.0 string to an HTML table.

    Only 6 tags are allowed: <fcel>, <ecel>, <nl>, <lcel>, <ucel>, <xcel>.
    The string is first padded to equal-width rows, then tokenised, parsed
    into cells and finally rendered.

    Args:
        otsl_content (str): OTSL string.

    Returns:
        str: HTML table.
    """
    padded = otsl_pad_to_sqr_v2(otsl_content)
    tags, pieces = otsl_extract_tokens_and_text(padded)
    cells, row_tags = otsl_parse_texts(pieces, tags)
    column_count = max(len(row) for row in row_tags) if row_tags else 0
    table = TableData(
        num_rows=len(row_tags),
        num_cols=column_count,
        table_cells=cells,
    )
    return export_to_html(table)
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def find_shortest_repeating_substring(s: str) -> Union[str, None]:
    """
    Return the shortest prefix whose repetition reproduces the whole string.

    Args:
        s (str): Input string.

    Returns:
        str or None: The shortest unit that, repeated at least twice, equals
        ``s``; None when no such unit exists (including for strings shorter
        than two characters).
    """
    total = len(s)
    # Only lengths that divide the string evenly can tile it; trying them in
    # increasing order means the first hit is the shortest one.
    candidates = (
        s[:size]
        for size in range(1, total // 2 + 1)
        if total % size == 0 and s[:size] * (total // size) == s
    )
    return next(candidates, None)
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
def find_repeating_suffix(
    s: str, min_len: int = 8, min_repeats: int = 5
) -> Union[Tuple[str, str, int], None]:
    """
    Detect if string ends with a repeating phrase.

    Candidate unit lengths are tried from the longest possible
    (``len(s) // min_repeats``) down to ``min_len``; the first unit that
    occurs at least ``min_repeats`` times back-to-back at the end of ``s``
    wins.

    Args:
        s (str): Input string.
        min_len (int): Minimum length of unit. Values below 1 are treated
            as 1.
        min_repeats (int): Minimum repeat count. Values below 1 are treated
            as 1.

    Returns:
        Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
        ``prefix`` is ``s`` with all ``count`` trailing copies of ``unit``
        removed.
    """
    # A zero-length unit makes s[-0:] the whole string and, for an empty
    # input, never lets the counting loop below make progress; a zero repeat
    # count would divide by zero. Clamp both to the smallest meaningful value.
    min_len = max(min_len, 1)
    min_repeats = max(min_repeats, 1)

    for unit_len in range(len(s) // min_repeats, min_len - 1, -1):
        unit = s[-unit_len:]
        if not s.endswith(unit * min_repeats):
            continue
        # Walk backwards over every trailing copy of the unit, not just the
        # required minimum, without re-slicing the string each time.
        count = 0
        end = len(s)
        while s.endswith(unit, 0, end):
            end -= unit_len
            count += 1
        return s[:end], unit, count
    return None
|
|
971
|
+
|
|
972
|
+
|
|
973
|
+
def truncate_repetitive_content(
    content: str, line_threshold: int = 10, char_threshold: int = 10, min_len: int = 10
) -> str:
    """
    Collapse content that is dominated by repetition.

    Three checks run in order; the first that fires decides the result:

    1. Single line longer than 100 characters ending in a phrase (>= 8
       characters) repeated at least 5 times, where the repeats cover more
       than half of the text: return the part before the repeats.
    2. Single line longer than ``min_len`` that consists entirely of one
       unit repeated at least ``char_threshold`` times: return that unit.
    3. At least ``line_threshold`` non-blank lines of which one line occurs
       at least ``line_threshold`` times and makes up at least 80 %:
       return that line.

    Args:
        content (str): Input text.
        line_threshold (int): Min lines for line-level truncation.
        char_threshold (int): Min repeats for char-level truncation.
        min_len (int): Min length for char-level check.

    Returns:
        str: The truncated text, or ``content`` unchanged when no check fires.
    """
    text = content.strip()
    if not text:
        return content

    if "\n" not in text:
        # Check 1: repeated phrase at the end of a long single line.
        if len(text) > 100:
            match = find_repeating_suffix(text, min_len=8, min_repeats=5)
            if match:
                prefix, unit, repeats = match
                if len(unit) * repeats > len(text) * 0.5:
                    return prefix

        # Check 2: the whole line is one unit repeated (e.g. 'ababab').
        if len(text) > min_len:
            unit = find_shortest_repeating_substring(text)
            if unit and len(text) // len(unit) >= char_threshold:
                return unit

    # Check 3: one line repeated many times.
    non_blank = [
        stripped for stripped in (raw.strip() for raw in content.split("\n")) if stripped
    ]
    line_total = len(non_blank)
    if not non_blank or line_total < line_threshold:
        return content

    top_line, hits = Counter(non_blank).most_common(1)[0]
    dominant = hits >= line_threshold and (hits / line_total) >= 0.8
    return top_line if dominant else content
|
|
1021
|
+
|
|
1022
|
+
|
|
1023
|
+
def crop_margin(img):
    """
    Crop an image to the bounding box of its darker (non-background) pixels.

    The image is reduced to one channel, its intensities are stretched to
    the 0-255 range, and every pixel whose stretched value is at most 200 is
    treated as content. The tight bounding rectangle around that content is
    cut out of the original image.

    Args:
        img (np.ndarray): 2-D grayscale image or 3-D colour image.
            NOTE(review): colour input is converted with COLOR_BGR2GRAY, so
            BGR channel order is presumably expected - confirm with callers.

    Returns:
        np.ndarray: The cropped view of ``img``; ``img`` itself when it is
        uniform or contains no content pixels.
    """
    import cv2

    if len(img.shape) == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray = img

    # Stretch in floating point *before* narrowing to uint8. Casting first
    # would truncate float images (e.g. values in [0, 1] collapse to 0/1) and
    # wrap 16-bit images modulo 256. For uint8 input the result is unchanged.
    gray = gray.astype(np.float64)
    max_val = gray.max()
    min_val = gray.min()

    # A flat image has nothing to crop (and would divide by zero below).
    if max_val == min_val:
        return img

    data = ((gray - min_val) / (max_val - min_val) * 255).astype(np.uint8)

    # Inverted threshold: pixels <= 200 become 255 (content), the rest 0.
    _, binary = cv2.threshold(data, 200, 255, cv2.THRESH_BINARY_INV)
    coords = cv2.findNonZero(binary)

    if coords is None:
        return img

    x, y, w, h = cv2.boundingRect(coords)
    return img[y : y + h, x : x + w]
|