doc-page-extractor 0.0.8__tar.gz → 0.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (40) hide show
  1. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/PKG-INFO +3 -2
  2. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/extractor.py +9 -146
  3. doc_page_extractor-0.0.10/doc_page_extractor/layout_order.py +240 -0
  4. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/ocr.py +1 -3
  5. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/overlap.py +11 -9
  6. doc_page_extractor-0.0.10/doc_page_extractor/plot.py +91 -0
  7. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor.egg-info/PKG-INFO +3 -2
  8. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor.egg-info/SOURCES.txt +1 -0
  9. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/setup.py +1 -1
  10. doc_page_extractor-0.0.8/doc_page_extractor/plot.py +0 -38
  11. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/LICENSE +0 -0
  12. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/README.md +0 -0
  13. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/__init__.py +0 -0
  14. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/clipper.py +0 -0
  15. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/downloader.py +0 -0
  16. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/layoutreader.py +0 -0
  17. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/ocr_corrector.py +0 -0
  18. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/__init__.py +0 -0
  19. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  20. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  21. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/imaug.py +0 -0
  22. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/operators.py +0 -0
  23. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  24. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  25. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  26. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  27. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  28. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  29. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/onnxocr/utils.py +0 -0
  30. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/raw_optimizer.py +0 -0
  31. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/rectangle.py +0 -0
  32. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/rotation.py +0 -0
  33. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/types.py +0 -0
  34. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor/utils.py +0 -0
  35. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  36. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor.egg-info/requires.txt +0 -0
  37. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/doc_page_extractor.egg-info/top_level.txt +0 -0
  38. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/setup.cfg +0 -0
  39. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/tests/__init__.py +0 -0
  40. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.10}/tests/test_history_bus.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -1,20 +1,18 @@
1
1
  import os
2
2
 
3
- from typing import Literal, Iterable
3
+ from typing import Literal
4
4
  from pathlib import Path
5
5
  from PIL.Image import Image
6
- from transformers import LayoutLMv3ForTokenClassification
7
6
  from doclayout_yolo import YOLOv10
8
7
 
9
- from .layoutreader import prepare_inputs, boxes2inputs, parse_logits
10
8
  from .ocr import OCR
11
9
  from .ocr_corrector import correct_fragments
12
10
  from .raw_optimizer import RawOptimizer
13
11
  from .rectangle import intersection_area, Rectangle
14
12
  from .types import ExtractedResult, OCRFragment, LayoutClass, Layout
15
13
  from .downloader import download
16
- from .overlap import regroup_lines, remove_overlap_layouts
17
- from .utils import ensure_dir
14
+ from .layout_order import LayoutOrder
15
+ from .overlap import merge_fragments_as_line, remove_overlap_layouts
18
16
 
19
17
 
20
18
  class DocExtractor:
@@ -23,15 +21,15 @@ class DocExtractor:
23
21
  model_dir_path: str,
24
22
  device: Literal["cpu", "cuda"] = "cpu",
25
23
  ocr_for_each_layouts: bool = True,
26
- order_by_layoutreader: bool = False,
27
24
  ):
28
25
  self._model_dir_path: str = model_dir_path
29
26
  self._device: Literal["cpu", "cuda"] = device
30
27
  self._ocr_for_each_layouts: bool = ocr_for_each_layouts
31
- self._order_by_layoutreader: bool = order_by_layoutreader
32
28
  self._ocr: OCR = OCR(device, model_dir_path)
33
29
  self._yolo: YOLOv10 | None = None
34
- self._layout: LayoutLMv3ForTokenClassification | None = None
30
+ self._layout_order: LayoutOrder = LayoutOrder(
31
+ model_path=os.path.join(model_dir_path, "layoutreader"),
32
+ )
35
33
 
36
34
  def extract(
37
35
  self,
@@ -42,7 +40,6 @@ class DocExtractor:
42
40
  raw_optimizer = RawOptimizer(image, adjust_points)
43
41
  fragments = list(self._ocr.search_fragments(raw_optimizer.image_np))
44
42
  raw_optimizer.receive_raw_fragments(fragments)
45
-
46
43
  layouts = self._get_layouts(raw_optimizer.image)
47
44
  layouts = self._layouts_matched_by_fragments(fragments, layouts)
48
45
  layouts = remove_overlap_layouts(layouts)
@@ -50,18 +47,12 @@ class DocExtractor:
50
47
  if self._ocr_for_each_layouts:
51
48
  self._correct_fragments_by_ocr_layouts(raw_optimizer.image, layouts)
52
49
 
53
- if self._order_by_layoutreader:
54
- width, height = raw_optimizer.image.size
55
- self._order_fragments_by_ai(width, height, layouts)
56
- else:
57
- self._order_fragments_by_y(layouts)
58
-
50
+ layouts = self._layout_order.sort(layouts, raw_optimizer.image.size)
59
51
  layouts = [layout for layout in layouts if self._should_keep_layout(layout)]
52
+
60
53
  for layout in layouts:
61
- layout.fragments = regroup_lines(layout.fragments)
62
- layout.fragments.sort(key=lambda fragment: fragment.order)
54
+ layout.fragments = merge_fragments_as_line(layout.fragments)
63
55
 
64
- layouts = self._sort_layouts(layouts)
65
56
  raw_optimizer.receive_raw_layouts(layouts)
66
57
 
67
58
  return ExtractedResult(
@@ -166,59 +157,6 @@ class DocExtractor:
166
157
  self._yolo = YOLOv10(str(yolo_model_path))
167
158
  return self._yolo
168
159
 
169
- def _order_fragments_by_y(self, layouts: list[Layout]):
170
- fragments = list(self._iter_fragments(layouts))
171
- fragments.sort(key=lambda f: f.rect.lt[1] + f.rect.rt[1])
172
- for i, fragment in enumerate(fragments):
173
- fragment.order = i
174
-
175
- def _order_fragments_by_ai(self, width: int, height: int, layouts: list[Layout]):
176
- if width == 0 or height == 0:
177
- return
178
-
179
- layout_model = self._get_layout()
180
- boxes: list[list[int]] = []
181
- steps: float = 1000.0 # max value of layoutreader
182
- x_rate: float = 1.0
183
- y_rate: float = 1.0
184
- x_offset: float = 0.0
185
- y_offset: float = 0.0
186
- if width > height:
187
- y_rate = height / width
188
- y_offset = (1.0 - y_rate) / 2.0
189
- else:
190
- x_rate = width / height
191
- x_offset = (1.0 - x_rate) / 2.0
192
-
193
- for left, top, right, bottom in self._collect_rate_boxes(
194
- fragments=self._iter_fragments(layouts),
195
- ):
196
- boxes.append([
197
- round((left * x_rate + x_offset) * steps),
198
- round((top * y_rate + y_offset) * steps),
199
- round((right * x_rate + x_offset) * steps),
200
- round((bottom * y_rate + y_offset) * steps),
201
- ])
202
- inputs = boxes2inputs(boxes)
203
- inputs = prepare_inputs(inputs, layout_model)
204
- logits = layout_model(**inputs).logits.cpu().squeeze(0)
205
- orders: list[int] = parse_logits(logits, len(boxes))
206
-
207
- for order, fragment in zip(orders, self._iter_fragments(layouts)):
208
- fragment.order = order
209
-
210
- def _get_layout(self) -> LayoutLMv3ForTokenClassification:
211
- if self._layout is None:
212
- cache_dir = ensure_dir(
213
- os.path.join(self._model_dir_path, "layoutreader"),
214
- )
215
- self._layout = LayoutLMv3ForTokenClassification.from_pretrained(
216
- pretrained_model_name_or_path="hantian/layoutreader",
217
- cache_dir=cache_dir,
218
- local_files_only=os.path.exists(os.path.join(cache_dir, "models--hantian--layoutreader")),
219
- )
220
- return self._layout
221
-
222
160
  def _should_keep_layout(self, layout: Layout) -> bool:
223
161
  if len(layout.fragments) > 0:
224
162
  return True
@@ -229,78 +167,3 @@ class DocExtractor:
229
167
  cls == LayoutClass.ISOLATE_FORMULA
230
168
  )
231
169
 
232
- def _sort_layouts(self, layouts: list[Layout]) -> list[Layout]:
233
- layouts.sort(key=lambda layout: layout.rect.lt[1] + layout.rect.rt[1])
234
-
235
- sorted_layouts: list[tuple[int, Layout]] = []
236
- empty_layouts: list[tuple[int, Layout]] = []
237
-
238
- for i, layout in enumerate(layouts):
239
- if len(layout.fragments) > 0:
240
- sorted_layouts.append((i, layout))
241
- else:
242
- empty_layouts.append((i, layout))
243
-
244
- # try to maintain the order of empty layouts and other layouts as much as possible
245
- for i, layout in empty_layouts:
246
- max_less_index: int = -1
247
- max_less_layout: Layout | None = None
248
- max_less_index_in_enumerated: int = -1
249
- for j, (k, sorted_layout) in enumerate(sorted_layouts):
250
- if k < i and k > max_less_index:
251
- max_less_index = k
252
- max_less_layout = sorted_layout
253
- max_less_index_in_enumerated = j
254
-
255
- if max_less_layout is None:
256
- sorted_layouts.insert(0, (i, layout))
257
- else:
258
- sorted_layouts.insert(max_less_index_in_enumerated + 1, (i, layout))
259
-
260
- return [layout for _, layout in sorted_layouts]
261
-
262
- def _collect_rate_boxes(self, fragments: Iterable[OCRFragment]):
263
- boxes = self._get_boxes(fragments)
264
- left = float("inf")
265
- top = float("inf")
266
- right = float("-inf")
267
- bottom = float("-inf")
268
-
269
- for _left, _top, _right, _bottom in boxes:
270
- left = min(left, _left)
271
- top = min(top, _top)
272
- right = max(right, _right)
273
- bottom = max(bottom, _bottom)
274
-
275
- width = right - left
276
- height = bottom - top
277
-
278
- if width == 0 or height == 0:
279
- return
280
-
281
- for _left, _top, _right, _bottom in boxes:
282
- yield (
283
- (_left - left) / width,
284
- (_top - top) / height,
285
- (_right - left) / width,
286
- (_bottom - top) / height,
287
- )
288
-
289
- def _get_boxes(self, fragments: Iterable[OCRFragment]):
290
- boxes: list[tuple[float, float, float, float]] = []
291
- for fragment in fragments:
292
- left: float = float("inf")
293
- top: float = float("inf")
294
- right: float = float("-inf")
295
- bottom: float = float("-inf")
296
- for x, y in fragment.rect:
297
- left = min(left, x)
298
- top = min(top, y)
299
- right = max(right, x)
300
- bottom = max(bottom, y)
301
- boxes.append((left, top, right, bottom))
302
- return boxes
303
-
304
- def _iter_fragments(self, layouts: list[Layout]):
305
- for layout in layouts:
306
- yield from layout.fragments
@@ -0,0 +1,240 @@
1
+ import os
2
+ import torch
3
+
4
+ from typing import Generator
5
+ from dataclasses import dataclass
6
+ from transformers import LayoutLMv3ForTokenClassification
7
+
8
+ from .types import Layout, LayoutClass
9
+ from .layoutreader import prepare_inputs, boxes2inputs, parse_logits
10
+ from .utils import ensure_dir
11
+
12
+
13
+ @dataclass
14
+ class _BBox:
15
+ layout_index: int
16
+ fragment_index: int
17
+ virtual: bool
18
+ order: int
19
+ value: tuple[float, float, float, float]
20
+
21
+ class LayoutOrder:
22
+ def __init__(self, model_path: str):
23
+ self._model_path: str = model_path
24
+ self._model: LayoutLMv3ForTokenClassification | None = None
25
+
26
+ def _get_model(self) -> LayoutLMv3ForTokenClassification:
27
+ if self._model is None:
28
+ model_path = ensure_dir(self._model_path)
29
+ self._model = LayoutLMv3ForTokenClassification.from_pretrained(
30
+ pretrained_model_name_or_path="hantian/layoutreader",
31
+ cache_dir=model_path,
32
+ local_files_only=os.path.exists(os.path.join(model_path, "models--hantian--layoutreader")),
33
+ )
34
+ return self._model
35
+
36
+ def sort(self, layouts: list[Layout], size: tuple[int, int]) -> list[Layout]:
37
+ width, height = size
38
+ if width == 0 or height == 0:
39
+ return layouts
40
+
41
+ bbox_list = self._order_and_get_bbox_list(
42
+ layouts=layouts,
43
+ width=width,
44
+ height=height,
45
+ )
46
+ if bbox_list is None:
47
+ return layouts
48
+
49
+ return self._sort_layouts_and_fragments(layouts, bbox_list)
50
+
51
+ def _order_and_get_bbox_list(
52
+ self,
53
+ layouts: list[Layout],
54
+ width: int,
55
+ height: int,
56
+ ) -> list[_BBox] | None:
57
+
58
+ line_height = self._line_height(layouts)
59
+ bbox_list: list[_BBox] = []
60
+
61
+ for i, layout in enumerate(layouts):
62
+ if layout.cls == LayoutClass.PLAIN_TEXT and \
63
+ len(layout.fragments) > 0:
64
+ for j, fragment in enumerate(layout.fragments):
65
+ bbox_list.append(_BBox(
66
+ layout_index=i,
67
+ fragment_index=j,
68
+ virtual=False,
69
+ order=0,
70
+ value=fragment.rect.wrapper,
71
+ ))
72
+ else:
73
+ bbox_list.extend(
74
+ self._generate_virtual_lines(
75
+ layout=layout,
76
+ layout_index=i,
77
+ line_height=line_height,
78
+ width=width,
79
+ height=height,
80
+ ),
81
+ )
82
+
83
+ if len(bbox_list) > 200:
84
+ # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L522
85
+ return None
86
+
87
+ layoutreader_size = 1000.0
88
+ x_scale = layoutreader_size / float(width)
89
+ y_scale = layoutreader_size / float(height)
90
+
91
+ for bbox in bbox_list:
92
+ x0, y0, x1, y1 = self._squeeze(bbox.value, width, height)
93
+ x0 = round(x0 * x_scale)
94
+ y0 = round(y0 * y_scale)
95
+ x1 = round(x1 * x_scale)
96
+ y1 = round(y1 * y_scale)
97
+ bbox.value = (x0, y0, x1, y1)
98
+
99
+ bbox_list.sort(key=lambda b: b.value) # 必须排序,乱序传入 layoutreader 会令它无法识别正确顺序
100
+ model = self._get_model()
101
+
102
+ with torch.no_grad():
103
+ inputs = boxes2inputs([list(bbox.value) for bbox in bbox_list])
104
+ inputs = prepare_inputs(inputs, model)
105
+ logits = model(**inputs).logits.cpu().squeeze(0)
106
+ orders = parse_logits(logits, len(bbox_list))
107
+
108
+ sorted_bbox_list = [bbox_list[i] for i in orders]
109
+ for i, bbox in enumerate(sorted_bbox_list):
110
+ bbox.order = i
111
+
112
+ return sorted_bbox_list
113
+
114
+ def _sort_layouts_and_fragments(self, layouts: list[Layout], bbox_list: list[_BBox]):
115
+ layout_bbox_list: list[list[_BBox]] = [[] for _ in range(len(layouts))]
116
+ for bbox in bbox_list:
117
+ layout_bbox_list[bbox.layout_index].append(bbox)
118
+
119
+ layouts_with_median_order: list[tuple[Layout, float]] = []
120
+ for layout_index, bbox_list in enumerate(layout_bbox_list):
121
+ layout = layouts[layout_index]
122
+ orders = [b.order for b in bbox_list] # virtual bbox 保证了 orders 不可能为空
123
+ median_order = self._median(orders)
124
+ layouts_with_median_order.append((layout, median_order))
125
+
126
+ for layout, bbox_list in zip(layouts, layout_bbox_list):
127
+ for bbox in bbox_list:
128
+ if not bbox.virtual:
129
+ layout.fragments[bbox.fragment_index].order = bbox.order
130
+ if all(not bbox.virtual for bbox in bbox_list):
131
+ layout.fragments.sort(key=lambda f: f.order)
132
+
133
+ layouts_with_median_order.sort(key=lambda x: x[1])
134
+ layouts = [layout for layout, _ in layouts_with_median_order]
135
+ next_fragment_order: int = 0
136
+
137
+ for layout in layouts:
138
+ for fragment in layout.fragments:
139
+ fragment.order = next_fragment_order
140
+ next_fragment_order += 1
141
+
142
+ return layouts
143
+
144
+ def _line_height(self, layouts: list[Layout]) -> float:
145
+ line_height: float = 0.0
146
+ count: int = 0
147
+ for layout in layouts:
148
+ for fragment in layout.fragments:
149
+ _, height = fragment.rect.size
150
+ line_height += height
151
+ count += 1
152
+ if count == 0:
153
+ return 10.0
154
+ return line_height / float(count)
155
+
156
+ def _generate_virtual_lines(
157
+ self,
158
+ layout: Layout,
159
+ layout_index: int,
160
+ line_height: float,
161
+ width: int,
162
+ height: int,
163
+ ) -> Generator[_BBox, None, None]:
164
+
165
+ # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L451-L490
166
+ x0, y0, x1, y1 = layout.rect.wrapper
167
+ layout_height = y1 - y0
168
+ layout_weight = x1 - x0
169
+ lines = int(layout_height / line_height)
170
+
171
+ if layout_height <= line_height * 2:
172
+ yield _BBox(
173
+ layout_index=layout_index,
174
+ fragment_index=0,
175
+ virtual=True,
176
+ order=0,
177
+ value=(x0, y0, x1, y1),
178
+ )
179
+ return
180
+
181
+ elif layout_height <= height * 0.25 or \
182
+ width * 0.5 <= layout_weight or \
183
+ width * 0.25 < layout_weight:
184
+ if layout_weight > width * 0.4:
185
+ lines = 3
186
+ elif layout_weight <= width * 0.25:
187
+ if layout_height / layout_weight > 1.2: # 细长的不分
188
+ yield _BBox(
189
+ layout_index=layout_index,
190
+ fragment_index=0,
191
+ virtual=True,
192
+ order=0,
193
+ value=(x0, y0, x1, y1),
194
+ )
195
+ return
196
+ else: # 不细长的还是分成两行
197
+ lines = 2
198
+
199
+ lines = max(1, lines)
200
+ line_height = (y1 - y0) / lines
201
+ current_y = y0
202
+
203
+ for i in range(lines):
204
+ yield _BBox(
205
+ layout_index=layout_index,
206
+ fragment_index=i,
207
+ virtual=True,
208
+ order=0,
209
+ value=(x0, current_y, x1, current_y + line_height),
210
+ )
211
+ current_y += line_height
212
+
213
+ def _median(self, numbers: list[int]) -> float:
214
+ sorted_numbers = sorted(numbers)
215
+ n = len(sorted_numbers)
216
+
217
+ # 判断是奇数还是偶数个元素
218
+ if n % 2 == 1:
219
+ # 奇数情况,直接取中间的数
220
+ return float(sorted_numbers[n // 2])
221
+ else:
222
+ # 偶数情况,取中间两个数的平均值
223
+ mid1 = sorted_numbers[n // 2 - 1]
224
+ mid2 = sorted_numbers[n // 2]
225
+ return float((mid1 + mid2) / 2)
226
+
227
+ def _squeeze(self, bbox: _BBox, width: int, height: int) -> _BBox:
228
+ x0, y0, x1, y1 = bbox
229
+ x0 = self._squeeze_value(x0, width)
230
+ x1 = self._squeeze_value(x1, width)
231
+ y0 = self._squeeze_value(y0, height)
232
+ y1 = self._squeeze_value(y1, height)
233
+ return x0, y0, x1, y1
234
+
235
+ def _squeeze_value(self, position: float, size: int) -> float:
236
+ if position < 0:
237
+ position = 0.0
238
+ if position > size:
239
+ position = float(size)
240
+ return position
@@ -58,7 +58,6 @@ class OCR:
58
58
  self._text_system: TextSystem | None = None
59
59
 
60
60
  def search_fragments(self, image: np.ndarray) -> Generator[OCRFragment, None, None]:
61
- index: int = 0
62
61
  for box, res in self._ocr(image):
63
62
  text, rank = res
64
63
  if is_space_text(text):
@@ -74,12 +73,11 @@ class OCR:
74
73
  continue
75
74
 
76
75
  yield OCRFragment(
77
- order=index,
76
+ order=0,
78
77
  text=text,
79
78
  rank=rank,
80
79
  rect=rect,
81
80
  )
82
- index += 1
83
81
 
84
82
  def _ocr(self, image: np.ndarray) -> Generator[tuple[list[list[float]], tuple[str, float]], None, None]:
85
83
  text_system = self._get_text_system()
@@ -60,7 +60,7 @@ class _OverlapMatrixContext:
60
60
  rate >= _INCLUDES_MIN_RATE:
61
61
  yield i
62
62
 
63
- def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
63
+ def merge_fragments_as_line(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
64
64
  fragments: list[OCRFragment] = []
65
65
  for group in _split_fragments_into_groups(origin_fragments):
66
66
  if len(group) == 1:
@@ -88,7 +88,7 @@ def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
88
88
  x2 = max(x2, x)
89
89
  y2 = max(y2, y)
90
90
 
91
- if len(proto_texts_len) == 0:
91
+ if proto_texts_len == 0:
92
92
  continue
93
93
 
94
94
  fragments.append(OCRFragment(
@@ -115,17 +115,19 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
115
115
  height = y2 - y1
116
116
  median = (y1 + y2) / 2.0
117
117
 
118
+ if height == 0:
119
+ continue
120
+
118
121
  if len(group) > 0:
119
122
  next_mean_median = (sum_median + median) / (len(group) + 1)
120
123
  next_mean_height = (sum_height + height) / (len(group) + 1)
121
124
 
122
- if next_mean_height > 0:
123
- deviation_rate = abs(median - next_mean_median) / next_mean_height
124
- if deviation_rate > max_deviation_rate:
125
- yield group
126
- group = []
127
- sum_height = 0.0
128
- sum_median = 0.0
125
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
126
+ if deviation_rate > max_deviation_rate:
127
+ yield group
128
+ group = []
129
+ sum_height = 0.0
130
+ sum_median = 0.0
129
131
 
130
132
  group.append(fragment)
131
133
  sum_height += height
@@ -0,0 +1,91 @@
1
+ from typing import Iterable
2
+ from PIL import ImageDraw
3
+ from PIL.ImageFont import load_default, FreeTypeFont
4
+ from PIL.Image import Image
5
+ from .types import Layout, LayoutClass
6
+ from .rectangle import Point
7
+
8
+ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
9
+ _Color = tuple[int, int, int]
10
+
11
+ def plot(image: Image, layouts: Iterable[Layout]) -> None:
12
+ layout_font = load_default(size=35)
13
+ fragment_font = load_default(size=25)
14
+ draw = ImageDraw.Draw(image, mode="RGBA")
15
+
16
+ def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:
17
+ nonlocal draw
18
+ x, y = position
19
+ text = str(object=number)
20
+ width = len(text) * font.size
21
+ offset = round(font.size * 0.15)
22
+
23
+ for dx, dy in _generate_delta(bold):
24
+ draw.text(
25
+ xy=(x + dx - width - offset, y + dy),
26
+ text=text,
27
+ font=font,
28
+ fill=color,
29
+ )
30
+
31
+ for layout in layouts:
32
+ draw.polygon(
33
+ xy=[p for p in layout.rect],
34
+ outline=_layout_color(layout),
35
+ width=5,
36
+ )
37
+
38
+ for layout in layouts:
39
+ for fragment in layout.fragments:
40
+ draw.polygon(
41
+ xy=[p for p in fragment.rect],
42
+ outline=_FRAGMENT_COLOR,
43
+ width=3,
44
+ )
45
+ _draw_number(
46
+ position=fragment.rect.lt,
47
+ number=fragment.order + 1,
48
+ font=fragment_font,
49
+ bold=False,
50
+ color=_FRAGMENT_COLOR,
51
+ )
52
+
53
+ for i, layout in enumerate(layouts):
54
+ _draw_number(
55
+ position=layout.rect.lt,
56
+ number=i + 1,
57
+ font=layout_font,
58
+ bold=True,
59
+ color=_layout_color(layout),
60
+ )
61
+
62
+ def _generate_delta(bold: bool):
63
+ if bold:
64
+ for dx in range(-1, 2):
65
+ for dy in range(-1, 2):
66
+ yield dx, dy
67
+ else:
68
+ yield 0, 0
69
+
70
+ def _layout_color(layout: Layout) -> _Color:
71
+ cls = layout.cls
72
+ if cls == LayoutClass.TITLE:
73
+ return (0x0A, 0x12, 0x2C) # Dark
74
+ elif cls == LayoutClass.PLAIN_TEXT:
75
+ return (0x3C, 0x67, 0x90) # Blue
76
+ elif cls == LayoutClass.ABANDON:
77
+ return (0xC0, 0xBB, 0xA9) # Gray
78
+ elif cls == LayoutClass.FIGURE:
79
+ return (0x5B, 0x91, 0x3C) # Dark Green
80
+ elif cls == LayoutClass.FIGURE_CAPTION:
81
+ return (0x77, 0xB3, 0x54) # Green
82
+ elif cls == LayoutClass.TABLE:
83
+ return (0x44, 0x17, 0x52) # Dark Purple
84
+ elif cls == LayoutClass.TABLE_CAPTION:
85
+ return (0x81, 0x75, 0xA0) # Purple
86
+ elif cls == LayoutClass.TABLE_FOOTNOTE:
87
+ return (0xEF, 0xB6, 0xC9) # Pink Purple
88
+ elif cls == LayoutClass.ISOLATE_FORMULA:
89
+ return (0xFA, 0x38, 0x27) # Red
90
+ elif cls == LayoutClass.FORMULA_CAPTION:
91
+ return (0xFF, 0x9D, 0x24) # Orange
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -5,6 +5,7 @@ doc_page_extractor/__init__.py
5
5
  doc_page_extractor/clipper.py
6
6
  doc_page_extractor/downloader.py
7
7
  doc_page_extractor/extractor.py
8
+ doc_page_extractor/layout_order.py
8
9
  doc_page_extractor/layoutreader.py
9
10
  doc_page_extractor/ocr.py
10
11
  doc_page_extractor/ocr_corrector.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="doc-page-extractor",
5
- version="0.0.8",
5
+ version="0.0.10",
6
6
  author="Tao Zeyu",
7
7
  author_email="i@taozeyu.com",
8
8
  url="https://github.com/Moskize91/doc-page-extractor",
@@ -1,38 +0,0 @@
1
- from typing import Iterable
2
- from PIL import ImageDraw
3
- from PIL.Image import Image
4
- from .types import Layout, LayoutClass
5
-
6
- _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
7
-
8
- def plot(image: Image, layouts: Iterable[Layout]):
9
- draw = ImageDraw.Draw(image, mode="RGBA")
10
- for layout in layouts:
11
- draw.polygon([p for p in layout.rect], outline=_layout_color(layout), width=5)
12
-
13
- for layout in layouts:
14
- for fragments in layout.fragments:
15
- draw.polygon([p for p in fragments.rect], outline=_FRAGMENT_COLOR, width=3)
16
-
17
- def _layout_color(layout: Layout) -> tuple[int, int, int]:
18
- cls = layout.cls
19
- if cls == LayoutClass.TITLE:
20
- return (0x0A, 0x12, 0x2C) # Dark
21
- elif cls == LayoutClass.PLAIN_TEXT:
22
- return (0x3C, 0x67, 0x90) # Blue
23
- elif cls == LayoutClass.ABANDON:
24
- return (0xC0, 0xBB, 0xA9) # Gray
25
- elif cls == LayoutClass.FIGURE:
26
- return (0x5B, 0x91, 0x3C) # Dark Green
27
- elif cls == LayoutClass.FIGURE_CAPTION:
28
- return (0x77, 0xB3, 0x54) # Green
29
- elif cls == LayoutClass.TABLE:
30
- return (0x44, 0x17, 0x52) # Dark Purple
31
- elif cls == LayoutClass.TABLE_CAPTION:
32
- return (0x81, 0x75, 0xA0) # Purple
33
- elif cls == LayoutClass.TABLE_FOOTNOTE:
34
- return (0xEF, 0xB6, 0xC9) # Pink Purple
35
- elif cls == LayoutClass.ISOLATE_FORMULA:
36
- return (0xFA, 0x38, 0x27) # Red
37
- elif cls == LayoutClass.FORMULA_CAPTION:
38
- return (0xFF, 0x9D, 0x24) # Orange