doc-page-extractor 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (26) hide show
  1. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/PKG-INFO +1 -2
  2. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/extractor.py +74 -78
  3. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/ocr.py +44 -11
  4. doc_page_extractor-0.0.4/doc_page_extractor/ocr_corrector.py +126 -0
  5. doc_page_extractor-0.0.4/doc_page_extractor/overlap.py +156 -0
  6. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/plot.py +2 -2
  7. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/rectangle.py +13 -0
  8. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor.egg-info/PKG-INFO +1 -2
  9. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor.egg-info/SOURCES.txt +5 -1
  10. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor.egg-info/requires.txt +0 -1
  11. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor.egg-info/top_level.txt +1 -0
  12. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/setup.py +1 -2
  13. doc_page_extractor-0.0.4/tests/__init__.py +0 -0
  14. doc_page_extractor-0.0.4/tests/test_history_bus.py +55 -0
  15. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/LICENSE +0 -0
  16. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/README.md +0 -0
  17. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/__init__.py +0 -0
  18. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/clipper.py +0 -0
  19. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/downloader.py +0 -0
  20. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/layoutreader.py +0 -0
  21. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/raw_optimizer.py +0 -0
  22. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/rotation.py +0 -0
  23. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/types.py +0 -0
  24. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor/utils.py +0 -0
  25. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  26. {doc_page_extractor-0.0.2 → doc_page_extractor-0.0.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -12,7 +12,6 @@ Requires-Dist: pillow<11.0,>=10.3
12
12
  Requires-Dist: shapely<3.0,>=2.0.0
13
13
  Requires-Dist: transformers<5.0,>=4.48.0
14
14
  Requires-Dist: doclayout_yolo>=0.0.3
15
- Requires-Dist: paddlepaddle<3.0,>=2.6.0
16
15
  Requires-Dist: paddleocr==2.9.0
17
16
  Dynamic: author
18
17
  Dynamic: author-email
@@ -1,9 +1,7 @@
1
1
  import os
2
- import sys
3
2
  import torch
4
- import numpy as np
5
3
 
6
- from typing import Literal, Generator
4
+ from typing import Literal, Iterable
7
5
  from pathlib import Path
8
6
  from PIL.Image import Image
9
7
  from transformers import LayoutLMv3ForTokenClassification
@@ -11,11 +9,13 @@ from doclayout_yolo import YOLOv10
11
9
 
12
10
  from .layoutreader import prepare_inputs, boxes2inputs, parse_logits
13
11
  from .ocr import OCR, PaddleLang
12
+ from .ocr_corrector import correct_fragments
14
13
  from .raw_optimizer import RawOptimizer
15
14
  from .rectangle import intersection_area, Rectangle
16
15
  from .types import ExtractedResult, OCRFragment, LayoutClass, Layout
17
16
  from .downloader import download
18
- from .utils import ensure_dir, is_space_text
17
+ from .overlap import regroup_lines, remove_overlap_layouts
18
+ from .utils import ensure_dir
19
19
 
20
20
 
21
21
  class DocExtractor:
@@ -23,10 +23,12 @@ class DocExtractor:
23
23
  self,
24
24
  model_dir_path: str,
25
25
  device: Literal["cpu", "cuda"] = "cpu",
26
- order_by_layoutreader: bool = True,
26
+ ocr_for_each_layouts: bool = True,
27
+ order_by_layoutreader: bool = False,
27
28
  ):
28
29
  self._model_dir_path: str = model_dir_path
29
30
  self._device: Literal["cpu", "cuda"] = device
31
+ self._ocr_for_each_layouts: bool = ocr_for_each_layouts
30
32
  self._order_by_layoutreader: bool = order_by_layoutreader
31
33
  self._ocr: OCR = OCR(device, os.path.join(model_dir_path, "paddle"))
32
34
  self._yolo: YOLOv10 | None = None
@@ -44,15 +46,28 @@ class DocExtractor:
44
46
  ) -> ExtractedResult:
45
47
 
46
48
  raw_optimizer = RawOptimizer(image, adjust_points)
47
- fragments = list(self._search_orc_fragments(raw_optimizer.image_np, lang))
49
+ fragments = list(self._ocr.search_fragments(raw_optimizer.image_np, lang))
48
50
  raw_optimizer.receive_raw_fragments(fragments)
49
51
 
52
+ layouts = self._get_layouts(raw_optimizer.image)
53
+ layouts = self._layouts_matched_by_fragments(fragments, layouts)
54
+ layouts = remove_overlap_layouts(layouts)
55
+
56
+ if self._ocr_for_each_layouts:
57
+ self._correct_fragments_by_ocr_layouts(raw_optimizer.image, layouts, lang)
58
+
50
59
  if self._order_by_layoutreader:
51
60
  width, height = raw_optimizer.image.size
52
- self._order_fragments(width, height, fragments)
61
+ self._order_fragments_by_ai(width, height, layouts)
62
+ else:
63
+ self._order_fragments_by_y(layouts)
53
64
 
54
- layouts = self._get_layouts(raw_optimizer.image)
55
- layouts = self._layouts_matched_by_fragments(fragments, layouts)
65
+ layouts = [layout for layout in layouts if self._should_keep_layout(layout)]
66
+ for layout in layouts:
67
+ layout.fragments = regroup_lines(layout.fragments)
68
+ layout.fragments.sort(key=lambda fragment: fragment.order)
69
+
70
+ layouts = self._sort_layouts(layouts)
56
71
  raw_optimizer.receive_raw_layouts(layouts)
57
72
 
58
73
  return ExtractedResult(
@@ -62,57 +77,6 @@ class DocExtractor:
62
77
  adjusted_image=raw_optimizer.adjusted_image,
63
78
  )
64
79
 
65
- def _search_orc_fragments(self, image: np.ndarray, lang: PaddleLang) -> Generator[OCRFragment, None, None]:
66
- index: int = 0
67
- for item in self._ocr.do(lang, image):
68
- for line in item:
69
- react: list[list[float]] = line[0]
70
- text, rank = line[1]
71
- if is_space_text(text):
72
- continue
73
- yield OCRFragment(
74
- order=index,
75
- text=text,
76
- rank=rank,
77
- rect=Rectangle(
78
- lt=(react[0][0], react[0][1]),
79
- rt=(react[1][0], react[1][1]),
80
- rb=(react[2][0], react[2][1]),
81
- lb=(react[3][0], react[3][1]),
82
- ),
83
- )
84
- index += 1
85
-
86
- def _order_fragments(self, width: int, height: int, fragments: list[OCRFragment]):
87
- layout_model = self._get_layout()
88
- boxes: list[list[int]] = []
89
- steps: float = 1000.0 # max value of layoutreader
90
- x_rate: float = 1.0
91
- y_rate: float = 1.0
92
- x_offset: float = 0.0
93
- y_offset: float = 0.0
94
- if width > height:
95
- y_rate = height / width
96
- y_offset = (1.0 - y_rate) / 2.0
97
- else:
98
- x_rate = width / height
99
- x_offset = (1.0 - x_rate) / 2.0
100
-
101
- for left, top, right, bottom in self._collect_rate_boxes(fragments):
102
- boxes.append([
103
- round((left * x_rate + x_offset) * steps),
104
- round((top * y_rate + y_offset) * steps),
105
- round((right * x_rate + x_offset) * steps),
106
- round((bottom * y_rate + y_offset) * steps),
107
- ])
108
- inputs = boxes2inputs(boxes)
109
- inputs = prepare_inputs(inputs, layout_model)
110
- logits = layout_model(**inputs).logits.cpu().squeeze(0)
111
- orders: list[int] = parse_logits(logits, len(boxes))
112
-
113
- for order, fragment in zip(orders, fragments):
114
- fragment.order = order
115
-
116
80
  def _get_layouts(self, source: Image) -> list[Layout]:
117
81
  # about source parameter to see:
118
82
  # https://github.com/opendatalab/DocLayout-YOLO/blob/7c4be36bc61f11b67cf4a44ee47f3c41e9800a91/doclayout_yolo/data/build.py#L157-L175
@@ -152,14 +116,11 @@ class DocExtractor:
152
116
  if layout is not None:
153
117
  layout.fragments.append(fragment)
154
118
  break
119
+ return layouts
155
120
 
121
+ def _correct_fragments_by_ocr_layouts(self, source: Image, layouts: list[Layout], lang: PaddleLang):
156
122
  for layout in layouts:
157
- layout.fragments.sort(key=lambda x: x.order)
158
-
159
- layouts = [layout for layout in layouts if self._should_keep_layout(layout)]
160
- layouts = self._sort_layouts(layouts)
161
-
162
- return layouts
123
+ correct_fragments(self._ocr, source, layout, lang)
163
124
 
164
125
  def _split_layouts_by_group(self, layouts: list[Layout]):
165
126
  texts_layouts: list[Layout] = []
@@ -197,13 +158,6 @@ class DocExtractor:
197
158
 
198
159
  return min_layout
199
160
 
200
- def _layout_order(self, layout: Layout) -> int:
201
- fragments = layout.fragments
202
- if len(fragments) == 0:
203
- return sys.maxsize
204
- else:
205
- return fragments[0].order
206
-
207
161
  def _get_yolo(self) -> YOLOv10:
208
162
  if self._yolo is None:
209
163
  yolo_model_url = "https://huggingface.co/opendatalab/PDF-Extract-Kit-1.0/resolve/main/models/Layout/YOLO/doclayout_yolo_ft.pt"
@@ -214,6 +168,44 @@ class DocExtractor:
214
168
  self._yolo = YOLOv10(str(yolo_model_path))
215
169
  return self._yolo
216
170
 
171
+ def _order_fragments_by_y(self, layouts: list[Layout]):
172
+ fragments = list(self._iter_fragments(layouts))
173
+ fragments.sort(key=lambda f: f.rect.lt[1] + f.rect.rt[1])
174
+ for i, fragment in enumerate(fragments):
175
+ fragment.order = i
176
+
177
def _order_fragments_by_ai(self, width: int, height: int, layouts: list[Layout]):
    """Assign ``fragment.order`` for every fragment using the layoutreader model.

    Each box produced by ``_collect_rate_boxes`` is mapped onto the
    0..1000 integer grid the model works with, the model is run once over all
    boxes, and the parsed orders are written back to the fragments in the
    same iteration order that produced the boxes.
    """
    layout_model = self._get_layout()
    steps: float = 1000.0  # max value of layoutreader

    # The longer page side keeps the full range; the shorter side is shrunk
    # by the aspect ratio and centred so that proportions are preserved.
    if width > height:
        x_rate, x_offset = 1.0, 0.0
        y_rate = height / width
        y_offset = (1.0 - y_rate) / 2.0
    else:
        y_rate, y_offset = 1.0, 0.0
        x_rate = width / height
        x_offset = (1.0 - x_rate) / 2.0

    rate_boxes = self._collect_rate_boxes(
        fragments=self._iter_fragments(layouts),
    )
    boxes: list[list[int]] = [
        [
            round((left * x_rate + x_offset) * steps),
            round((top * y_rate + y_offset) * steps),
            round((right * x_rate + x_offset) * steps),
            round((bottom * y_rate + y_offset) * steps),
        ]
        for left, top, right, bottom in rate_boxes
    ]
    model_inputs = prepare_inputs(boxes2inputs(boxes), layout_model)
    logits = layout_model(**model_inputs).logits.cpu().squeeze(0)
    orders: list[int] = parse_logits(logits, len(boxes))

    for order, fragment in zip(orders, self._iter_fragments(layouts)):
        fragment.order = order
208
+
217
209
  def _get_layout(self) -> LayoutLMv3ForTokenClassification:
218
210
  if self._layout is None:
219
211
  cache_dir = ensure_dir(
@@ -237,6 +229,8 @@ class DocExtractor:
237
229
  )
238
230
 
239
231
  def _sort_layouts(self, layouts: list[Layout]) -> list[Layout]:
232
+ layouts.sort(key=lambda layout: layout.rect.lt[1] + layout.rect.rt[1])
233
+
240
234
  sorted_layouts: list[tuple[int, Layout]] = []
241
235
  empty_layouts: list[tuple[int, Layout]] = []
242
236
 
@@ -246,11 +240,9 @@ class DocExtractor:
246
240
  else:
247
241
  empty_layouts.append((i, layout))
248
242
 
249
- sorted_layouts.sort(key=lambda x: x[1].fragments[0].order)
250
-
251
243
  # try to maintain the order of empty layouts and other layouts as much as possible
252
244
  for i, layout in empty_layouts:
253
- max_less_index: int = len(layouts)
245
+ max_less_index: int = -1
254
246
  max_less_layout: Layout | None = None
255
247
  max_less_index_in_enumerated: int = -1
256
248
  for j, (k, sorted_layout) in enumerate(sorted_layouts):
@@ -266,7 +258,7 @@ class DocExtractor:
266
258
 
267
259
  return [layout for _, layout in sorted_layouts]
268
260
 
269
- def _collect_rate_boxes(self, fragments: list[OCRFragment]):
261
+ def _collect_rate_boxes(self, fragments: Iterable[OCRFragment]):
270
262
  boxes = self._get_boxes(fragments)
271
263
  left = float("inf")
272
264
  top = float("inf")
@@ -290,7 +282,7 @@ class DocExtractor:
290
282
  (_bottom - top) / height,
291
283
  )
292
284
 
293
- def _get_boxes(self, fragments: list[OCRFragment]):
285
+ def _get_boxes(self, fragments: Iterable[OCRFragment]):
294
286
  boxes: list[tuple[float, float, float, float]] = []
295
287
  for fragment in fragments:
296
288
  left: float = float("inf")
@@ -304,3 +296,7 @@ class DocExtractor:
304
296
  bottom = max(bottom, y)
305
297
  boxes.append((left, top, right, bottom))
306
298
  return boxes
299
+
300
+ def _iter_fragments(self, layouts: list[Layout]):
301
+ for layout in layouts:
302
+ yield from layout.fragments
@@ -2,9 +2,11 @@ import os
2
2
  import numpy as np
3
3
  import cv2
4
4
 
5
- from typing import Literal, Any
5
+ from typing import Any, Literal, Generator
6
6
  from paddleocr import PaddleOCR
7
- from .utils import ensure_dir
7
+ from .types import OCRFragment
8
+ from .rectangle import Rectangle
9
+ from .utils import is_space_text, ensure_dir
8
10
 
9
11
 
10
12
  # https://github.com/PaddlePaddle/PaddleOCR/blob/2c0c4beb0606819735a16083cdebf652939c781a/paddleocr.py#L108-L157
@@ -16,16 +18,33 @@ class OCR:
16
18
  self,
17
19
  device: Literal["cpu", "cuda"],
18
20
  model_dir_path: str,
19
- bin: bool = True,
20
- inv: bool = False,
21
21
  ):
22
22
  self._device: Literal["cpu", "cuda"] = device
23
23
  self._model_dir_path: str = model_dir_path
24
24
  self._ocr_and_lan: tuple[PaddleOCR, PaddleLang] | None = None
25
- self._bin: bool = bin
26
- self._inv: bool = inv
27
25
 
28
- def do(self, lang: PaddleLang, image: np.ndarray) -> list[Any]:
26
def search_fragments(self, image: np.ndarray, lang: PaddleLang) -> Generator[OCRFragment, None, None]:
    """Run OCR on ``image`` and yield one ``OCRFragment`` per recognised line.

    Lines whose text is rejected by ``is_space_text`` are skipped. The
    ``order`` of the yielded fragments counts only the lines that are kept,
    starting at 0.

    Args:
        image: the picture to recognise, as an array.
        lang: the language passed on to the OCR engine.
    """
    index: int = 0
    for item in self._handle(lang, image):
        # PaddleOCR reports an image in which no text was detected as
        # ``None`` rather than an empty list, so iterating it would raise
        # TypeError. NOTE(review): assumes ``_handle`` forwards the raw
        # PaddleOCR result unchanged.
        if item is None:
            continue
        for line in item:
            # line[0] holds the four corner points, line[1] is (text, score)
            corners: list[list[float]] = line[0]
            text, rank = line[1]
            if is_space_text(text):
                continue
            yield OCRFragment(
                order=index,
                text=text,
                rank=rank,
                rect=Rectangle(
                    lt=(corners[0][0], corners[0][1]),
                    rt=(corners[1][0], corners[1][1]),
                    rb=(corners[2][0], corners[2][1]),
                    lb=(corners[3][0], corners[3][1]),
                ),
            )
            index += 1
46
+
47
+ def _handle(self, lang: PaddleLang, image: np.ndarray) -> list[Any]:
29
48
  ocr = self._get_ocr(lang)
30
49
  image = self._preprocess_image(image)
31
50
  # about img parameter to see
@@ -59,10 +78,24 @@ class OCR:
59
78
 
60
79
  def _preprocess_image(self, image: np.ndarray) -> np.ndarray:
61
80
  image = self._alpha_to_color(image, (255, 255, 255))
62
- if self._inv:
63
- image = cv2.bitwise_not(image)
64
- if self._bin:
65
- image = self._binarize_img(image)
81
+ # image = cv2.bitwise_not(image) # inv
82
+ # image = self._binarize_img(image) # bin
83
+ image = cv2.normalize(
84
+ src=image,
85
+ dst=np.zeros((image.shape[0], image.shape[1])),
86
+ alpha=0,
87
+ beta=255,
88
+ norm_type=cv2.NORM_MINMAX,
89
+ )
90
+ image = cv2.fastNlMeansDenoisingColored(
91
+ src=image,
92
+ dst=None,
93
+ h=10,
94
+ hColor=10,
95
+ templateWindowSize=7,
96
+ searchWindowSize=15,
97
+ )
98
+ # image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # image to gray
66
99
  return image
67
100
 
68
101
  def _alpha_to_color(self, image: np.ndarray, alpha_color: tuple[float, float, float]) -> np.ndarray:
@@ -0,0 +1,126 @@
1
+ import numpy as np
2
+
3
+ from typing import Iterable
4
+ from shapely.geometry import Polygon
5
+ from PIL.Image import new, Image, Resampling
6
+ from .types import Layout, OCRFragment
7
+ from .ocr import OCR, PaddleLang
8
+ from .overlap import overlap_rate
9
+ from .rectangle import Point, Rectangle
10
+
11
+
12
+ _MIN_RATE = 0.5
13
+
14
def correct_fragments(ocr: OCR, source: Image, layout: Layout, lang: PaddleLang):
    """Re-run OCR on the area of ``layout`` and merge the result into it.

    The bounding box of ``layout.rect`` is cropped out of ``source``, padded
    and scaled by ``_adjust_image`` and recognised again. The new fragments
    are mapped back to page coordinates and paired with the existing ones by
    ``_match_fragments``; for every pair the fragment with the higher
    ``rank`` wins (the re-recognised one on a tie). Unpaired fragments from
    either side are appended after the paired ones. ``layout.fragments`` is
    replaced in place.
    """
    crop_box = tuple(round(value) for value in layout.rect.wrapper)
    adjusted, dx, dy, scale = _adjust_image(source.crop(crop_box))
    ocr_fragments = list(ocr.search_fragments(np.array(adjusted), lang))

    # bring the freshly recognised boxes back into page coordinates
    for recognised in ocr_fragments:
        _apply_fragment(recognised.rect, layout, dx, dy, scale)

    pairs, leftovers = _match_fragments(
        zone_rect=layout.rect,
        fragments1=layout.fragments,
        fragments2=ocr_fragments,
    )
    corrected_fragments: list[OCRFragment] = [
        existing if existing.rank > recognised.rank else recognised
        for existing, recognised in pairs
    ]
    corrected_fragments.extend(leftovers)
    layout.fragments = corrected_fragments
41
+
42
def _adjust_image(image: Image) -> tuple[Image, int, int, float]:
    """Centre ``image`` on a white canvas, enlarging small images first.

    Returns:
        ``(canvas, dx, dy, scale)`` where ``canvas`` is the new RGB image,
        ``(dx, dy)`` is the position at which the (possibly resized) input
        was pasted and ``scale`` is the resize factor that was applied.
    """
    # after testing, adding white borders to images can reduce
    # the possibility of some text not being recognized
    border_size: int = 50
    adjusted_size: int = 1024 - 2 * border_size
    width, height = image.size

    # the core area is at least adjusted_size on each side and never
    # smaller than the input itself
    core_width = float(max(adjusted_size, width))
    core_height = float(max(adjusted_size, height))

    # uniform scale that lets the image fit inside the core area
    scale = min(core_width / width, core_height / height)

    # offsets that centre the scaled image, shifted by the border
    dx = round((core_width - width * scale) / 2.0) + border_size
    dy = round((core_height - height * scale) / 2.0) + border_size

    if scale != 1.0:
        resized_size = (round(width * scale), round(height * scale))
        image = image.resize(resized_size, Resampling.BICUBIC)

    canvas_size = (
        round(core_width) + 2 * border_size,
        round(core_height) + 2 * border_size,
    )
    canvas = new("RGB", canvas_size, (255, 255, 255))
    canvas.paste(image, (dx, dy))

    return canvas, dx, dy, scale
73
+
74
def _apply_fragment(rect: Rectangle, layout: Layout, dx: int, dy: int, scale: float):
    """Convert the four corners of ``rect`` in place with ``_apply_point``."""
    for corner in ("lt", "lb", "rb", "rt"):
        converted = _apply_point(getattr(rect, corner), layout, dx, dy, scale)
        setattr(rect, corner, converted)
79
+
80
+ def _apply_point(point: Point, layout: Layout, dx: int, dy: int, scale: float) -> Point:
81
+ x, y = point
82
+ x = (x - dx) / scale + layout.rect.lt[0]
83
+ y = (y - dy) / scale + layout.rect.lt[1]
84
+ return x, y
85
+
86
def _match_fragments(
    zone_rect: Rectangle,
    fragments1: Iterable[OCRFragment],
    fragments2: Iterable[OCRFragment],
) -> tuple[list[tuple[OCRFragment, OCRFragment]], list[OCRFragment]]:
    """Pair each fragment of ``fragments1`` with its best overlap in ``fragments2``.

    Every fragment of ``fragments1`` is first clipped to ``zone_rect``;
    fragments that do not intersect the zone at all are dropped. The clipped
    shape is then compared with every still-unpaired fragment of
    ``fragments2`` using ``overlap_rate``, and the candidate with the highest
    rate of at least ``_MIN_RATE`` is paired with it and removed from the
    candidates.

    Returns:
        ``(matched, not_matched)``: the pairs ``(fragment1, fragment2)`` and
        the fragments left without a partner — first those of
        ``fragments1``, then the remaining ones of ``fragments2``.
    """
    zone_polygon = Polygon(zone_rect)
    # Build each candidate polygon once instead of once per fragment1.
    candidates: list[tuple[OCRFragment, Polygon]] = [
        (fragment2, Polygon(fragment2.rect)) for fragment2 in fragments2
    ]
    matched_fragments: list[tuple[OCRFragment, OCRFragment]] = []
    not_matched_fragments: list[OCRFragment] = []

    for fragment1 in fragments1:
        polygon1 = zone_polygon.intersection(Polygon(fragment1.rect))
        if polygon1.is_empty:
            continue

        best_j = -1
        best_rate = 0.0
        for j, (_, polygon2) in enumerate(candidates):
            rate = overlap_rate(polygon1, polygon2)
            if rate >= _MIN_RATE and rate > best_rate:
                best_j = j
                best_rate = rate

        if best_j == -1:
            not_matched_fragments.append(fragment1)
        else:
            # a candidate can be paired only once
            fragment2, _ = candidates.pop(best_j)
            matched_fragments.append((fragment1, fragment2))

    not_matched_fragments.extend(fragment2 for fragment2, _ in candidates)
    return matched_fragments, not_matched_fragments
@@ -0,0 +1,156 @@
1
+ from typing import Generator
2
+ from shapely.geometry import Polygon
3
+ from .types import Layout, OCRFragment
4
+ from .rectangle import Rectangle
5
+
6
+
7
+ _INCLUDES_MIN_RATE = 0.99
8
+
9
def remove_overlap_layouts(layouts: list[Layout]) -> list[Layout]:
    """Drop layouts that are redundant because of containment.

    A layout is only considered when every other remaining layout it
    overlaps has a rate of at least ``_INCLUDES_MIN_RATE`` against it (rates
    come from ``_OverlapMatrixContext``) and it overlaps at least one of
    them. Such a layout is removed itself when it has no fragments;
    otherwise the layouts it includes are removed and their fragments are
    appended to it.

    Returns:
        A new list with the surviving layouts in their original order.
    """
    ctx = _OverlapMatrixContext(layouts)
    removed = ctx.removed_indexes

    # the reason for repeating this multiple times is that deleting a layout
    # may cause its parent layout to change from an originally non-deletable
    # state to a deletable state.
    while True:
        count_before = len(removed)

        for index, layout in enumerate(layouts):
            if index in removed:
                continue
            rates = list(ctx.rates_with_other(index))
            overlaps_partially = any(0.0 < rate < _INCLUDES_MIN_RATE for rate in rates)
            touches_nothing = all(rate == 0.0 for rate in rates)
            if overlaps_partially or touches_nothing:
                continue

            if len(layout.fragments) == 0:
                removed.add(index)
                continue
            for included in ctx.search_includes_indexes(index):
                removed.add(included)
                layout.fragments.extend(layouts[included].fragments)

        if len(removed) == count_before:
            break

    survivors: list[Layout] = []
    for index, layout in enumerate(layouts):
        if index not in removed:
            survivors.append(layout)
    return survivors
36
+
37
class _OverlapMatrixContext:
    """Pairwise overlap rates between layouts plus the set of removed indexes.

    ``rate_matrix[i][j]`` is ``overlap_rate(polygon_i, polygon_j)``; the
    diagonal is fixed at 1.0. ``removed_indexes`` starts empty and is filled
    by the caller; both query methods ignore indexes contained in it.
    """

    def __init__(self, layouts: list[Layout]):
        polygons: list[Polygon] = [Polygon(layout.rect) for layout in layouts]
        length: int = len(layouts)
        self.removed_indexes: set[int] = set()
        self.rate_matrix: list[list[float]] = [
            [
                1.0 if i == j else overlap_rate(polygons[i], polygons[j])
                for j in range(length)
            ]
            for i in range(length)
        ]

    def rates_with_other(self, index: int):
        """Yield the rates of ``index`` against every other remaining layout."""
        for other, rate in enumerate(self.rate_matrix[index]):
            if other == index or other in self.removed_indexes:
                continue
            yield rate

    def search_includes_indexes(self, index: int):
        """Yield remaining indexes whose rate against ``index`` reaches the include threshold."""
        for other, rate in enumerate(self.rate_matrix[index]):
            if other == index or other in self.removed_indexes:
                continue
            if rate >= _INCLUDES_MIN_RATE:
                yield other
62
+
63
def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
    """Merge fragments that sit on the same text line into one fragment.

    Lines are determined by ``_split_fragments_into_groups``. A line with a
    single fragment is passed through untouched. Otherwise its fragments are
    ordered from left to right and replaced by one fragment whose text is
    the space-joined texts, whose ``order`` is the smallest order of the
    line, whose ``rank`` is the mean rank weighted by text length and whose
    rectangle is the axis-aligned box around all corner points.
    """
    merged: list[OCRFragment] = []

    for line in _split_fragments_into_groups(origin_fragments):
        if len(line) == 1:
            merged.append(line[0])
            continue

        members = sorted(line, key=lambda item: item.rect.lt[0] + item.rect.lb[0])
        texts: list[str] = [member.text for member in members]
        proto_texts_len: int = sum(len(text) for text in texts)

        # accumulated one by one so the floating point result is unchanged
        text_rate_weights: float = 0.0
        for member in members:
            text_rate_weights += member.rank * len(member.text)

        corners = [corner for member in members for corner in member.rect]
        xs = [x for x, _ in corners]
        ys = [y for _, y in corners]
        x1 = min(xs, default=float("inf"))
        y1 = min(ys, default=float("inf"))
        x2 = max(xs, default=float("-inf"))
        y2 = max(ys, default=float("-inf"))

        merged.append(OCRFragment(
            order=min(member.order for member in members),
            text=" ".join(texts),
            rank=text_rate_weights / proto_texts_len,
            rect=Rectangle(
                lt=(x1, y1),
                rt=(x2, y1),
                lb=(x1, y2),
                rb=(x2, y2),
            ),
        ))
    return merged
103
+
104
+ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list[OCRFragment], None, None]:
105
+ group: list[OCRFragment] = []
106
+ sum_height: float = 0.0
107
+ sum_median: float = 0.0
108
+ max_deviation_rate = 0.35
109
+
110
+ for fragment in sorted(fragments, key=lambda x: x.rect.lt[1] + x.rect.rt[1]):
111
+ _, y1, _, y2 = fragment.rect.wrapper
112
+ height = y2 - y1
113
+ median = (y1 + y2) / 2.0
114
+
115
+ if len(group) > 0:
116
+ next_mean_median = (sum_median + median) / (len(group) + 1)
117
+ next_mean_height = (sum_height + height) / (len(group) + 1)
118
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
119
+
120
+ if deviation_rate > max_deviation_rate:
121
+ yield group
122
+ group = []
123
+ sum_height = 0.0
124
+ sum_median = 0.0
125
+
126
+ group.append(fragment)
127
+ sum_height += height
128
+ sum_median += median
129
+
130
+ if len(group) > 0:
131
+ yield group
132
+
133
# Calculating the overlap ratio: area is deliberately not used, because most
# of the measured shapes are rectangles representing text lines. They are
# long and thin, so an area ratio would be very sensitive to changes in
# height. To be equally sensitive to width and height, the width ratio and
# the height ratio are averaged instead.
def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
    """Return how much of ``polygon2``'s extent is covered by ``polygon1``.

    The result is the mean of the width ratio and the height ratio between
    the bounding box of the intersection and the bounding box of
    ``polygon2``. It is 0.0 when the shapes do not overlap.
    """
    intersection = polygon1.intersection(polygon2)
    # Shapes that merely touch intersect in a line or a point. Such results
    # have no ``exterior`` and no area, so they count as "no overlap".
    if intersection.is_empty or intersection.area <= 0.0:
        return 0.0

    # ``bounds`` is available on every geometry type, including the
    # multi-part results an intersection may produce.
    i_x1, i_y1, i_x2, i_y2 = intersection.bounds
    p_x1, p_y1, p_x2, p_y2 = polygon2.bounds
    # polygon2 contains an intersection of positive area, so its bounding
    # box cannot be degenerate and the divisions below are safe.
    width_rate = (i_x2 - i_x1) / (p_x2 - p_x1)
    height_rate = (i_y2 - i_y1) / (p_y2 - p_y1)
    return (width_rate + height_rate) / 2.0
145
+
146
+ def _polygon_size(polygon: Polygon) -> tuple[float, float]:
147
+ x1: float = float("inf")
148
+ y1: float = float("inf")
149
+ x2: float = float("-inf")
150
+ y2: float = float("-inf")
151
+ for x, y in polygon.exterior.coords:
152
+ x1 = min(x1, x)
153
+ y1 = min(y1, y)
154
+ x2 = max(x2, x)
155
+ y2 = max(y2, y)
156
+ return x2 - x1, y2 - y1
@@ -8,11 +8,11 @@ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
8
8
  def plot(image: Image, layouts: Iterable[Layout]):
9
9
  draw = ImageDraw.Draw(image, mode="RGBA")
10
10
  for layout in layouts:
11
- draw.polygon([p for p in layout.rect], outline=_layout_color(layout), width=3)
11
+ draw.polygon([p for p in layout.rect], outline=_layout_color(layout), width=5)
12
12
 
13
13
  for layout in layouts:
14
14
  for fragments in layout.fragments:
15
- draw.polygon([p for p in fragments.rect], outline=_FRAGMENT_COLOR, width=1)
15
+ draw.polygon([p for p in fragments.rect], outline=_FRAGMENT_COLOR, width=3)
16
16
 
17
17
  def _layout_color(layout: Layout) -> tuple[int, int, int]:
18
18
  cls = layout.cls
@@ -44,6 +44,19 @@ class Rectangle:
44
44
  width += distance
45
45
  return width / 2, height / 2
46
46
 
47
@property
def wrapper(self) -> tuple[float, float, float, float]:
    """Axis-aligned bounding box ``(x1, y1, x2, y2)`` of the rectangle's points."""
    xs: list[float] = []
    ys: list[float] = []
    for x, y in self:
        xs.append(x)
        ys.append(y)

    infinity = float("inf")
    return (
        min(xs, default=infinity),
        min(ys, default=infinity),
        max(xs, default=-infinity),
        max(ys, default=-infinity),
    )
59
+
47
60
  def intersection_area(rect1: Rectangle, rect2: Rectangle) -> float:
48
61
  poly1 = Polygon(rect1)
49
62
  poly2 = Polygon(rect2)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -12,7 +12,6 @@ Requires-Dist: pillow<11.0,>=10.3
12
12
  Requires-Dist: shapely<3.0,>=2.0.0
13
13
  Requires-Dist: transformers<5.0,>=4.48.0
14
14
  Requires-Dist: doclayout_yolo>=0.0.3
15
- Requires-Dist: paddlepaddle<3.0,>=2.6.0
16
15
  Requires-Dist: paddleocr==2.9.0
17
16
  Dynamic: author
18
17
  Dynamic: author-email
@@ -7,6 +7,8 @@ doc_page_extractor/downloader.py
7
7
  doc_page_extractor/extractor.py
8
8
  doc_page_extractor/layoutreader.py
9
9
  doc_page_extractor/ocr.py
10
+ doc_page_extractor/ocr_corrector.py
11
+ doc_page_extractor/overlap.py
10
12
  doc_page_extractor/plot.py
11
13
  doc_page_extractor/raw_optimizer.py
12
14
  doc_page_extractor/rectangle.py
@@ -17,4 +19,6 @@ doc_page_extractor.egg-info/PKG-INFO
17
19
  doc_page_extractor.egg-info/SOURCES.txt
18
20
  doc_page_extractor.egg-info/dependency_links.txt
19
21
  doc_page_extractor.egg-info/requires.txt
20
- doc_page_extractor.egg-info/top_level.txt
22
+ doc_page_extractor.egg-info/top_level.txt
23
+ tests/__init__.py
24
+ tests/test_history_bus.py
@@ -3,5 +3,4 @@ pillow<11.0,>=10.3
3
3
  shapely<3.0,>=2.0.0
4
4
  transformers<5.0,>=4.48.0
5
5
  doclayout_yolo>=0.0.3
6
- paddlepaddle<3.0,>=2.6.0
7
6
  paddleocr==2.9.0
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="doc-page-extractor",
5
- version="0.0.2",
5
+ version="0.0.4",
6
6
  author="Tao Zeyu",
7
7
  author_email="i@taozeyu.com",
8
8
  url="https://github.com/Moskize91/doc-page-extractor",
@@ -16,7 +16,6 @@ setup(
16
16
  "shapely>=2.0.0,<3.0",
17
17
  "transformers>=4.48.0,<5.0",
18
18
  "doclayout_yolo>=0.0.3",
19
- "paddlepaddle>=2.6.0,<3.0",
20
19
  "paddleocr==2.9.0", # https://github.com/Moskize91/doc-page-extractor/issues/3
21
20
  ],
22
21
  )
File without changes
@@ -0,0 +1,55 @@
1
+ import os
2
+ import unittest
3
+
4
+ from PIL import Image
5
+ from doc_page_extractor import DocExtractor, Layout, LayoutClass
6
+
7
+
8
class TestGroup(unittest.TestCase):
    """Regression test that runs the whole extraction on a sample page image."""

    def test_history_bugs(self):
        """Extract ``tests/images/figure.png`` and compare classes and texts."""
        project_root = self._project_path()
        model_path = os.path.join(project_root, "model")
        image_path = os.path.join(project_root, "tests", "images", "figure.png")
        os.makedirs(model_path, exist_ok=True)

        expected = [
            (LayoutClass.PLAIN_TEXT, [
                "口的11.8%①。这既是江南农业落后的反映,又是它的原因。当战国以",
                "后黄河流域因铁器牛耕的普及获得基本的开发,农区联结成一大片的",
                "时候,南方农业开发始终没有突破星点状或斑块状分布的格局。由于",
                "地旷人稀,耕作相当粗放,许多水田采取火耕水瓣的方式,旱田则多",
                "行刀耕火种②。司马迁在《史记·货殖列传》中说:“总之,楚越之",
                "地,地厂人希,饭稻囊鱼,或火耕而水瓣,果隋(蕨)赢(螺)蛤,",
                "不待贾而足,地势饶食,无饥谨之患,以故皆偷生,无积聚而多",
                "贫。”这种概括虽然未免太突出了南方经济的落后面,有一定片面性,",
                "但大体还是反映了实际情形的。战国秦汉时期,南方与黄河流域农业",
                "的差距显然拉大了。",
            ]),
            (LayoutClass.FIGURE, []),
            (LayoutClass.FIGURE_CAPTION, [
                "西晋陶水田犁耙模型(广东连县出土)"
            ]),
            (LayoutClass.FIGURE, []),
            (LayoutClass.FIGURE_CAPTION, [
                "南朝陶耙田模型 (广西苍梧倒水出土)"
            ]),
            (LayoutClass.PLAIN_TEXT, [
                "①据赵文林、谢淑君:《中国人口史》(人民出版社1988年)有关资料统计。",
                "②《盐铁论·通有》:“荆扬…………伐木而树谷,焚莱而播粟,火耕而水。”"
            ]),
            (LayoutClass.ABANDON, [
                "136"
            ]),
        ]

        extractor = DocExtractor(model_path, "cpu")
        layouts: list[tuple[LayoutClass, list[str]]]

        with Image.open(image_path) as image:
            result = extractor.extract(image, "ch")
            layouts = list(map(self._format_Layout, result.layouts))

        self.assertEqual(layouts, expected)

    def _format_Layout(self, layout: Layout) -> tuple[LayoutClass, list[str]]:
        """Reduce a layout to its class and the stripped texts of its fragments."""
        texts: list[str] = []
        for fragment in layout.fragments:
            texts.append(fragment.text.strip())
        return layout.cls, texts

    def _project_path(self) -> str:
        """Return the absolute path two levels above this file."""
        relative = os.path.join(__file__, "..", "..")
        return os.path.abspath(relative)