doc-page-extractor 0.0.7__tar.gz → 0.0.9__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (39)
  1. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/PKG-INFO +3 -2
  2. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/clipper.py +3 -0
  3. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/extractor.py +10 -0
  4. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/ocr.py +11 -6
  5. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/overlap.py +13 -2
  6. doc_page_extractor-0.0.9/doc_page_extractor/plot.py +91 -0
  7. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/rotation.py +3 -1
  8. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/PKG-INFO +3 -2
  9. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/setup.py +1 -1
  10. doc_page_extractor-0.0.7/doc_page_extractor/plot.py +0 -38
  11. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/LICENSE +0 -0
  12. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/README.md +0 -0
  13. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/__init__.py +0 -0
  14. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/downloader.py +0 -0
  15. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/layoutreader.py +0 -0
  16. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/ocr_corrector.py +0 -0
  17. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/__init__.py +0 -0
  18. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  19. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  20. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/imaug.py +0 -0
  21. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/operators.py +0 -0
  22. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  23. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  24. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  25. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  26. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  27. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  28. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/utils.py +0 -0
  29. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/raw_optimizer.py +0 -0
  30. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/rectangle.py +0 -0
  31. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/types.py +0 -0
  32. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor/utils.py +0 -0
  33. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/SOURCES.txt +0 -0
  34. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  35. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/requires.txt +0 -0
  36. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/top_level.txt +0 -0
  37. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/setup.cfg +0 -0
  38. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/tests/__init__.py +0 -0
  39. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.9}/tests/test_history_bus.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -73,6 +73,9 @@ def _size_and_wrapper(rect: Rectangle):
73
73
  else:
74
74
  widths.append(distance)
75
75
 
76
+ if len(widths) == 0 and len(heights) == 0:
77
+ return 0.0, 0.0, 0.0, 0.0
78
+
76
79
  width: float = sum(widths) / len(widths)
77
80
  height: float = sum(heights) / len(heights)
78
81
  max_width: float = width
@@ -137,6 +137,10 @@ class DocExtractor:
137
137
  def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
138
138
  fragment_area = fragment.rect.area
139
139
  primary_layouts: list[(Layout, float)] = []
140
+
141
+ if fragment_area == 0.0:
142
+ return None
143
+
140
144
  for layout in layouts:
141
145
  area = intersection_area(fragment.rect, layout.rect)
142
146
  if area / fragment_area > 0.85:
@@ -169,6 +173,9 @@ class DocExtractor:
169
173
  fragment.order = i
170
174
 
171
175
  def _order_fragments_by_ai(self, width: int, height: int, layouts: list[Layout]):
176
+ if width == 0 or height == 0:
177
+ return
178
+
172
179
  layout_model = self._get_layout()
173
180
  boxes: list[list[int]] = []
174
181
  steps: float = 1000.0 # max value of layoutreader
@@ -268,6 +275,9 @@ class DocExtractor:
268
275
  width = right - left
269
276
  height = bottom - top
270
277
 
278
+ if width == 0 or height == 0:
279
+ return
280
+
271
281
  for _left, _top, _right, _bottom in boxes:
272
282
  yield (
273
283
  (_left - left) / width,
@@ -63,16 +63,21 @@ class OCR:
63
63
  text, rank = res
64
64
  if is_space_text(text):
65
65
  continue
66
+
67
+ rect = Rectangle(
68
+ lt=(box[0][0], box[0][1]),
69
+ rt=(box[1][0], box[1][1]),
70
+ rb=(box[2][0], box[2][1]),
71
+ lb=(box[3][0], box[3][1]),
72
+ )
73
+ if rect.area == 0.0:
74
+ continue
75
+
66
76
  yield OCRFragment(
67
77
  order=index,
68
78
  text=text,
69
79
  rank=rank,
70
- rect=Rectangle(
71
- lt=(box[0][0], box[0][1]),
72
- rt=(box[1][0], box[1][1]),
73
- rb=(box[2][0], box[2][1]),
74
- lb=(box[3][0], box[3][1]),
75
- ),
80
+ rect=rect,
76
81
  )
77
82
  index += 1
78
83
 
@@ -88,6 +88,9 @@ def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
88
88
  x2 = max(x2, x)
89
89
  y2 = max(y2, y)
90
90
 
91
+ if proto_texts_len == 0:
92
+ continue
93
+
91
94
  fragments.append(OCRFragment(
92
95
  order=min_order,
93
96
  text=" ".join(texts),
@@ -112,11 +115,14 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
112
115
  height = y2 - y1
113
116
  median = (y1 + y2) / 2.0
114
117
 
118
+ if height == 0:
119
+ continue
120
+
115
121
  if len(group) > 0:
116
122
  next_mean_median = (sum_median + median) / (len(group) + 1)
117
123
  next_mean_height = (sum_height + height) / (len(group) + 1)
118
- deviation_rate = abs(median - next_mean_median) / next_mean_height
119
124
 
125
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
120
126
  if deviation_rate > max_deviation_rate:
121
127
  yield group
122
128
  group = []
@@ -141,7 +147,12 @@ def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
141
147
  else:
142
148
  overlay_width, overlay_height = _polygon_size(intersection)
143
149
  polygon2_width, polygon2_height = _polygon_size(polygon2)
144
- return (overlay_width / polygon2_width + overlay_height / polygon2_height) / 2.0
150
+ if polygon2_width == 0.0 or polygon2_height == 0.0:
151
+ return 0.0
152
+ return (
153
+ overlay_width / polygon2_width +
154
+ overlay_height / polygon2_height
155
+ ) / 2.0
145
156
 
146
157
  def _polygon_size(polygon: Polygon) -> tuple[float, float]:
147
158
  x1: float = float("inf")
@@ -0,0 +1,91 @@
1
+ from typing import Iterable
2
+ from PIL import ImageDraw
3
+ from PIL.ImageFont import load_default, FreeTypeFont
4
+ from PIL.Image import Image
5
+ from .types import Layout, LayoutClass
6
+ from .rectangle import Point
7
+
8
+ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
9
+ _Color = tuple[int, int, int]
10
+
11
+ def plot(image: Image, layouts: Iterable[Layout]) -> None:
12
+ layout_font = load_default(size=35)
13
+ fragment_font = load_default(size=25)
14
+ draw = ImageDraw.Draw(image, mode="RGBA")
15
+
16
+ def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:
17
+ nonlocal draw
18
+ x, y = position
19
+ text = str(object=number)
20
+ width = len(text) * font.size
21
+ offset = round(font.size * 0.15)
22
+
23
+ for dx, dy in _generate_delta(bold):
24
+ draw.text(
25
+ xy=(x + dx - width - offset, y + dy),
26
+ text=text,
27
+ font=font,
28
+ fill=color,
29
+ )
30
+
31
+ for layout in layouts:
32
+ draw.polygon(
33
+ xy=[p for p in layout.rect],
34
+ outline=_layout_color(layout),
35
+ width=5,
36
+ )
37
+
38
+ for layout in layouts:
39
+ for fragment in layout.fragments:
40
+ draw.polygon(
41
+ xy=[p for p in fragment.rect],
42
+ outline=_FRAGMENT_COLOR,
43
+ width=3,
44
+ )
45
+ _draw_number(
46
+ position=fragment.rect.lt,
47
+ number=fragment.order + 1,
48
+ font=fragment_font,
49
+ bold=False,
50
+ color=_FRAGMENT_COLOR,
51
+ )
52
+
53
+ for i, layout in enumerate(layouts):
54
+ _draw_number(
55
+ position=layout.rect.lt,
56
+ number=i + 1,
57
+ font=layout_font,
58
+ bold=True,
59
+ color=_layout_color(layout),
60
+ )
61
+
62
+ def _generate_delta(bold: bool):
63
+ if bold:
64
+ for dx in range(-1, 2):
65
+ for dy in range(-1, 2):
66
+ yield dx, dy
67
+ else:
68
+ yield 0, 0
69
+
70
+ def _layout_color(layout: Layout) -> _Color:
71
+ cls = layout.cls
72
+ if cls == LayoutClass.TITLE:
73
+ return (0x0A, 0x12, 0x2C) # Dark
74
+ elif cls == LayoutClass.PLAIN_TEXT:
75
+ return (0x3C, 0x67, 0x90) # Blue
76
+ elif cls == LayoutClass.ABANDON:
77
+ return (0xC0, 0xBB, 0xA9) # Gray
78
+ elif cls == LayoutClass.FIGURE:
79
+ return (0x5B, 0x91, 0x3C) # Dark Green
80
+ elif cls == LayoutClass.FIGURE_CAPTION:
81
+ return (0x77, 0xB3, 0x54) # Green
82
+ elif cls == LayoutClass.TABLE:
83
+ return (0x44, 0x17, 0x52) # Dark Purple
84
+ elif cls == LayoutClass.TABLE_CAPTION:
85
+ return (0x81, 0x75, 0xA0) # Purple
86
+ elif cls == LayoutClass.TABLE_FOOTNOTE:
87
+ return (0xEF, 0xB6, 0xC9) # Pink Purple
88
+ elif cls == LayoutClass.ISOLATE_FORMULA:
89
+ return (0xFA, 0x38, 0x27) # Red
90
+ elif cls == LayoutClass.FORMULA_CAPTION:
91
+ return (0xFF, 0x9D, 0x24) # Orange
@@ -129,7 +129,9 @@ def _find_median(rotations: list[float]):
129
129
  mid2 = rotations[n // 2]
130
130
  return (mid1 + mid2) / 2
131
131
 
132
- def _find_mean(rotations: list[float]):
132
+ def _find_mean(rotations: list[float]) -> float:
133
+ if len(rotations) == 0:
134
+ return 0.0
133
135
  return sum(rotations) / len(rotations)
134
136
 
135
137
  # rotation is in [0, pi)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="doc-page-extractor",
5
- version="0.0.7",
5
+ version="0.0.9",
6
6
  author="Tao Zeyu",
7
7
  author_email="i@taozeyu.com",
8
8
  url="https://github.com/Moskize91/doc-page-extractor",
@@ -1,38 +0,0 @@
1
- from typing import Iterable
2
- from PIL import ImageDraw
3
- from PIL.Image import Image
4
- from .types import Layout, LayoutClass
5
-
6
- _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
7
-
8
- def plot(image: Image, layouts: Iterable[Layout]):
9
- draw = ImageDraw.Draw(image, mode="RGBA")
10
- for layout in layouts:
11
- draw.polygon([p for p in layout.rect], outline=_layout_color(layout), width=5)
12
-
13
- for layout in layouts:
14
- for fragments in layout.fragments:
15
- draw.polygon([p for p in fragments.rect], outline=_FRAGMENT_COLOR, width=3)
16
-
17
- def _layout_color(layout: Layout) -> tuple[int, int, int]:
18
- cls = layout.cls
19
- if cls == LayoutClass.TITLE:
20
- return (0x0A, 0x12, 0x2C) # Dark
21
- elif cls == LayoutClass.PLAIN_TEXT:
22
- return (0x3C, 0x67, 0x90) # Blue
23
- elif cls == LayoutClass.ABANDON:
24
- return (0xC0, 0xBB, 0xA9) # Gray
25
- elif cls == LayoutClass.FIGURE:
26
- return (0x5B, 0x91, 0x3C) # Dark Green
27
- elif cls == LayoutClass.FIGURE_CAPTION:
28
- return (0x77, 0xB3, 0x54) # Green
29
- elif cls == LayoutClass.TABLE:
30
- return (0x44, 0x17, 0x52) # Dark Purple
31
- elif cls == LayoutClass.TABLE_CAPTION:
32
- return (0x81, 0x75, 0xA0) # Purple
33
- elif cls == LayoutClass.TABLE_FOOTNOTE:
34
- return (0xEF, 0xB6, 0xC9) # Pink Purple
35
- elif cls == LayoutClass.ISOLATE_FORMULA:
36
- return (0xFA, 0x38, 0x27) # Red
37
- elif cls == LayoutClass.FORMULA_CAPTION:
38
- return (0xFF, 0x9D, 0x24) # Orange