doc-page-extractor 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

@@ -73,6 +73,9 @@ def _size_and_wrapper(rect: Rectangle):
73
73
  else:
74
74
  widths.append(distance)
75
75
 
76
+ if len(widths) == 0 and len(heights) == 0:
77
+ return 0.0, 0.0, 0.0, 0.0
78
+
76
79
  width: float = sum(widths) / len(widths)
77
80
  height: float = sum(heights) / len(heights)
78
81
  max_width: float = width
@@ -137,6 +137,10 @@ class DocExtractor:
137
137
  def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
138
138
  fragment_area = fragment.rect.area
139
139
  primary_layouts: list[(Layout, float)] = []
140
+
141
+ if fragment_area == 0.0:
142
+ return None
143
+
140
144
  for layout in layouts:
141
145
  area = intersection_area(fragment.rect, layout.rect)
142
146
  if area / fragment_area > 0.85:
@@ -169,6 +173,9 @@ class DocExtractor:
169
173
  fragment.order = i
170
174
 
171
175
  def _order_fragments_by_ai(self, width: int, height: int, layouts: list[Layout]):
176
+ if width == 0 or height == 0:
177
+ return
178
+
172
179
  layout_model = self._get_layout()
173
180
  boxes: list[list[int]] = []
174
181
  steps: float = 1000.0 # max value of layoutreader
@@ -268,6 +275,9 @@ class DocExtractor:
268
275
  width = right - left
269
276
  height = bottom - top
270
277
 
278
+ if width == 0 or height == 0:
279
+ return
280
+
271
281
  for _left, _top, _right, _bottom in boxes:
272
282
  yield (
273
283
  (_left - left) / width,
doc_page_extractor/ocr.py CHANGED
@@ -63,16 +63,21 @@ class OCR:
63
63
  text, rank = res
64
64
  if is_space_text(text):
65
65
  continue
66
+
67
+ rect = Rectangle(
68
+ lt=(box[0][0], box[0][1]),
69
+ rt=(box[1][0], box[1][1]),
70
+ rb=(box[2][0], box[2][1]),
71
+ lb=(box[3][0], box[3][1]),
72
+ )
73
+ if rect.area == 0.0:
74
+ continue
75
+
66
76
  yield OCRFragment(
67
77
  order=index,
68
78
  text=text,
69
79
  rank=rank,
70
- rect=Rectangle(
71
- lt=(box[0][0], box[0][1]),
72
- rt=(box[1][0], box[1][1]),
73
- rb=(box[2][0], box[2][1]),
74
- lb=(box[3][0], box[3][1]),
75
- ),
80
+ rect=rect,
76
81
  )
77
82
  index += 1
78
83
 
@@ -88,6 +88,9 @@ def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
88
88
  x2 = max(x2, x)
89
89
  y2 = max(y2, y)
90
90
 
91
+ if proto_texts_len == 0:
92
+ continue
93
+
91
94
  fragments.append(OCRFragment(
92
95
  order=min_order,
93
96
  text=" ".join(texts),
@@ -112,11 +115,14 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
112
115
  height = y2 - y1
113
116
  median = (y1 + y2) / 2.0
114
117
 
118
+ if height == 0:
119
+ continue
120
+
115
121
  if len(group) > 0:
116
122
  next_mean_median = (sum_median + median) / (len(group) + 1)
117
123
  next_mean_height = (sum_height + height) / (len(group) + 1)
118
- deviation_rate = abs(median - next_mean_median) / next_mean_height
119
124
 
125
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
120
126
  if deviation_rate > max_deviation_rate:
121
127
  yield group
122
128
  group = []
@@ -141,7 +147,12 @@ def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
141
147
  else:
142
148
  overlay_width, overlay_height = _polygon_size(intersection)
143
149
  polygon2_width, polygon2_height = _polygon_size(polygon2)
144
- return (overlay_width / polygon2_width + overlay_height / polygon2_height) / 2.0
150
+ if polygon2_width == 0.0 or polygon2_height == 0.0:
151
+ return 0.0
152
+ return (
153
+ overlay_width / polygon2_width +
154
+ overlay_height / polygon2_height
155
+ ) / 2.0
145
156
 
146
157
  def _polygon_size(polygon: Polygon) -> tuple[float, float]:
147
158
  x1: float = float("inf")
@@ -1,20 +1,73 @@
1
1
  from typing import Iterable
2
2
  from PIL import ImageDraw
3
+ from PIL.ImageFont import load_default, FreeTypeFont
3
4
  from PIL.Image import Image
4
5
  from .types import Layout, LayoutClass
6
+ from .rectangle import Point
5
7
 
6
8
  _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
9
+ _Color = tuple[int, int, int]
7
10
 
8
- def plot(image: Image, layouts: Iterable[Layout]):
11
+ def plot(image: Image, layouts: Iterable[Layout]) -> None:
12
+ layout_font = load_default(size=35)
13
+ fragment_font = load_default(size=25)
9
14
  draw = ImageDraw.Draw(image, mode="RGBA")
15
+
16
+ def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:
17
+ nonlocal draw
18
+ x, y = position
19
+ text = str(object=number)
20
+ width = len(text) * font.size
21
+ offset = round(font.size * 0.15)
22
+
23
+ for dx, dy in _generate_delta(bold):
24
+ draw.text(
25
+ xy=(x + dx - width - offset, y + dy),
26
+ text=text,
27
+ font=font,
28
+ fill=color,
29
+ )
30
+
10
31
  for layout in layouts:
11
- draw.polygon([p for p in layout.rect], outline=_layout_color(layout), width=5)
32
+ draw.polygon(
33
+ xy=[p for p in layout.rect],
34
+ outline=_layout_color(layout),
35
+ width=5,
36
+ )
12
37
 
13
38
  for layout in layouts:
14
- for fragments in layout.fragments:
15
- draw.polygon([p for p in fragments.rect], outline=_FRAGMENT_COLOR, width=3)
39
+ for fragment in layout.fragments:
40
+ draw.polygon(
41
+ xy=[p for p in fragment.rect],
42
+ outline=_FRAGMENT_COLOR,
43
+ width=3,
44
+ )
45
+ _draw_number(
46
+ position=fragment.rect.lt,
47
+ number=fragment.order + 1,
48
+ font=fragment_font,
49
+ bold=False,
50
+ color=_FRAGMENT_COLOR,
51
+ )
52
+
53
+ for i, layout in enumerate(layouts):
54
+ _draw_number(
55
+ position=layout.rect.lt,
56
+ number=i + 1,
57
+ font=layout_font,
58
+ bold=True,
59
+ color=_layout_color(layout),
60
+ )
61
+
62
+ def _generate_delta(bold: bool):
63
+ if bold:
64
+ for dx in range(-1, 2):
65
+ for dy in range(-1, 2):
66
+ yield dx, dy
67
+ else:
68
+ yield 0, 0
16
69
 
17
- def _layout_color(layout: Layout) -> tuple[int, int, int]:
70
+ def _layout_color(layout: Layout) -> _Color:
18
71
  cls = layout.cls
19
72
  if cls == LayoutClass.TITLE:
20
73
  return (0x0A, 0x12, 0x2C) # Dark
@@ -129,7 +129,9 @@ def _find_median(rotations: list[float]):
129
129
  mid2 = rotations[n // 2]
130
130
  return (mid1 + mid2) / 2
131
131
 
132
- def _find_mean(rotations: list[float]):
132
+ def _find_mean(rotations: list[float]) -> float:
133
+ if len(rotations) == 0:
134
+ return 0.0
133
135
  return sum(rotations) / len(rotations)
134
136
 
135
137
  # rotation is in [0, pi)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -1,15 +1,15 @@
1
1
  doc_page_extractor/__init__.py,sha256=jCf5lo3A9JVDquflYMlvH8nJIs3EjBt8AG5y8mwfS68,210
2
- doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
2
+ doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
3
3
  doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
- doc_page_extractor/extractor.py,sha256=D3SLWUAciq8jGU6mlkVwIon-4nHJaYoKpPjGCN_YLhQ,10055
4
+ doc_page_extractor/extractor.py,sha256=RPoqc_UtbHe4BoQvtaJxWuKyHwoDkK_fuYvuyA31p0Y,10200
5
5
  doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
6
- doc_page_extractor/ocr.py,sha256=6eLUVx6NSuRAwrq8Mc2zYs3yocxpOgUQS_4LIIqywnQ,5147
6
+ doc_page_extractor/ocr.py,sha256=tEalIlEz_80bJN8BXnHWi3UCMoPdJ5ylgkmaBV1dzVs,5201
7
7
  doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
8
- doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
9
- doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
8
+ doc_page_extractor/overlap.py,sha256=Nx7Z6Ih_7SBE-JAjqlMx51PYiciXomPxIOAS37x6fco,5327
9
+ doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
10
10
  doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
11
11
  doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
12
- doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
12
+ doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
13
13
  doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
14
14
  doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
15
15
  doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
@@ -24,10 +24,10 @@ doc_page_extractor/onnxocr/predict_rec.py,sha256=qQrCs5jzCf5PYp-iEKJ53pcx_xRoJdJ
24
24
  doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
25
25
  doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
26
26
  doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
27
+ doc_page_extractor-0.0.9.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
27
28
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
29
  tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
29
- doc_page_extractor-0.0.7.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
30
- doc_page_extractor-0.0.7.dist-info/METADATA,sha256=s-ewJAyPQ1I_fgTee91NN99T42HcAaKFu1MAUhZKqdk,2203
31
- doc_page_extractor-0.0.7.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
32
- doc_page_extractor-0.0.7.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
33
- doc_page_extractor-0.0.7.dist-info/RECORD,,
30
+ doc_page_extractor-0.0.9.dist-info/METADATA,sha256=vwcUU9DE9rUhpbz1UigpDN3YhNnNSRbHGAyJroHPV7Q,2225
31
+ doc_page_extractor-0.0.9.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
32
+ doc_page_extractor-0.0.9.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
33
+ doc_page_extractor-0.0.9.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5