doc-page-extractor 0.0.7__tar.gz → 0.0.8__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (38)
  1. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/PKG-INFO +1 -1
  2. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/clipper.py +3 -0
  3. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/extractor.py +10 -0
  4. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/ocr.py +11 -6
  5. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/overlap.py +16 -7
  6. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/rotation.py +3 -1
  7. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/PKG-INFO +1 -1
  8. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/setup.py +1 -1
  9. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/LICENSE +0 -0
  10. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/README.md +0 -0
  11. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/__init__.py +0 -0
  12. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/downloader.py +0 -0
  13. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/layoutreader.py +0 -0
  14. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/ocr_corrector.py +0 -0
  15. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/__init__.py +0 -0
  16. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  17. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  18. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/imaug.py +0 -0
  19. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/operators.py +0 -0
  20. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  21. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  22. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  23. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  24. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  25. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  26. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/utils.py +0 -0
  27. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/plot.py +0 -0
  28. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/raw_optimizer.py +0 -0
  29. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/rectangle.py +0 -0
  30. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/types.py +0 -0
  31. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor/utils.py +0 -0
  32. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/SOURCES.txt +0 -0
  33. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  34. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/requires.txt +0 -0
  35. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/top_level.txt +0 -0
  36. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/setup.cfg +0 -0
  37. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/tests/__init__.py +0 -0
  38. {doc_page_extractor-0.0.7 → doc_page_extractor-0.0.8}/tests/test_history_bus.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.7
3
+ Version: 0.0.8
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -73,6 +73,9 @@ def _size_and_wrapper(rect: Rectangle):
73
73
  else:
74
74
  widths.append(distance)
75
75
 
76
+ if len(widths) == 0 and len(heights) == 0:
77
+ return 0.0, 0.0, 0.0, 0.0
78
+
76
79
  width: float = sum(widths) / len(widths)
77
80
  height: float = sum(heights) / len(heights)
78
81
  max_width: float = width
@@ -137,6 +137,10 @@ class DocExtractor:
137
137
  def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
138
138
  fragment_area = fragment.rect.area
139
139
  primary_layouts: list[(Layout, float)] = []
140
+
141
+ if fragment_area == 0.0:
142
+ return None
143
+
140
144
  for layout in layouts:
141
145
  area = intersection_area(fragment.rect, layout.rect)
142
146
  if area / fragment_area > 0.85:
@@ -169,6 +173,9 @@ class DocExtractor:
169
173
  fragment.order = i
170
174
 
171
175
  def _order_fragments_by_ai(self, width: int, height: int, layouts: list[Layout]):
176
+ if width == 0 or height == 0:
177
+ return
178
+
172
179
  layout_model = self._get_layout()
173
180
  boxes: list[list[int]] = []
174
181
  steps: float = 1000.0 # max value of layoutreader
@@ -268,6 +275,9 @@ class DocExtractor:
268
275
  width = right - left
269
276
  height = bottom - top
270
277
 
278
+ if width == 0 or height == 0:
279
+ return
280
+
271
281
  for _left, _top, _right, _bottom in boxes:
272
282
  yield (
273
283
  (_left - left) / width,
@@ -63,16 +63,21 @@ class OCR:
63
63
  text, rank = res
64
64
  if is_space_text(text):
65
65
  continue
66
+
67
+ rect = Rectangle(
68
+ lt=(box[0][0], box[0][1]),
69
+ rt=(box[1][0], box[1][1]),
70
+ rb=(box[2][0], box[2][1]),
71
+ lb=(box[3][0], box[3][1]),
72
+ )
73
+ if rect.area == 0.0:
74
+ continue
75
+
66
76
  yield OCRFragment(
67
77
  order=index,
68
78
  text=text,
69
79
  rank=rank,
70
- rect=Rectangle(
71
- lt=(box[0][0], box[0][1]),
72
- rt=(box[1][0], box[1][1]),
73
- rb=(box[2][0], box[2][1]),
74
- lb=(box[3][0], box[3][1]),
75
- ),
80
+ rect=rect,
76
81
  )
77
82
  index += 1
78
83
 
@@ -88,6 +88,9 @@ def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
88
88
  x2 = max(x2, x)
89
89
  y2 = max(y2, y)
90
90
 
91
+ if len(proto_texts_len) == 0:
92
+ continue
93
+
91
94
  fragments.append(OCRFragment(
92
95
  order=min_order,
93
96
  text=" ".join(texts),
@@ -115,13 +118,14 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
115
118
  if len(group) > 0:
116
119
  next_mean_median = (sum_median + median) / (len(group) + 1)
117
120
  next_mean_height = (sum_height + height) / (len(group) + 1)
118
- deviation_rate = abs(median - next_mean_median) / next_mean_height
119
121
 
120
- if deviation_rate > max_deviation_rate:
121
- yield group
122
- group = []
123
- sum_height = 0.0
124
- sum_median = 0.0
122
+ if next_mean_height > 0:
123
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
124
+ if deviation_rate > max_deviation_rate:
125
+ yield group
126
+ group = []
127
+ sum_height = 0.0
128
+ sum_median = 0.0
125
129
 
126
130
  group.append(fragment)
127
131
  sum_height += height
@@ -141,7 +145,12 @@ def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
141
145
  else:
142
146
  overlay_width, overlay_height = _polygon_size(intersection)
143
147
  polygon2_width, polygon2_height = _polygon_size(polygon2)
144
- return (overlay_width / polygon2_width + overlay_height / polygon2_height) / 2.0
148
+ if polygon2_width == 0.0 or polygon2_height == 0.0:
149
+ return 0.0
150
+ return (
151
+ overlay_width / polygon2_width +
152
+ overlay_height / polygon2_height
153
+ ) / 2.0
145
154
 
146
155
  def _polygon_size(polygon: Polygon) -> tuple[float, float]:
147
156
  x1: float = float("inf")
@@ -129,7 +129,9 @@ def _find_median(rotations: list[float]):
129
129
  mid2 = rotations[n // 2]
130
130
  return (mid1 + mid2) / 2
131
131
 
132
- def _find_mean(rotations: list[float]):
132
+ def _find_mean(rotations: list[float]) -> float:
133
+ if len(rotations) == 0:
134
+ return 0.0
133
135
  return sum(rotations) / len(rotations)
134
136
 
135
137
  # rotation is in [0, pi)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.7
3
+ Version: 0.0.8
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="doc-page-extractor",
5
- version="0.0.7",
5
+ version="0.0.8",
6
6
  author="Tao Zeyu",
7
7
  author_email="i@taozeyu.com",
8
8
  url="https://github.com/Moskize91/doc-page-extractor",