doc-page-extractor 0.0.6__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (38)
  1. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/PKG-INFO +12 -3
  2. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/README.md +11 -1
  3. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/clipper.py +3 -0
  4. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/extractor.py +11 -6
  5. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/ocr.py +11 -6
  6. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/overlap.py +16 -7
  7. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/rotation.py +3 -1
  8. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/PKG-INFO +12 -3
  9. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/requires.txt +0 -1
  10. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/setup.py +1 -2
  11. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/LICENSE +0 -0
  12. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/__init__.py +0 -0
  13. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/downloader.py +0 -0
  14. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/layoutreader.py +0 -0
  15. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/ocr_corrector.py +0 -0
  16. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/__init__.py +0 -0
  17. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  18. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  19. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/imaug.py +0 -0
  20. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/operators.py +0 -0
  21. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  22. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  23. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  24. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  25. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  26. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  27. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/onnxocr/utils.py +0 -0
  28. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/plot.py +0 -0
  29. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/raw_optimizer.py +0 -0
  30. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/rectangle.py +0 -0
  31. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/types.py +0 -0
  32. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor/utils.py +0 -0
  33. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/SOURCES.txt +0 -0
  34. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  35. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/doc_page_extractor.egg-info/top_level.txt +0 -0
  36. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/setup.cfg +0 -0
  37. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/tests/__init__.py +0 -0
  38. {doc_page_extractor-0.0.6 → doc_page_extractor-0.0.8}/tests/test_history_bus.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -10,7 +10,6 @@ License-File: LICENSE
10
10
  Requires-Dist: opencv-python<5.0,>=4.11.0
11
11
  Requires-Dist: pillow<11.0,>=10.3
12
12
  Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: onnxruntime<2.0,>=1.19.0
14
13
  Requires-Dist: numpy<2.0,>=1.24.0
15
14
  Requires-Dist: shapely<3.0,>=2.0.0
16
15
  Requires-Dist: transformers<5.0,>=4.48.0
@@ -37,10 +36,20 @@ doc page extractor can identify text and format in images and return structured
37
36
  pip install doc-page-extractor
38
37
  ```
39
38
 
39
+ ```shell
40
+ pip install onnxruntime==1.21.0
41
+ ```
42
+
40
43
  ## Using CUDA
41
44
 
42
45
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
43
46
 
47
+ In addition, replace the `onnxruntime` install command from the previous section with the following:
48
+
49
+ ```shell
50
+ pip install onnxruntime-gpu==1.21.0
51
+ ```
52
+
44
53
  ## Example
45
54
 
46
55
  ```python
@@ -49,7 +58,7 @@ from doc_page_extractor import DocExtractor
49
58
 
50
59
  extractor = DocExtractor(
51
60
  model_dir_path=model_path, # Folder address where AI model is downloaded and installed
52
- device="cpu", # If you want to use CUDA, please change to device="cuda:0".
61
+ device="cpu", # If you want to use CUDA, please change to device="cuda".
53
62
  )
54
63
  with Image.open("/path/to/your/image.png") as image:
55
64
  result = extractor.extract(
@@ -12,10 +12,20 @@ doc page extractor can identify text and format in images and return structured
12
12
  pip install doc-page-extractor
13
13
  ```
14
14
 
15
+ ```shell
16
+ pip install onnxruntime==1.21.0
17
+ ```
18
+
15
19
  ## Using CUDA
16
20
 
17
21
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
18
22
 
23
+ In addition, replace the `onnxruntime` install command from the previous section with the following:
24
+
25
+ ```shell
26
+ pip install onnxruntime-gpu==1.21.0
27
+ ```
28
+
19
29
  ## Example
20
30
 
21
31
  ```python
@@ -24,7 +34,7 @@ from doc_page_extractor import DocExtractor
24
34
 
25
35
  extractor = DocExtractor(
26
36
  model_dir_path=model_path, # Folder address where AI model is downloaded and installed
27
- device="cpu", # If you want to use CUDA, please change to device="cuda:0".
37
+ device="cpu", # If you want to use CUDA, please change to device="cuda".
28
38
  )
29
39
  with Image.open("/path/to/your/image.png") as image:
30
40
  result = extractor.extract(
@@ -73,6 +73,9 @@ def _size_and_wrapper(rect: Rectangle):
73
73
  else:
74
74
  widths.append(distance)
75
75
 
76
+ if len(widths) == 0 and len(heights) == 0:
77
+ return 0.0, 0.0, 0.0, 0.0
78
+
76
79
  width: float = sum(widths) / len(widths)
77
80
  height: float = sum(heights) / len(heights)
78
81
  max_width: float = width
@@ -1,5 +1,4 @@
1
1
  import os
2
- import torch
3
2
 
4
3
  from typing import Literal, Iterable
5
4
  from pathlib import Path
@@ -34,10 +33,6 @@ class DocExtractor:
34
33
  self._yolo: YOLOv10 | None = None
35
34
  self._layout: LayoutLMv3ForTokenClassification | None = None
36
35
 
37
- if self._device.startswith("cuda") and not torch.cuda.is_available():
38
- self._device = "cpu"
39
- print("Warn: cuda is not available, use cpu instead")
40
-
41
36
  def extract(
42
37
  self,
43
38
  image: Image,
@@ -83,7 +78,7 @@ class DocExtractor:
83
78
  source=source,
84
79
  imgsz=1024,
85
80
  conf=0.2,
86
- device=self._device # Device to use (e.g., "cuda:0" or "cpu")
81
+ device=self._device # Device to use (e.g., "cuda" or "cpu")
87
82
  )
88
83
  boxes = det_res[0].__dict__["boxes"]
89
84
  layouts: list[Layout] = []
@@ -142,6 +137,10 @@ class DocExtractor:
142
137
  def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
143
138
  fragment_area = fragment.rect.area
144
139
  primary_layouts: list[(Layout, float)] = []
140
+
141
+ if fragment_area == 0.0:
142
+ return None
143
+
145
144
  for layout in layouts:
146
145
  area = intersection_area(fragment.rect, layout.rect)
147
146
  if area / fragment_area > 0.85:
@@ -174,6 +173,9 @@ class DocExtractor:
174
173
  fragment.order = i
175
174
 
176
175
  def _order_fragments_by_ai(self, width: int, height: int, layouts: list[Layout]):
176
+ if width == 0 or height == 0:
177
+ return
178
+
177
179
  layout_model = self._get_layout()
178
180
  boxes: list[list[int]] = []
179
181
  steps: float = 1000.0 # max value of layoutreader
@@ -273,6 +275,9 @@ class DocExtractor:
273
275
  width = right - left
274
276
  height = bottom - top
275
277
 
278
+ if width == 0 or height == 0:
279
+ return
280
+
276
281
  for _left, _top, _right, _bottom in boxes:
277
282
  yield (
278
283
  (_left - left) / width,
@@ -63,16 +63,21 @@ class OCR:
63
63
  text, rank = res
64
64
  if is_space_text(text):
65
65
  continue
66
+
67
+ rect = Rectangle(
68
+ lt=(box[0][0], box[0][1]),
69
+ rt=(box[1][0], box[1][1]),
70
+ rb=(box[2][0], box[2][1]),
71
+ lb=(box[3][0], box[3][1]),
72
+ )
73
+ if rect.area == 0.0:
74
+ continue
75
+
66
76
  yield OCRFragment(
67
77
  order=index,
68
78
  text=text,
69
79
  rank=rank,
70
- rect=Rectangle(
71
- lt=(box[0][0], box[0][1]),
72
- rt=(box[1][0], box[1][1]),
73
- rb=(box[2][0], box[2][1]),
74
- lb=(box[3][0], box[3][1]),
75
- ),
80
+ rect=rect,
76
81
  )
77
82
  index += 1
78
83
 
@@ -88,6 +88,9 @@ def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
88
88
  x2 = max(x2, x)
89
89
  y2 = max(y2, y)
90
90
 
91
+ if len(proto_texts_len) == 0:
92
+ continue
93
+
91
94
  fragments.append(OCRFragment(
92
95
  order=min_order,
93
96
  text=" ".join(texts),
@@ -115,13 +118,14 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
115
118
  if len(group) > 0:
116
119
  next_mean_median = (sum_median + median) / (len(group) + 1)
117
120
  next_mean_height = (sum_height + height) / (len(group) + 1)
118
- deviation_rate = abs(median - next_mean_median) / next_mean_height
119
121
 
120
- if deviation_rate > max_deviation_rate:
121
- yield group
122
- group = []
123
- sum_height = 0.0
124
- sum_median = 0.0
122
+ if next_mean_height > 0:
123
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
124
+ if deviation_rate > max_deviation_rate:
125
+ yield group
126
+ group = []
127
+ sum_height = 0.0
128
+ sum_median = 0.0
125
129
 
126
130
  group.append(fragment)
127
131
  sum_height += height
@@ -141,7 +145,12 @@ def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
141
145
  else:
142
146
  overlay_width, overlay_height = _polygon_size(intersection)
143
147
  polygon2_width, polygon2_height = _polygon_size(polygon2)
144
- return (overlay_width / polygon2_width + overlay_height / polygon2_height) / 2.0
148
+ if polygon2_width == 0.0 or polygon2_height == 0.0:
149
+ return 0.0
150
+ return (
151
+ overlay_width / polygon2_width +
152
+ overlay_height / polygon2_height
153
+ ) / 2.0
145
154
 
146
155
  def _polygon_size(polygon: Polygon) -> tuple[float, float]:
147
156
  x1: float = float("inf")
@@ -129,7 +129,9 @@ def _find_median(rotations: list[float]):
129
129
  mid2 = rotations[n // 2]
130
130
  return (mid1 + mid2) / 2
131
131
 
132
- def _find_mean(rotations: list[float]):
132
+ def _find_mean(rotations: list[float]) -> float:
133
+ if len(rotations) == 0:
134
+ return 0.0
133
135
  return sum(rotations) / len(rotations)
134
136
 
135
137
  # rotation is in [0, pi)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -10,7 +10,6 @@ License-File: LICENSE
10
10
  Requires-Dist: opencv-python<5.0,>=4.11.0
11
11
  Requires-Dist: pillow<11.0,>=10.3
12
12
  Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: onnxruntime<2.0,>=1.19.0
14
13
  Requires-Dist: numpy<2.0,>=1.24.0
15
14
  Requires-Dist: shapely<3.0,>=2.0.0
16
15
  Requires-Dist: transformers<5.0,>=4.48.0
@@ -37,10 +36,20 @@ doc page extractor can identify text and format in images and return structured
37
36
  pip install doc-page-extractor
38
37
  ```
39
38
 
39
+ ```shell
40
+ pip install onnxruntime==1.21.0
41
+ ```
42
+
40
43
  ## Using CUDA
41
44
 
42
45
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
43
46
 
47
+ In addition, replace the `onnxruntime` install command from the previous section with the following:
48
+
49
+ ```shell
50
+ pip install onnxruntime-gpu==1.21.0
51
+ ```
52
+
44
53
  ## Example
45
54
 
46
55
  ```python
@@ -49,7 +58,7 @@ from doc_page_extractor import DocExtractor
49
58
 
50
59
  extractor = DocExtractor(
51
60
  model_dir_path=model_path, # Folder address where AI model is downloaded and installed
52
- device="cpu", # If you want to use CUDA, please change to device="cuda:0".
61
+ device="cpu", # If you want to use CUDA, please change to device="cuda".
53
62
  )
54
63
  with Image.open("/path/to/your/image.png") as image:
55
64
  result = extractor.extract(
@@ -1,7 +1,6 @@
1
1
  opencv-python<5.0,>=4.11.0
2
2
  pillow<11.0,>=10.3
3
3
  pyclipper<2.0,>=1.2.0
4
- onnxruntime<2.0,>=1.19.0
5
4
  numpy<2.0,>=1.24.0
6
5
  shapely<3.0,>=2.0.0
7
6
  transformers<5.0,>=4.48.0
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="doc-page-extractor",
5
- version="0.0.6",
5
+ version="0.0.8",
6
6
  author="Tao Zeyu",
7
7
  author_email="i@taozeyu.com",
8
8
  url="https://github.com/Moskize91/doc-page-extractor",
@@ -14,7 +14,6 @@ setup(
14
14
  "opencv-python>=4.11.0,<5.0",
15
15
  "pillow>=10.3,<11.0",
16
16
  "pyclipper>=1.2.0,<2.0",
17
- "onnxruntime>=1.19.0,<2.0",
18
17
  "numpy>=1.24.0,<2.0",
19
18
  "shapely>=2.0.0,<3.0",
20
19
  "transformers>=4.48.0,<5.0",