doc-page-extractor 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (52)
  1. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/PKG-INFO +33 -30
  2. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/README.md +3 -7
  3. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/clipper.py +3 -3
  4. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/extractor.py +3 -3
  5. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/layout_order.py +3 -3
  6. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/layoutreader.py +1 -1
  7. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/model.py +39 -20
  8. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr.py +8 -5
  9. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr_corrector.py +3 -3
  10. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/overlap.py +3 -3
  11. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/plot.py +6 -4
  12. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/raw_optimizer.py +1 -1
  13. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/table.py +1 -1
  14. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/types.py +2 -2
  15. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/utils.py +1 -1
  16. doc_page_extractor-0.2.4/pyproject.toml +59 -0
  17. doc_page_extractor-0.2.4/scripts/prebuild.py +8 -0
  18. doc_page_extractor-0.2.2/doc_page_extractor.egg-info/PKG-INFO +0 -85
  19. doc_page_extractor-0.2.2/doc_page_extractor.egg-info/SOURCES.txt +0 -48
  20. doc_page_extractor-0.2.2/doc_page_extractor.egg-info/dependency_links.txt +0 -1
  21. doc_page_extractor-0.2.2/doc_page_extractor.egg-info/requires.txt +0 -10
  22. doc_page_extractor-0.2.2/doc_page_extractor.egg-info/top_level.txt +0 -2
  23. doc_page_extractor-0.2.2/setup.cfg +0 -4
  24. doc_page_extractor-0.2.2/setup.py +0 -28
  25. doc_page_extractor-0.2.2/tests/__init__.py +0 -0
  26. doc_page_extractor-0.2.2/tests/test_history_bus.py +0 -55
  27. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/LICENSE +0 -0
  28. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/__init__.py +0 -0
  29. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/downloader.py +0 -0
  30. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/latex.py +0 -0
  31. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/__init__.py +0 -0
  32. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  33. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  34. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/imaug.py +0 -0
  35. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/operators.py +0 -0
  36. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  37. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  38. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  39. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  40. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  41. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  42. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/utils.py +0 -0
  43. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/rectangle.py +0 -0
  44. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/rotation.py +0 -0
  45. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/__init__.py +0 -0
  46. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -0
  47. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -0
  48. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -0
  49. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -0
  50. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -0
  51. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -0
  52. {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -0
@@ -1,30 +1,37 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: doc-page-extractor
3
- Version: 0.2.2
4
- Summary: doc page extractor can identify text and format in images and return structured data.
5
- Home-page: https://github.com/Moskize91/doc-page-extractor
3
+ Version: 0.2.4
4
+ Summary:
5
+ License: AGPL-3.0
6
6
  Author: Tao Zeyu
7
7
  Author-email: i@taozeyu.com
8
+ Maintainer: Tao Zeyu
9
+ Maintainer-email: i@taozeyu.com
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Provides-Extra: cpu
20
+ Provides-Extra: cuda
21
+ Requires-Dist: accelerate (>=1.6.0,<2.0)
22
+ Requires-Dist: doclayout_yolo (>=0.0.3)
23
+ Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
24
+ Requires-Dist: numpy (>=1.24.0,<2.0)
25
+ Requires-Dist: onnxruntime (==1.21.0) ; extra == "cpu"
26
+ Requires-Dist: onnxruntime-gpu (==1.21.0) ; extra == "cuda"
27
+ Requires-Dist: opencv-python (>=4.10.0,<5.0)
28
+ Requires-Dist: pillow (>=10.3,<11.0)
29
+ Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
30
+ Requires-Dist: pyclipper (>=1.2.0,<2.0)
31
+ Requires-Dist: shapely (>=2.0.0,<3.0)
32
+ Requires-Dist: transformers (>=4.42.4,<=4.47)
33
+ Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
8
34
  Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: opencv-python<5.0,>=4.10.0
11
- Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: numpy<2.0,>=1.24.0
14
- Requires-Dist: shapely<3.0,>=2.0.0
15
- Requires-Dist: transformers<=4.47,>=4.42.4
16
- Requires-Dist: doclayout_yolo>=0.0.3
17
- Requires-Dist: pix2tex<=0.2.0,>=0.1.4
18
- Requires-Dist: accelerate<2.0,>=1.6.0
19
- Requires-Dist: huggingface_hub<1.0,>=0.30.2
20
- Dynamic: author
21
- Dynamic: author-email
22
- Dynamic: description
23
- Dynamic: description-content-type
24
- Dynamic: home-page
25
- Dynamic: license-file
26
- Dynamic: requires-dist
27
- Dynamic: summary
28
35
 
29
36
  # doc page extractor
30
37
 
@@ -37,21 +44,17 @@ doc page extractor can identify text and format in images and return structured
37
44
  ## Installation
38
45
 
39
46
  ```shell
40
- pip install doc-page-extractor
41
- ```
42
-
43
- ```shell
44
- pip install onnxruntime==1.21.0
47
+ pip install doc-page-extractor[cpu]
45
48
  ```
46
49
 
47
50
  ## Using CUDA
48
51
 
49
52
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
50
53
 
51
- In addition, replace the command to install `onnxruntime` in the previous article with the following:
54
+ The installation mentioned above uses the following command.
52
55
 
53
56
  ```shell
54
- pip install onnxruntime-gpu==1.21.0
57
+ pip install doc-page-extractor[cuda]
55
58
  ```
56
59
 
57
60
  ## Example
@@ -9,21 +9,17 @@ doc page extractor can identify text and format in images and return structured
9
9
  ## Installation
10
10
 
11
11
  ```shell
12
- pip install doc-page-extractor
13
- ```
14
-
15
- ```shell
16
- pip install onnxruntime==1.21.0
12
+ pip install doc-page-extractor[cpu]
17
13
  ```
18
14
 
19
15
  ## Using CUDA
20
16
 
21
17
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
22
18
 
23
- In addition, replace the command to install `onnxruntime` in the previous article with the following:
19
+ The installation mentioned above uses the following command.
24
20
 
25
21
  ```shell
26
- pip install onnxruntime-gpu==1.21.0
22
+ pip install doc-page-extractor[cuda]
27
23
  ```
28
24
 
29
25
  ## Example
@@ -13,12 +13,12 @@ def clip(
13
13
  wrapped_width: float = 0.0,
14
14
  wrapped_height: float = 0.0,
15
15
  ) -> Image:
16
- image: Image
16
+ image: Image | None
17
17
  if extracted_result.adjusted_image is None:
18
18
  image = extracted_result.extracted_image
19
19
  else:
20
20
  image = extracted_result.adjusted_image
21
-
21
+ assert image is not None, "Image must not be None"
22
22
  return clip_from_image(
23
23
  image, layout.rect,
24
24
  wrapped_width, wrapped_height,
@@ -91,7 +91,7 @@ def _size_and_wrapper(rect: Rectangle):
91
91
 
92
92
  return width, height, max_width, max_height
93
93
 
94
- def _to_pillow_matrix(matrix: np.array):
94
+ def _to_pillow_matrix(matrix):
95
95
  return (
96
96
  matrix[0][0], matrix[0][1], matrix[0][2],
97
97
  matrix[1][0], matrix[1][1], matrix[1][2],
@@ -1,7 +1,7 @@
1
1
  import torch
2
2
 
3
3
  from os import PathLike
4
- from typing import Literal, Generator
4
+ from typing import cast, Any, Literal, Generator
5
5
  from PIL.Image import Image
6
6
  from doclayout_yolo import YOLOv10
7
7
 
@@ -99,7 +99,7 @@ class DocExtractor:
99
99
  # about source parameter to see:
100
100
  # https://github.com/opendatalab/DocLayout-YOLO/blob/7c4be36bc61f11b67cf4a44ee47f3c41e9800a91/doclayout_yolo/data/build.py#L157-L175
101
101
  det_res = self._get_yolo().predict(
102
- source=source,
102
+ source=cast(Any, source),
103
103
  imgsz=1024,
104
104
  conf=0.2,
105
105
  device=self._device # Device to use (e.g., "cuda" or "cpu")
@@ -180,7 +180,7 @@ class DocExtractor:
180
180
 
181
181
  def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
182
182
  fragment_area = fragment.rect.area
183
- primary_layouts: list[(Layout, float)] = []
183
+ primary_layouts: list[tuple[Layout, float]] = []
184
184
 
185
185
  if fragment_area == 0.0:
186
186
  return None
@@ -88,7 +88,7 @@ class LayoutOrder:
88
88
  y_scale = layoutreader_size / float(height)
89
89
 
90
90
  for bbox in bbox_list:
91
- x0, y0, x1, y1 = self._squeeze(bbox.value, width, height)
91
+ x0, y0, x1, y1 = self._squeeze(bbox, width, height)
92
92
  x0 = round(x0 * x_scale)
93
93
  y0 = round(y0 * y_scale)
94
94
  x1 = round(x1 * x_scale)
@@ -223,8 +223,8 @@ class LayoutOrder:
223
223
  mid2 = sorted_numbers[n // 2]
224
224
  return float((mid1 + mid2) / 2)
225
225
 
226
- def _squeeze(self, bbox: _BBox, width: int, height: int) -> _BBox:
227
- x0, y0, x1, y1 = bbox
226
+ def _squeeze(self, bbox: _BBox, width: int, height: int) -> tuple[float, float, float, float]:
227
+ x0, y0, x1, y1 = bbox.value
228
228
  x0 = self._squeeze_value(x0, width)
229
229
  x1 = self._squeeze_value(x1, width)
230
230
  y0 = self._squeeze_value(y0, height)
@@ -64,7 +64,7 @@ class DataCollator:
64
64
  return ret
65
65
 
66
66
 
67
- def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
67
+ def boxes2inputs(boxes: List[List[float]]) -> Dict[str, torch.Tensor]:
68
68
  bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
69
69
  input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
70
70
  attention_mask = [1] + [1] * len(boxes) + [1]
@@ -1,26 +1,30 @@
1
1
  from os import PathLike
2
- from typing import runtime_checkable, Protocol
2
+ from time import sleep
3
+ from typing import cast, runtime_checkable, Protocol
3
4
  from pathlib import Path
4
5
  from threading import Lock
5
6
  from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
6
7
 
7
8
 
9
+ _RETRY_TIMES = 6
10
+ _RETRY_SLEEP = 3.5
11
+
8
12
  @runtime_checkable
9
13
  class Model(Protocol):
10
14
  def get_onnx_ocr_path(self) -> Path:
11
- pass
15
+ raise NotImplementedError()
12
16
 
13
17
  def get_yolo_path(self) -> Path:
14
- pass
18
+ raise NotImplementedError()
15
19
 
16
20
  def get_layoutreader_path(self) -> Path:
17
- pass
21
+ raise NotImplementedError()
18
22
 
19
23
  def get_struct_eqtable_path(self) -> Path:
20
- pass
24
+ raise NotImplementedError()
21
25
 
22
26
  def get_latex_path(self) -> Path:
23
- pass
27
+ raise NotImplementedError()
24
28
 
25
29
  class HuggingfaceModel(Model):
26
30
  def __init__(self, model_cache_dir: PathLike):
@@ -96,19 +100,34 @@ class HuggingfaceModel(Model):
96
100
  model_path = model_path.parent
97
101
 
98
102
  else:
99
- if is_snapshot:
100
- model_path = snapshot_download(
101
- cache_dir=self._model_cache_dir,
102
- repo_id=repo_id,
103
- repo_type=repo_type,
104
- )
105
- else:
106
- model_path = hf_hub_download(
107
- cache_dir=self._model_cache_dir,
108
- repo_id=repo_id,
109
- repo_type=repo_type,
110
- filename=filename,
111
- )
112
- model_path = Path(model_path)
103
+ # https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
104
+ latest_error: ConnectionError | None = None
105
+ for i in range(_RETRY_TIMES + 1):
106
+ if latest_error is not None:
107
+ print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
108
+ sleep(_RETRY_SLEEP)
109
+ try:
110
+ if is_snapshot:
111
+ model_path = snapshot_download(
112
+ cache_dir=self._model_cache_dir,
113
+ repo_id=repo_id,
114
+ repo_type=repo_type,
115
+ resume_download=True,
116
+ )
117
+ else:
118
+ model_path = hf_hub_download(
119
+ cache_dir=self._model_cache_dir,
120
+ repo_id=repo_id,
121
+ repo_type=repo_type,
122
+ filename=filename,
123
+ resume_download=True,
124
+ )
125
+ latest_error = None
126
+ except ConnectionError as err:
127
+ latest_error = err
128
+
129
+ if latest_error is not None:
130
+ raise latest_error
131
+ model_path = Path(cast(PathLike, model_path))
113
132
 
114
133
  return model_path
@@ -2,7 +2,7 @@ import numpy as np
2
2
  import cv2
3
3
  import os
4
4
 
5
- from typing import Literal, Generator
5
+ from typing import cast, Any, Iterable, Literal, Generator
6
6
  from dataclasses import dataclass
7
7
  from .onnxocr import TextSystem
8
8
  from .types import OCRFragment
@@ -80,7 +80,10 @@ class OCR:
80
80
  image = self._preprocess_image(image)
81
81
  dt_boxes, rec_res = text_system(image)
82
82
 
83
- for box, res in zip(dt_boxes, rec_res):
83
+ for box, res in zip(
84
+ cast(Iterable[Any], dt_boxes),
85
+ cast(Iterable[Any], rec_res),
86
+ ):
84
87
  yield box.tolist(), res
85
88
 
86
89
  def _get_text_system(self) -> TextSystem:
@@ -123,8 +126,8 @@ class OCR:
123
126
  model_paths.append(str(model_dir / file_name))
124
127
  return model_paths
125
128
 
126
- def _preprocess_image(self, image: np.ndarray) -> np.ndarray:
127
- image = self._alpha_to_color(image, (255, 255, 255))
129
+ def _preprocess_image(self, np_image: np.ndarray) -> np.ndarray:
130
+ image = self._alpha_to_color(np_image, (255, 255, 255))
128
131
  # image = cv2.bitwise_not(image) # inv
129
132
  # image = self._binarize_img(image) # bin
130
133
  image = cv2.normalize(
@@ -148,7 +151,7 @@ class OCR:
148
151
  image = gpu_frame.download()
149
152
  elif cv2.ocl.haveOpenCL():
150
153
  cv2.ocl.setUseOpenCL(True)
151
- gpu_frame = cv2.UMat(image)
154
+ gpu_frame = cv2.UMat(cast(Any, image))
152
155
  image = cv2.fastNlMeansDenoisingColored(
153
156
  src=gpu_frame,
154
157
  dst=None,
@@ -1,6 +1,6 @@
1
1
  import numpy as np
2
2
 
3
- from typing import Iterable
3
+ from typing import cast, Iterable
4
4
  from shapely.geometry import Polygon
5
5
  from PIL.Image import new, Image, Resampling
6
6
  from .types import Layout, OCRFragment
@@ -90,13 +90,13 @@ def _match_fragments(
90
90
  ) -> tuple[list[tuple[OCRFragment, OCRFragment]], list[OCRFragment]]:
91
91
 
92
92
  zone_polygon = Polygon(zone_rect)
93
- fragments2: list[OCRFragment] = list(fragments2)
93
+ fragments2 = list(fragments2)
94
94
  matched_fragments: list[tuple[OCRFragment, OCRFragment]] = []
95
95
  not_matched_fragments: list[OCRFragment] = []
96
96
 
97
97
  for fragment1 in fragments1:
98
98
  polygon1 = Polygon(fragment1.rect)
99
- polygon1 = zone_polygon.intersection(polygon1)
99
+ polygon1 = cast(Polygon, zone_polygon.intersection(polygon1))
100
100
  if polygon1.is_empty:
101
101
  continue
102
102
 
@@ -1,4 +1,4 @@
1
- from typing import Generator
1
+ from typing import cast, Generator
2
2
  from shapely.geometry import Polygon
3
3
  from .types import Layout, OCRFragment
4
4
  from .rectangle import Rectangle
@@ -92,7 +92,7 @@ def merge_fragments_as_line(origin_fragments: list[OCRFragment]) -> list[OCRFrag
92
92
  continue
93
93
 
94
94
  fragments.append(OCRFragment(
95
- order=min_order,
95
+ order=round(min_order),
96
96
  text=" ".join(texts),
97
97
  rank=text_rate_weights / proto_texts_len,
98
98
  rect=Rectangle(
@@ -141,7 +141,7 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
141
141
  # they are very sensitive to changes in height because they are very thin and long.
142
142
  # In order to make it equally sensitive to length and width, the ratio of area is not used.
143
143
  def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
144
- intersection: Polygon = polygon1.intersection(polygon2)
144
+ intersection = cast(Polygon, polygon1.intersection(polygon2))
145
145
  if intersection.is_empty:
146
146
  return 0.0
147
147
  else:
@@ -1,4 +1,4 @@
1
- from typing import Iterable
1
+ from typing import cast, Iterable
2
2
  from PIL import ImageDraw
3
3
  from PIL.ImageFont import load_default, FreeTypeFont
4
4
  from PIL.Image import Image
@@ -9,8 +9,8 @@ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
9
9
  _Color = tuple[int, int, int]
10
10
 
11
11
  def plot(image: Image, layouts: Iterable[Layout]) -> None:
12
- layout_font = load_default(size=35)
13
- fragment_font = load_default(size=25)
12
+ layout_font = cast(FreeTypeFont, load_default(size=35))
13
+ fragment_font = cast(FreeTypeFont, load_default(size=25))
14
14
  draw = ImageDraw.Draw(image, mode="RGBA")
15
15
 
16
16
  def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:
@@ -88,4 +88,6 @@ def _layout_color(layout: Layout) -> _Color:
88
88
  elif cls == LayoutClass.ISOLATE_FORMULA:
89
89
  return (0xFA, 0x38, 0x27) # Red
90
90
  elif cls == LayoutClass.FORMULA_CAPTION:
91
- return (0xFF, 0x9D, 0x24) # Orange
91
+ return (0xFF, 0x9D, 0x24) # Orange
92
+ else:
93
+ return (0x00, 0x00, 0x00)
@@ -3,7 +3,7 @@ import numpy as np
3
3
  from dataclasses import dataclass
4
4
  from PIL.Image import Image
5
5
  from math import pi
6
- from .types import OCRFragment, Layout
6
+ from .types import Layout, OCRFragment
7
7
  from .rotation import calculate_rotation, RotationAdjuster
8
8
  from .rectangle import Rectangle
9
9
 
@@ -48,7 +48,7 @@ class Table:
48
48
  from .struct_eqtable import build_model
49
49
  model_path = self._model.get_struct_eqtable_path()
50
50
  table_model = build_model(
51
- model_ckpt=model_path,
51
+ model_ckpt=str(model_path),
52
52
  max_new_tokens=1024,
53
53
  max_time=30,
54
54
  lmdeploy=False,
@@ -50,12 +50,12 @@ class PlainLayout(BaseLayout):
50
50
  @dataclass
51
51
  class TableLayout(BaseLayout):
52
52
  parsed: tuple[str, TableLayoutParsedFormat] | None
53
- cls: LayoutClass.TABLE
53
+ cls: Literal[LayoutClass.TABLE]
54
54
 
55
55
  @dataclass
56
56
  class FormulaLayout(BaseLayout):
57
57
  latex: str | None
58
- cls: LayoutClass.ISOLATE_FORMULA
58
+ cls: Literal[LayoutClass.ISOLATE_FORMULA]
59
59
 
60
60
  Layout = PlainLayout | TableLayout | FormulaLayout
61
61
 
@@ -12,7 +12,7 @@ def ensure_dir(path: str) -> str:
12
12
  return path
13
13
 
14
14
  def is_space_text(text: str) -> bool:
15
- return re.match(r"^\s*$", text)
15
+ return bool(re.match(r"^\s*$", text))
16
16
 
17
17
  def expand_image(image: Image, percent: float):
18
18
  width, height = image.size
@@ -0,0 +1,59 @@
1
+ [project]
2
+ name = "doc-page-extractor"
3
+ version = "0.2.4"
4
+ description = ""
5
+ authors = [
6
+ {name = "Tao Zeyu",email = "i@taozeyu.com"}
7
+ ]
8
+ maintainers = [
9
+ {name = "Tao Zeyu", email = "i@taozeyu.com"}
10
+ ]
11
+ license = {text = "AGPL-3.0"}
12
+ readme = "README.md"
13
+ requires-python = ">=3.10,<3.13"
14
+
15
+ [build-system]
16
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
17
+ build-backend = "poetry.core.masonry.api"
18
+
19
+ [tool.poetry]
20
+ license = "AGPL-3.0"
21
+ readme = "README.md"
22
+ repository = "https://github.com/moskize91/doc-page-extractor"
23
+ packages = [
24
+ {include = "doc_page_extractor" }
25
+ ]
26
+ include = ["doc_page_extractor/struct_eqtable/**/*.py"]
27
+ classifiers=[
28
+ "Development Status :: 2 - Pre-Alpha",
29
+ "Intended Audience :: Developers",
30
+ "License :: OSI Approved :: GNU Affero General Public License v3",
31
+ "Programming Language :: Python",
32
+ "Programming Language :: Python :: 3.10",
33
+ ]
34
+
35
+ [tool.poetry.dependencies]
36
+ opencv-python = ">=4.10.0,<5.0"
37
+ pillow = ">=10.3,<11.0"
38
+ pyclipper = ">=1.2.0,<2.0"
39
+ numpy = ">=1.24.0,<2.0"
40
+ shapely = ">=2.0.0,<3.0"
41
+ transformers = ">=4.42.4,<=4.47"
42
+ doclayout_yolo = ">=0.0.3"
43
+ pix2tex = ">=0.1.4,<=0.2.0"
44
+ accelerate = ">=1.6.0,<2.0"
45
+ huggingface_hub = ">=0.33.0,<1.0"
46
+
47
+ onnxruntime = { version = "1.21.0", optional = true }
48
+ onnxruntime-gpu = { version = "1.21.0", optional = true }
49
+
50
+ [tool.poetry.extras]
51
+ cpu = ["onnxruntime"]
52
+ cuda = ["onnxruntime-gpu"]
53
+
54
+ [tool.poetry.group.dev.dependencies]
55
+ pylint = "^3.3.7"
56
+
57
+ [tool.poetry.build]
58
+ generate-setup-file = false
59
+ script = "scripts/prebuild.py"
@@ -0,0 +1,8 @@
1
+ from subprocess import run
2
+ from pathlib import Path
3
+
4
+
5
+ def prebuild(setup_kwargs):
6
+ shell_path = Path(__file__).parent / "sync-struct-eqtable.sh"
7
+ run(["bash", str(shell_path)], check=True)
8
+ return setup_kwargs
@@ -1,85 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: doc-page-extractor
3
- Version: 0.2.2
4
- Summary: doc page extractor can identify text and format in images and return structured data.
5
- Home-page: https://github.com/Moskize91/doc-page-extractor
6
- Author: Tao Zeyu
7
- Author-email: i@taozeyu.com
8
- Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: opencv-python<5.0,>=4.10.0
11
- Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: numpy<2.0,>=1.24.0
14
- Requires-Dist: shapely<3.0,>=2.0.0
15
- Requires-Dist: transformers<=4.47,>=4.42.4
16
- Requires-Dist: doclayout_yolo>=0.0.3
17
- Requires-Dist: pix2tex<=0.2.0,>=0.1.4
18
- Requires-Dist: accelerate<2.0,>=1.6.0
19
- Requires-Dist: huggingface_hub<1.0,>=0.30.2
20
- Dynamic: author
21
- Dynamic: author-email
22
- Dynamic: description
23
- Dynamic: description-content-type
24
- Dynamic: home-page
25
- Dynamic: license-file
26
- Dynamic: requires-dist
27
- Dynamic: summary
28
-
29
- # doc page extractor
30
-
31
- English | [中文](./README_zh-CN.md)
32
-
33
- ## Introduction
34
-
35
- doc page extractor can identify text and format in images and return structured data.
36
-
37
- ## Installation
38
-
39
- ```shell
40
- pip install doc-page-extractor
41
- ```
42
-
43
- ```shell
44
- pip install onnxruntime==1.21.0
45
- ```
46
-
47
- ## Using CUDA
48
-
49
- Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
50
-
51
- In addition, replace the command to install `onnxruntime` in the previous article with the following:
52
-
53
- ```shell
54
- pip install onnxruntime-gpu==1.21.0
55
- ```
56
-
57
- ## Example
58
-
59
- ```python
60
- from PIL import Image
61
- from doc_page_extractor import DocExtractor
62
-
63
- extractor = DocExtractor(
64
- model_dir_path=model_path, # Folder address where AI model is downloaded and installed
65
- device="cpu", # If you want to use CUDA, please change to device="cuda".
66
- )
67
- with Image.open("/path/to/your/image.png") as image:
68
- result = extractor.extract(
69
- image=image,
70
- lang="ch", # Language of image text
71
- )
72
- for layout in result.layouts:
73
- for fragment in layout.fragments:
74
- print(fragment.rect, fragment.text)
75
- ```
76
-
77
- ## Acknowledgements
78
-
79
- The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
80
-
81
- - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
82
- - [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
83
- - [layoutreader](https://github.com/ppaanngggg/layoutreader)
84
- - [StructEqTable](https://github.com/Alpha-Innovator/StructEqTable-Deploy)
85
- - [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
@@ -1,48 +0,0 @@
1
- LICENSE
2
- README.md
3
- setup.py
4
- doc_page_extractor/__init__.py
5
- doc_page_extractor/clipper.py
6
- doc_page_extractor/downloader.py
7
- doc_page_extractor/extractor.py
8
- doc_page_extractor/latex.py
9
- doc_page_extractor/layout_order.py
10
- doc_page_extractor/layoutreader.py
11
- doc_page_extractor/model.py
12
- doc_page_extractor/ocr.py
13
- doc_page_extractor/ocr_corrector.py
14
- doc_page_extractor/overlap.py
15
- doc_page_extractor/plot.py
16
- doc_page_extractor/raw_optimizer.py
17
- doc_page_extractor/rectangle.py
18
- doc_page_extractor/rotation.py
19
- doc_page_extractor/table.py
20
- doc_page_extractor/types.py
21
- doc_page_extractor/utils.py
22
- doc_page_extractor.egg-info/PKG-INFO
23
- doc_page_extractor.egg-info/SOURCES.txt
24
- doc_page_extractor.egg-info/dependency_links.txt
25
- doc_page_extractor.egg-info/requires.txt
26
- doc_page_extractor.egg-info/top_level.txt
27
- doc_page_extractor/onnxocr/__init__.py
28
- doc_page_extractor/onnxocr/cls_postprocess.py
29
- doc_page_extractor/onnxocr/db_postprocess.py
30
- doc_page_extractor/onnxocr/imaug.py
31
- doc_page_extractor/onnxocr/operators.py
32
- doc_page_extractor/onnxocr/predict_base.py
33
- doc_page_extractor/onnxocr/predict_cls.py
34
- doc_page_extractor/onnxocr/predict_det.py
35
- doc_page_extractor/onnxocr/predict_rec.py
36
- doc_page_extractor/onnxocr/predict_system.py
37
- doc_page_extractor/onnxocr/rec_postprocess.py
38
- doc_page_extractor/onnxocr/utils.py
39
- doc_page_extractor/struct_eqtable/__init__.py
40
- doc_page_extractor/struct_eqtable/internvl/__init__.py
41
- doc_page_extractor/struct_eqtable/internvl/conversation.py
42
- doc_page_extractor/struct_eqtable/internvl/internvl.py
43
- doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py
44
- doc_page_extractor/struct_eqtable/pix2s/__init__.py
45
- doc_page_extractor/struct_eqtable/pix2s/pix2s.py
46
- doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py
47
- tests/__init__.py
48
- tests/test_history_bus.py
@@ -1,10 +0,0 @@
1
- opencv-python<5.0,>=4.10.0
2
- pillow<11.0,>=10.3
3
- pyclipper<2.0,>=1.2.0
4
- numpy<2.0,>=1.24.0
5
- shapely<3.0,>=2.0.0
6
- transformers<=4.47,>=4.42.4
7
- doclayout_yolo>=0.0.3
8
- pix2tex<=0.2.0,>=0.1.4
9
- accelerate<2.0,>=1.6.0
10
- huggingface_hub<1.0,>=0.30.2
@@ -1,2 +0,0 @@
1
- doc_page_extractor
2
- tests
@@ -1,4 +0,0 @@
1
- [egg_info]
2
- tag_build =
3
- tag_date = 0
4
-
@@ -1,28 +0,0 @@
1
- from setuptools import setup, find_packages
2
-
3
- if "doc_page_extractor.struct_eqtable" not in find_packages():
4
- raise RuntimeError("struct_eqtable not found. Please download struct_eqtable first.")
5
-
6
- setup(
7
- name="doc-page-extractor",
8
- version="0.2.2",
9
- author="Tao Zeyu",
10
- author_email="i@taozeyu.com",
11
- url="https://github.com/Moskize91/doc-page-extractor",
12
- description="doc page extractor can identify text and format in images and return structured data.",
13
- packages=find_packages(),
14
- long_description=open("./README.md", encoding="utf8").read(),
15
- long_description_content_type="text/markdown",
16
- install_requires=[
17
- "opencv-python>=4.10.0,<5.0",
18
- "pillow>=10.3,<11.0",
19
- "pyclipper>=1.2.0,<2.0",
20
- "numpy>=1.24.0,<2.0",
21
- "shapely>=2.0.0,<3.0",
22
- "transformers>=4.42.4,<=4.47",
23
- "doclayout_yolo>=0.0.3",
24
- "pix2tex>=0.1.4,<=0.2.0",
25
- "accelerate>=1.6.0,<2.0",
26
- "huggingface_hub>=0.30.2,<1.0",
27
- ],
28
- )
File without changes
@@ -1,55 +0,0 @@
1
- import os
2
- import unittest
3
-
4
- from PIL import Image
5
- from doc_page_extractor import DocExtractor, Layout, LayoutClass
6
-
7
-
8
- class TestGroup(unittest.TestCase):
9
- def test_history_bugs(self):
10
- model_path = os.path.join(self._project_path(), "model")
11
- image_path = os.path.join(self._project_path(), "tests", "images", "figure.png")
12
- os.makedirs(model_path, exist_ok=True)
13
-
14
- extractor = DocExtractor(model_path, "cpu")
15
- layouts: list[tuple[LayoutClass, list[str]]]
16
-
17
- with Image.open(image_path) as image:
18
- result = extractor.extract(image, extract_formula=False)
19
- layouts = [self._format_Layout(layout) for layout in result.layouts]
20
-
21
- self.assertEqual(layouts, [
22
- (LayoutClass.PLAIN_TEXT, [
23
- "口的11.8%①。这既是江南农业落后的反映,又是它的原因。当战国以",
24
- "后黄河流域因铁器牛耕的普及获得基本的开发,农区联结成一大片的",
25
- "时候,南方农业开发始终没有突破星点状或斑块状分布的格局。由于",
26
- "地旷人稀,耕作相当粗放,许多水田采取火耕水瓣的方式,旱田则多",
27
- "行刀耕火种②。司马迁在《史记·货殖列传》中说:“总之,楚越之",
28
- "地,地厂人希,饭稻囊鱼,或火耕而水瓣,果隋(蕨)赢(螺)蛤,",
29
- "不待贾而足,地势饶食,无饥谨之患,以故皆偷生,无积聚而多",
30
- "贫。”这种概括虽然未免太突出了南方经济的落后面,有一定片面性,",
31
- "但大体还是反映了实际情形的。战国秦汉时期,南方与黄河流域农业",
32
- "的差距显然拉大了。",
33
- ]),
34
- (LayoutClass.FIGURE, []),
35
- (LayoutClass.FIGURE_CAPTION, [
36
- "西晋陶水田犁耙模型(广东连县出土)"
37
- ]),
38
- (LayoutClass.FIGURE, []),
39
- (LayoutClass.FIGURE_CAPTION, [
40
- "南朝陶耙田模型 (广西苍梧倒水出土)"
41
- ]),
42
- (LayoutClass.PLAIN_TEXT, [
43
- "①据赵文林、谢淑君:《中国人口史》(人民出版社1988年)有关资料统计。",
44
- "②《盐铁论·通有》:“荆扬…………伐木而树谷,焚莱而播粟,火耕而水。”"
45
- ]),
46
- (LayoutClass.ABANDON, [
47
- "136"
48
- ]),
49
- ])
50
-
51
- def _format_Layout(self, layout: Layout) -> tuple[LayoutClass, list[str]]:
52
- return layout.cls, [f.text.strip() for f in layout.fragments]
53
-
54
- def _project_path(self) -> str:
55
- return os.path.abspath(os.path.join(__file__, "..", ".."))