doc-page-extractor 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (43) hide show
  1. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/PKG-INFO +8 -8
  2. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/README.md +3 -7
  3. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/clipper.py +3 -3
  4. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/extractor.py +3 -3
  5. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/layout_order.py +3 -3
  6. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/layoutreader.py +1 -1
  7. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/model.py +7 -7
  8. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr.py +8 -5
  9. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr_corrector.py +3 -3
  10. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/overlap.py +3 -3
  11. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/plot.py +6 -4
  12. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/raw_optimizer.py +1 -1
  13. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/__init__.py +49 -0
  14. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/internvl/__init__.py +2 -0
  15. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/internvl/conversation.py +394 -0
  16. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/internvl/internvl.py +198 -0
  17. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +81 -0
  18. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/pix2s/__init__.py +3 -0
  19. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +76 -0
  20. doc_page_extractor-0.2.4/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +1047 -0
  21. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/table.py +1 -1
  22. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/types.py +2 -2
  23. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/utils.py +1 -1
  24. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/pyproject.toml +26 -15
  25. doc_page_extractor-0.2.4/scripts/prebuild.py +8 -0
  26. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/LICENSE +0 -0
  27. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/__init__.py +0 -0
  28. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/downloader.py +0 -0
  29. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/latex.py +0 -0
  30. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/__init__.py +0 -0
  31. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  32. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  33. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/imaug.py +0 -0
  34. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/operators.py +0 -0
  35. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  36. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  37. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  38. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  39. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  40. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  41. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/utils.py +0 -0
  42. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/rectangle.py +0 -0
  43. {doc_page_extractor-0.2.3 → doc_page_extractor-0.2.4}/doc_page_extractor/rotation.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: doc-page-extractor
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary:
5
5
  License: AGPL-3.0
6
6
  Author: Tao Zeyu
@@ -16,10 +16,14 @@ Classifier: Programming Language :: Python :: 3
16
16
  Classifier: Programming Language :: Python :: 3.10
17
17
  Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Programming Language :: Python :: 3.12
19
+ Provides-Extra: cpu
20
+ Provides-Extra: cuda
19
21
  Requires-Dist: accelerate (>=1.6.0,<2.0)
20
22
  Requires-Dist: doclayout_yolo (>=0.0.3)
21
23
  Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
22
24
  Requires-Dist: numpy (>=1.24.0,<2.0)
25
+ Requires-Dist: onnxruntime (==1.21.0) ; extra == "cpu"
26
+ Requires-Dist: onnxruntime-gpu (==1.21.0) ; extra == "cuda"
23
27
  Requires-Dist: opencv-python (>=4.10.0,<5.0)
24
28
  Requires-Dist: pillow (>=10.3,<11.0)
25
29
  Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
@@ -40,21 +44,17 @@ doc page extractor can identify text and format in images and return structured
40
44
  ## Installation
41
45
 
42
46
  ```shell
43
- pip install doc-page-extractor
44
- ```
45
-
46
- ```shell
47
- pip install onnxruntime==1.21.0
47
+ pip install doc-page-extractor[cpu]
48
48
  ```
49
49
 
50
50
  ## Using CUDA
51
51
 
52
52
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
53
53
 
54
- In addition, replace the command to install `onnxruntime` in the previous article with the following:
54
+ To use CUDA, install the package with the `cuda` extra instead of the `cpu` extra:
55
55
 
56
56
  ```shell
57
- pip install onnxruntime-gpu==1.21.0
57
+ pip install doc-page-extractor[cuda]
58
58
  ```
59
59
 
60
60
  ## Example
@@ -9,21 +9,17 @@ doc page extractor can identify text and format in images and return structured
9
9
  ## Installation
10
10
 
11
11
  ```shell
12
- pip install doc-page-extractor
13
- ```
14
-
15
- ```shell
16
- pip install onnxruntime==1.21.0
12
+ pip install doc-page-extractor[cpu]
17
13
  ```
18
14
 
19
15
  ## Using CUDA
20
16
 
21
17
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
22
18
 
23
- In addition, replace the command to install `onnxruntime` in the previous article with the following:
19
+ To use CUDA, install the package with the `cuda` extra instead of the `cpu` extra:
24
20
 
25
21
  ```shell
26
- pip install onnxruntime-gpu==1.21.0
22
+ pip install doc-page-extractor[cuda]
27
23
  ```
28
24
 
29
25
  ## Example
@@ -13,12 +13,12 @@ def clip(
13
13
  wrapped_width: float = 0.0,
14
14
  wrapped_height: float = 0.0,
15
15
  ) -> Image:
16
- image: Image
16
+ image: Image | None
17
17
  if extracted_result.adjusted_image is None:
18
18
  image = extracted_result.extracted_image
19
19
  else:
20
20
  image = extracted_result.adjusted_image
21
-
21
+ assert image is not None, "Image must not be None"
22
22
  return clip_from_image(
23
23
  image, layout.rect,
24
24
  wrapped_width, wrapped_height,
@@ -91,7 +91,7 @@ def _size_and_wrapper(rect: Rectangle):
91
91
 
92
92
  return width, height, max_width, max_height
93
93
 
94
- def _to_pillow_matrix(matrix: np.array):
94
+ def _to_pillow_matrix(matrix):
95
95
  return (
96
96
  matrix[0][0], matrix[0][1], matrix[0][2],
97
97
  matrix[1][0], matrix[1][1], matrix[1][2],
@@ -1,7 +1,7 @@
1
1
  import torch
2
2
 
3
3
  from os import PathLike
4
- from typing import Literal, Generator
4
+ from typing import cast, Any, Literal, Generator
5
5
  from PIL.Image import Image
6
6
  from doclayout_yolo import YOLOv10
7
7
 
@@ -99,7 +99,7 @@ class DocExtractor:
99
99
  # about source parameter to see:
100
100
  # https://github.com/opendatalab/DocLayout-YOLO/blob/7c4be36bc61f11b67cf4a44ee47f3c41e9800a91/doclayout_yolo/data/build.py#L157-L175
101
101
  det_res = self._get_yolo().predict(
102
- source=source,
102
+ source=cast(Any, source),
103
103
  imgsz=1024,
104
104
  conf=0.2,
105
105
  device=self._device # Device to use (e.g., "cuda" or "cpu")
@@ -180,7 +180,7 @@ class DocExtractor:
180
180
 
181
181
  def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
182
182
  fragment_area = fragment.rect.area
183
- primary_layouts: list[(Layout, float)] = []
183
+ primary_layouts: list[tuple[Layout, float]] = []
184
184
 
185
185
  if fragment_area == 0.0:
186
186
  return None
@@ -88,7 +88,7 @@ class LayoutOrder:
88
88
  y_scale = layoutreader_size / float(height)
89
89
 
90
90
  for bbox in bbox_list:
91
- x0, y0, x1, y1 = self._squeeze(bbox.value, width, height)
91
+ x0, y0, x1, y1 = self._squeeze(bbox, width, height)
92
92
  x0 = round(x0 * x_scale)
93
93
  y0 = round(y0 * y_scale)
94
94
  x1 = round(x1 * x_scale)
@@ -223,8 +223,8 @@ class LayoutOrder:
223
223
  mid2 = sorted_numbers[n // 2]
224
224
  return float((mid1 + mid2) / 2)
225
225
 
226
- def _squeeze(self, bbox: _BBox, width: int, height: int) -> _BBox:
227
- x0, y0, x1, y1 = bbox
226
+ def _squeeze(self, bbox: _BBox, width: int, height: int) -> tuple[float, float, float, float]:
227
+ x0, y0, x1, y1 = bbox.value
228
228
  x0 = self._squeeze_value(x0, width)
229
229
  x1 = self._squeeze_value(x1, width)
230
230
  y0 = self._squeeze_value(y0, height)
@@ -64,7 +64,7 @@ class DataCollator:
64
64
  return ret
65
65
 
66
66
 
67
- def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
67
+ def boxes2inputs(boxes: List[List[float]]) -> Dict[str, torch.Tensor]:
68
68
  bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
69
69
  input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
70
70
  attention_mask = [1] + [1] * len(boxes) + [1]
@@ -1,6 +1,6 @@
1
1
  from os import PathLike
2
2
  from time import sleep
3
- from typing import runtime_checkable, Protocol
3
+ from typing import cast, runtime_checkable, Protocol
4
4
  from pathlib import Path
5
5
  from threading import Lock
6
6
  from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
@@ -12,19 +12,19 @@ _RETRY_SLEEP = 3.5
12
12
  @runtime_checkable
13
13
  class Model(Protocol):
14
14
  def get_onnx_ocr_path(self) -> Path:
15
- pass
15
+ raise NotImplementedError()
16
16
 
17
17
  def get_yolo_path(self) -> Path:
18
- pass
18
+ raise NotImplementedError()
19
19
 
20
20
  def get_layoutreader_path(self) -> Path:
21
- pass
21
+ raise NotImplementedError()
22
22
 
23
23
  def get_struct_eqtable_path(self) -> Path:
24
- pass
24
+ raise NotImplementedError()
25
25
 
26
26
  def get_latex_path(self) -> Path:
27
- pass
27
+ raise NotImplementedError()
28
28
 
29
29
  class HuggingfaceModel(Model):
30
30
  def __init__(self, model_cache_dir: PathLike):
@@ -128,6 +128,6 @@ class HuggingfaceModel(Model):
128
128
 
129
129
  if latest_error is not None:
130
130
  raise latest_error
131
- model_path = Path(model_path)
131
+ model_path = Path(cast(PathLike, model_path))
132
132
 
133
133
  return model_path
@@ -2,7 +2,7 @@ import numpy as np
2
2
  import cv2
3
3
  import os
4
4
 
5
- from typing import Literal, Generator
5
+ from typing import cast, Any, Iterable, Literal, Generator
6
6
  from dataclasses import dataclass
7
7
  from .onnxocr import TextSystem
8
8
  from .types import OCRFragment
@@ -80,7 +80,10 @@ class OCR:
80
80
  image = self._preprocess_image(image)
81
81
  dt_boxes, rec_res = text_system(image)
82
82
 
83
- for box, res in zip(dt_boxes, rec_res):
83
+ for box, res in zip(
84
+ cast(Iterable[Any], dt_boxes),
85
+ cast(Iterable[Any], rec_res),
86
+ ):
84
87
  yield box.tolist(), res
85
88
 
86
89
  def _get_text_system(self) -> TextSystem:
@@ -123,8 +126,8 @@ class OCR:
123
126
  model_paths.append(str(model_dir / file_name))
124
127
  return model_paths
125
128
 
126
- def _preprocess_image(self, image: np.ndarray) -> np.ndarray:
127
- image = self._alpha_to_color(image, (255, 255, 255))
129
+ def _preprocess_image(self, np_image: np.ndarray) -> np.ndarray:
130
+ image = self._alpha_to_color(np_image, (255, 255, 255))
128
131
  # image = cv2.bitwise_not(image) # inv
129
132
  # image = self._binarize_img(image) # bin
130
133
  image = cv2.normalize(
@@ -148,7 +151,7 @@ class OCR:
148
151
  image = gpu_frame.download()
149
152
  elif cv2.ocl.haveOpenCL():
150
153
  cv2.ocl.setUseOpenCL(True)
151
- gpu_frame = cv2.UMat(image)
154
+ gpu_frame = cv2.UMat(cast(Any, image))
152
155
  image = cv2.fastNlMeansDenoisingColored(
153
156
  src=gpu_frame,
154
157
  dst=None,
@@ -1,6 +1,6 @@
1
1
  import numpy as np
2
2
 
3
- from typing import Iterable
3
+ from typing import cast, Iterable
4
4
  from shapely.geometry import Polygon
5
5
  from PIL.Image import new, Image, Resampling
6
6
  from .types import Layout, OCRFragment
@@ -90,13 +90,13 @@ def _match_fragments(
90
90
  ) -> tuple[list[tuple[OCRFragment, OCRFragment]], list[OCRFragment]]:
91
91
 
92
92
  zone_polygon = Polygon(zone_rect)
93
- fragments2: list[OCRFragment] = list(fragments2)
93
+ fragments2 = list(fragments2)
94
94
  matched_fragments: list[tuple[OCRFragment, OCRFragment]] = []
95
95
  not_matched_fragments: list[OCRFragment] = []
96
96
 
97
97
  for fragment1 in fragments1:
98
98
  polygon1 = Polygon(fragment1.rect)
99
- polygon1 = zone_polygon.intersection(polygon1)
99
+ polygon1 = cast(Polygon, zone_polygon.intersection(polygon1))
100
100
  if polygon1.is_empty:
101
101
  continue
102
102
 
@@ -1,4 +1,4 @@
1
- from typing import Generator
1
+ from typing import cast, Generator
2
2
  from shapely.geometry import Polygon
3
3
  from .types import Layout, OCRFragment
4
4
  from .rectangle import Rectangle
@@ -92,7 +92,7 @@ def merge_fragments_as_line(origin_fragments: list[OCRFragment]) -> list[OCRFrag
92
92
  continue
93
93
 
94
94
  fragments.append(OCRFragment(
95
- order=min_order,
95
+ order=round(min_order),
96
96
  text=" ".join(texts),
97
97
  rank=text_rate_weights / proto_texts_len,
98
98
  rect=Rectangle(
@@ -141,7 +141,7 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
141
141
  # they are very sensitive to changes in height because they are very thin and long.
142
142
  # In order to make it equally sensitive to length and width, the ratio of area is not used.
143
143
  def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
144
- intersection: Polygon = polygon1.intersection(polygon2)
144
+ intersection = cast(Polygon, polygon1.intersection(polygon2))
145
145
  if intersection.is_empty:
146
146
  return 0.0
147
147
  else:
@@ -1,4 +1,4 @@
1
- from typing import Iterable
1
+ from typing import cast, Iterable
2
2
  from PIL import ImageDraw
3
3
  from PIL.ImageFont import load_default, FreeTypeFont
4
4
  from PIL.Image import Image
@@ -9,8 +9,8 @@ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
9
9
  _Color = tuple[int, int, int]
10
10
 
11
11
  def plot(image: Image, layouts: Iterable[Layout]) -> None:
12
- layout_font = load_default(size=35)
13
- fragment_font = load_default(size=25)
12
+ layout_font = cast(FreeTypeFont, load_default(size=35))
13
+ fragment_font = cast(FreeTypeFont, load_default(size=25))
14
14
  draw = ImageDraw.Draw(image, mode="RGBA")
15
15
 
16
16
  def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:
@@ -88,4 +88,6 @@ def _layout_color(layout: Layout) -> _Color:
88
88
  elif cls == LayoutClass.ISOLATE_FORMULA:
89
89
  return (0xFA, 0x38, 0x27) # Red
90
90
  elif cls == LayoutClass.FORMULA_CAPTION:
91
- return (0xFF, 0x9D, 0x24) # Orange
91
+ return (0xFF, 0x9D, 0x24) # Orange
92
+ else:
93
+ return (0x00, 0x00, 0x00)
@@ -3,7 +3,7 @@ import numpy as np
3
3
  from dataclasses import dataclass
4
4
  from PIL.Image import Image
5
5
  from math import pi
6
- from .types import OCRFragment, Layout
6
+ from .types import Layout, OCRFragment
7
7
  from .rotation import calculate_rotation, RotationAdjuster
8
8
  from .rectangle import Rectangle
9
9
 
@@ -0,0 +1,49 @@
1
+ from .pix2s import Pix2Struct, Pix2StructTensorRT
2
+ from .internvl import InternVL, InternVL_LMDeploy
3
+
4
+ from transformers import AutoConfig
5
+
6
+
7
# Registry that maps a model name to the wrapper class implementing it.
# `build_model` looks classes up here by name.
__ALL_MODELS__ = dict(
    Pix2Struct=Pix2Struct,
    Pix2StructTensorRT=Pix2StructTensorRT,
    InternVL=InternVL,
    InternVL_LMDeploy=InternVL_LMDeploy,
)
13
+
14
+
15
def get_model_name(model_path):
    """Return the model family name for the checkpoint at ``model_path``.

    The checkpoint configuration is loaded with ``AutoConfig.from_pretrained``
    and the first entry of its ``architectures`` list is inspected.

    Args:
        model_path: Checkpoint identifier or path, passed unchanged to
            ``AutoConfig.from_pretrained``.

    Returns:
        ``'Pix2Struct'`` or ``'InternVL'``.

    Raises:
        ValueError: If the configuration declares no architecture, or if the
            first declared architecture contains neither ``'Pix2Struct'`` nor
            ``'InternVL'``.
    """
    # NOTE(review): trust_remote_code=True presumably allows code shipped with
    # the checkpoint to run while loading -- confirm that only trusted
    # checkpoints reach this function.
    model_config = AutoConfig.from_pretrained(
        model_path,
        trust_remote_code=True,
    )

    # `architectures` can be missing, None or empty; report that clearly
    # instead of failing with a TypeError/IndexError on `architectures[0]`.
    architectures = getattr(model_config, 'architectures', None)
    if not architectures:
        raise ValueError(
            f"Model config for {model_path!r} does not declare any architecture"
        )

    architecture = architectures[0]
    # Checked in the same order as before: Pix2Struct first, then InternVL.
    for model_name in ('Pix2Struct', 'InternVL'):
        if model_name in architecture:
            return model_name

    raise ValueError(f"Unsupported model type: {architecture}")
29
+
30
+
31
def build_model(
    model_ckpt='U4R/StructTable-InternVL2-1B',
    cache_dir=None,
    local_files_only=None,
    **kwargs,
):
    """Instantiate the model wrapper that matches ``model_ckpt``.

    The model family is detected with ``get_model_name``. An InternVL
    checkpoint uses the ``'InternVL_LMDeploy'`` entry when
    ``kwargs['lmdeploy']`` is truthy, and a Pix2Struct checkpoint uses the
    ``'Pix2StructTensorRT'`` entry when ``kwargs['tensorrt_path']`` is truthy.

    Args:
        model_ckpt: Checkpoint identifier or path; also passed to the wrapper
            as its first positional argument.
        cache_dir: Forwarded to the wrapper as ``cache_dir``.
        local_files_only: Forwarded to the wrapper as ``local_files_only``.
        **kwargs: Forwarded unchanged to the wrapper; ``lmdeploy`` and
            ``tensorrt_path`` are additionally read here to pick the variant.

    Returns:
        An instance of the class registered in ``__ALL_MODELS__`` under the
        selected name.
    """
    registry_key = get_model_name(model_ckpt)

    # Switch to the accelerated variant only when the caller asked for it.
    if registry_key == 'InternVL':
        if kwargs.get('lmdeploy', False):
            registry_key = 'InternVL_LMDeploy'
    elif registry_key == 'Pix2Struct':
        if kwargs.get('tensorrt_path', None):
            registry_key = 'Pix2StructTensorRT'

    model_cls = __ALL_MODELS__[registry_key]
    return model_cls(
        model_ckpt,
        cache_dir=cache_dir,
        local_files_only=local_files_only,
        **kwargs,
    )
@@ -0,0 +1,2 @@
1
+ from .internvl import InternVL
2
+ from .internvl_lmdeploy import InternVL_LMDeploy