doc-page-extractor 0.2.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc_page_extractor/__init__.py +5 -15
- doc_page_extractor/check_env.py +40 -0
- doc_page_extractor/extractor.py +88 -215
- doc_page_extractor/model.py +97 -0
- doc_page_extractor/parser.py +51 -0
- doc_page_extractor/plot.py +52 -79
- doc_page_extractor/redacter.py +111 -0
- doc_page_extractor-1.0.2.dist-info/METADATA +120 -0
- doc_page_extractor-1.0.2.dist-info/RECORD +11 -0
- {doc_page_extractor-0.2.0.dist-info → doc_page_extractor-1.0.2.dist-info}/WHEEL +1 -2
- doc_page_extractor-1.0.2.dist-info/licenses/LICENSE +21 -0
- doc_page_extractor/clipper.py +0 -119
- doc_page_extractor/downloader.py +0 -16
- doc_page_extractor/latex.py +0 -31
- doc_page_extractor/layout_order.py +0 -237
- doc_page_extractor/layoutreader.py +0 -126
- doc_page_extractor/models.py +0 -92
- doc_page_extractor/ocr.py +0 -200
- doc_page_extractor/ocr_corrector.py +0 -126
- doc_page_extractor/onnxocr/__init__.py +0 -1
- doc_page_extractor/onnxocr/cls_postprocess.py +0 -26
- doc_page_extractor/onnxocr/db_postprocess.py +0 -246
- doc_page_extractor/onnxocr/imaug.py +0 -32
- doc_page_extractor/onnxocr/operators.py +0 -187
- doc_page_extractor/onnxocr/predict_base.py +0 -57
- doc_page_extractor/onnxocr/predict_cls.py +0 -109
- doc_page_extractor/onnxocr/predict_det.py +0 -139
- doc_page_extractor/onnxocr/predict_rec.py +0 -344
- doc_page_extractor/onnxocr/predict_system.py +0 -97
- doc_page_extractor/onnxocr/rec_postprocess.py +0 -896
- doc_page_extractor/onnxocr/utils.py +0 -71
- doc_page_extractor/overlap.py +0 -167
- doc_page_extractor/raw_optimizer.py +0 -104
- doc_page_extractor/rectangle.py +0 -72
- doc_page_extractor/rotation.py +0 -158
- doc_page_extractor/struct_eqtable/__init__.py +0 -49
- doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
- doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
- doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
- doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
- doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
- doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
- doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
- doc_page_extractor/table.py +0 -70
- doc_page_extractor/types.py +0 -91
- doc_page_extractor/utils.py +0 -32
- doc_page_extractor-0.2.0.dist-info/METADATA +0 -85
- doc_page_extractor-0.2.0.dist-info/RECORD +0 -45
- doc_page_extractor-0.2.0.dist-info/licenses/LICENSE +0 -661
- doc_page_extractor-0.2.0.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_history_bus.py +0 -55
doc_page_extractor/redacter.py
ADDED
@@ -0,0 +1,111 @@
+from typing import Any, Generator, Iterable, cast
+
+from PIL import Image, ImageDraw
+
+
+def redact(
+  image: Image.Image,
+  fill_color: tuple[int, int, int],
+  rectangles: Iterable[tuple[int, int, int, int]],
+) -> Image.Image:
+  draw = ImageDraw.Draw(image)
+  for x1, y1, x2, y2 in rectangles:
+    draw.rectangle((x1, y1, x2, y2), fill=fill_color)
+  return image
+
+
+class _AveragingColor:
+  def __init__(self) -> None:
+    self._r: float = 0.0
+    self._g: float = 0.0
+    self._b: float = 0.0
+    self._a: float = 0.0
+    self._count: int = 0
+
+  @property
+  def count(self) -> int:
+    return self._count
+
+  @property
+  def average(self) -> tuple[float, float, float, float]:
+    if self._count == 0:
+      return 1.0, 1.0, 1.0, 1.0
+    return (
+      self._r / self._count,
+      self._g / self._count,
+      self._b / self._count,
+      self._a / self._count,
+    )
+
+  def add_color(self, r: float, g: float, b: float, a: float) -> None:
+    self._r += r
+    self._g += g
+    self._b += b
+    self._a += a
+    self._count += 1
+
+
+def background_color(image: Image.Image) -> tuple[int, int, int]:
+  """Sort pixel colors by gray value and take the median color. This color matches the paper color and can be used as the background color."""
+  pixels_count = image.width * image.height
+  if pixels_count == 0:
+    return 255, 255, 255
+
+  bucket: list[_AveragingColor | None] = [None] * 256
+  for r, g, b, a in _iter_pixels(image):
+    gray = round(255 * _gray(r, g, b, a))
+    colors = bucket[gray]
+    if colors is None:
+      colors = _AveragingColor()
+      bucket[gray] = colors
+    colors.add_color(r, g, b, a)
+
+  offset: int = 0
+  found_colors: _AveragingColor | None = None
+
+  for colors in bucket:
+    if not colors:
+      continue
+    offset += colors.count
+    if offset > pixels_count // 2:
+      found_colors = colors
+      break
+
+  assert found_colors is not None
+  r, g, b, a = found_colors.average
+
+  # the background is white
+  r = r * a + 1.0 * (1.0 - a)
+  g = g * a + 1.0 * (1.0 - a)
+  b = b * a + 1.0 * (1.0 - a)
+
+  return round(r * 255), round(g * 255), round(b * 255)
+
+
+def _gray(r: float, g: float, b: float, a: float) -> float:
+  # ITU-R BT.601 https://en.wikipedia.org/wiki/Rec._601
+  gray = 0.299 * r + 0.587 * g + 0.114 * b
+  return gray * a
+
+
+def _iter_pixels(
+  image: Image.Image,
+) -> Generator[tuple[float, float, float, float], None, None]:
+  for pixel in cast(Any, image.getdata()):
+    pixel_len = len(cast(tuple, pixel)) if isinstance(pixel, tuple) else 1
+    if pixel_len == 4:
+      # RGBA format
+      r, g, b, a = cast(tuple[int, int, int, int], pixel)
+    elif pixel_len == 3:
+      # RGB format
+      r, g, b = cast(tuple[int, int, int], pixel)
+      a = 255
+    elif pixel_len == 2:
+      # LA format (grayscale + alpha)
+      l, a = cast(tuple[int, int], pixel)
+      r = g = b = l
+    else:
+      # L format (grayscale)
+      r = g = b = cast(int, pixel)
+      a = 255
+    yield (r / 255.0, g / 255.0, b / 255.0, a / 255.0)
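The new redacter module detects the page's paper color by bucketing pixels on their BT.601 gray value, taking the median bucket's average, and compositing it over white, then paints arbitrary boxes in a chosen fill color. A minimal usage sketch of the two public functions shown above (the file name and rectangle coordinates are placeholders, not part of either release):

```python
# Hypothetical usage of doc_page_extractor/redacter.py from the hunk above.
from PIL import Image
from doc_page_extractor.redacter import background_color, redact

image = Image.open("page.png").convert("RGB")
paper = background_color(image)          # median-gray paper color as (r, g, b)
redacted = redact(
  image,
  fill_color=paper,
  rectangles=[(40, 60, 320, 120)],       # (x1, y1, x2, y2) boxes to blank out
)
redacted.save("page_redacted.png")
```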
doc_page_extractor-1.0.2.dist-info/METADATA
ADDED
@@ -0,0 +1,120 @@
+Metadata-Version: 2.4
+Name: doc-page-extractor
+Version: 1.0.2
+Summary: Document page extraction tool powered by DeepSeek-OCR
+License: MIT
+License-File: LICENSE
+Author: Tao Zeyu
+Author-email: i@taozeyu.com
+Maintainer: Tao Zeyu
+Maintainer-email: i@taozeyu.com
+Requires-Python: >=3.10,<3.14
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: addict (>=2.4.0)
+Requires-Dist: easydict (>=1.13)
+Requires-Dist: einops (>=0.8.0)
+Requires-Dist: transformers (>=4.46.0,<4.48.0)
+Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
+Description-Content-Type: text/markdown
+
+# doc-page-extractor
+
+Document page extraction tool powered by DeepSeek-OCR.
+
+## Installation
+
+> **⚠️ Important:** This package requires PyTorch with CUDA support (GPU Required). PyTorch is NOT automatically installed - you must install it manually first.
+
+### Step 1: Install PyTorch with CUDA
+
+Choose the command that matches your CUDA version:
+
+```bash
+# For CUDA 12.1 (recommended for most users)
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+
+# For CUDA 11.8
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
+
+# For CUDA 12.6
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
+```
+
+> 💡 **Don't know your CUDA version?** Run `nvidia-smi` to check, or just try CUDA 12.1 (works with most recent drivers).
+
+### Step 2: Install doc-page-extractor
+
+```bash
+pip install doc-page-extractor
+```
+
+### Verify Installation
+
+Check if everything is working:
+
+```bash
+python -c "import doc_page_extractor; import torch; print('✓ Installation successful!'); print('✓ CUDA available:', torch.cuda.is_available())"
+```
+
+Expected output:
+```
+✓ Installation successful!
+✓ CUDA available: True
+```
+
+If CUDA shows `False`, see the troubleshooting section below.
+
+## Usage
+
+```python
+from doc_page_extractor import PageExtractor
+
+# Your code here
+```
+
+## Troubleshooting
+
+### "PyTorch is required but not installed!"
+
+Install PyTorch first:
+```bash
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+```
+
+### "CUDA is not available!"
+
+**Check your GPU driver:**
+```bash
+nvidia-smi
+```
+
+**If the command fails**, you need to install NVIDIA drivers:
+- Download from: https://www.nvidia.com/download/index.aspx
+
+**If it succeeds**, you might have CPU-only PyTorch. Reinstall with CUDA:
+```bash
+pip uninstall torch torchvision
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+```
+
+## Requirements
+
+- Python >= 3.10, < 3.14
+- **NVIDIA GPU with CUDA 11.8 or 12.1 support (Required)**
+- Sufficient GPU memory (recommended: 4GB+ VRAM)
+
+## Development
+
+For contributors and developers, see [Development Guide](docs/DEVELOPMENT.md) for:
+- Running tests
+- Running lint checks
+- Building the package
+
+
doc_page_extractor-1.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+doc_page_extractor/__init__.py,sha256=BCLTWrjj0r8HJGoUzrY4T630WiN-di33NWwYOLF7YXc,191
+doc_page_extractor/check_env.py,sha256=pYk_58eqhSbe3GB0INYli6mATjCvZtHUTo3QX-ZExzw,1460
+doc_page_extractor/extractor.py,sha256=XSP_LlIKjOgKS-25fxPouCJnWmh2KNF6Z79oc3b3QGs,3096
+doc_page_extractor/model.py,sha256=I5RLt5GsoWeux5QHrKM80uVuN5kB6drsQx-Gp8X3wEk,3111
+doc_page_extractor/parser.py,sha256=1PdDKQ6SOftoklVH5DnvJYUhJPHtVr0hclGxgBIj2LE,1652
+doc_page_extractor/plot.py,sha256=3ZD-rw__7pu7EPMnxwHpHkhLolbOPJbdDK4949XsKKA,1647
+doc_page_extractor/redacter.py,sha256=jVfH-XWmuq2IYn4g1tGSnZc6gUXrRhn7roSCPtoYbHQ,3227
+doc_page_extractor-1.0.2.dist-info/METADATA,sha256=MAXKv7u4f4MNCak7-PArUTzUClTDCbyL34nn1iFAlMY,3182
+doc_page_extractor-1.0.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+doc_page_extractor-1.0.2.dist-info/licenses/LICENSE,sha256=1Kv5XShR6SbZVHr1Z_2tBC8oFk_rfO6CBtmmygj3Jlo,1074
+doc_page_extractor-1.0.2.dist-info/RECORD,,
doc_page_extractor-1.0.2.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2025 Tao Zeyu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
doc_page_extractor/clipper.py
DELETED
@@ -1,119 +0,0 @@
-import numpy as np
-
-from math import pi, ceil, sin, cos, sqrt
-from PIL.Image import Image, Transform
-from .types import Layout, ExtractedResult
-from .rectangle import Rectangle
-from .rotation import calculate_rotation_with_rect, normal_vertical_rotation
-
-
-def clip(
-  extracted_result: ExtractedResult,
-  layout: Layout,
-  wrapped_width: float = 0.0,
-  wrapped_height: float = 0.0,
-) -> Image:
-  image: Image
-  if extracted_result.adjusted_image is None:
-    image = extracted_result.extracted_image
-  else:
-    image = extracted_result.adjusted_image
-
-  return clip_from_image(
-    image, layout.rect,
-    wrapped_width, wrapped_height,
-  )
-
-def clip_from_image(
-  image: Image,
-  rect: Rectangle,
-  wrapped_width: float = 0.0,
-  wrapped_height: float = 0.0,
-) -> Image:
-  horizontal_rotation, vertical_rotation = calculate_rotation_with_rect(rect)
-  image = image.copy()
-  matrix_move = np.array(_get_move_matrix(rect.lt[0], rect.lt[1])).reshape(3, 3)
-  matrix_rotate = np.array(_get_rotate_matrix(-horizontal_rotation)).reshape(3, 3)
-  matrix = np.dot(matrix_move, matrix_rotate)
-
-  y_axis_rotation = normal_vertical_rotation(vertical_rotation - horizontal_rotation)
-
-  if abs(y_axis_rotation - 0.25 * pi) > 0.0:
-    x = cos(y_axis_rotation)
-    y = sin(y_axis_rotation)
-    matrix_shear = np.array(_get_shear_matrix(x, y)).reshape(3, 3)
-    matrix = np.dot(matrix, matrix_shear)
-
-  width, height, max_width, max_height = _size_and_wrapper(rect)
-  max_width += wrapped_width
-  max_height += wrapped_height
-
-  if max_width != width or max_height != height:
-    dx = (max_width - width) / 2.0
-    dy = (max_height - height) / 2.0
-    matrix_move = np.array(_get_move_matrix(-dx, -dy)).reshape(3, 3)
-    matrix = np.dot(matrix, matrix_move)
-
-  return image.transform(
-    size=(ceil(max_width), ceil(max_height)),
-    method=Transform.AFFINE,
-    data=_to_pillow_matrix(matrix),
-  )
-
-def _size_and_wrapper(rect: Rectangle):
-  widths: list[float] = []
-  heights: list[float] = []
-
-  for i, (p1, p2) in enumerate(rect.segments):
-    dx = p2[0] - p1[0]
-    dy = p2[1] - p1[1]
-    distance = sqrt(dx*dx + dy*dy)
-    if i % 2 == 0:
-      heights.append(distance)
-    else:
-      widths.append(distance)
-
-  if len(widths) == 0 and len(heights) == 0:
-    return 0.0, 0.0, 0.0, 0.0
-
-  width: float = sum(widths) / len(widths)
-  height: float = sum(heights) / len(heights)
-  max_width: float = width
-  max_height: float = height
-
-  for width in widths:
-    if width > max_width:
-      max_width = width
-
-  for height in heights:
-    if height > max_height:
-      max_height = height
-
-  return width, height, max_width, max_height
-
-def _to_pillow_matrix(matrix: np.array):
-  return (
-    matrix[0][0], matrix[0][1], matrix[0][2],
-    matrix[1][0], matrix[1][1], matrix[1][2],
-  )
-
-def _get_move_matrix(dx: float, dy: float):
-  return (
-    1.0, 0.0, dx,
-    0.0, 1.0, dy,
-    0.0, 0.0, 1.0,
-  )
-
-def _get_rotate_matrix(rotation: float):
-  return (
-    cos(rotation), sin(rotation), 0.0,
-    -sin(rotation), cos(rotation), 0.0,
-    0.0, 0.0, 1.0
-  )
-
-def _get_shear_matrix(x0: float, y0: float):
-  return (
-    1.0, 0.0, 0.0,
-    x0, y0, 0.0,
-    0.0, 0.0, 1.0,
-  )
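The removed clipper composed a 3×3 transform (translate, rotate, optionally shear) and handed its top two rows to Pillow as the six AFFINE coefficients. A standalone sketch of that last step, not the deleted code itself; the example matrix and image are made up:

```python
# Sketch only: mapping a 3x3 matrix onto Pillow's 6-coefficient AFFINE data,
# as the removed _to_pillow_matrix did. Pillow samples input pixel
# (a*x + b*y + c, d*x + e*y + f) for each output pixel (x, y).
import numpy as np
from PIL import Image
from PIL.Image import Transform

def to_pillow_matrix(matrix: np.ndarray) -> tuple[float, ...]:
  return (
    matrix[0][0], matrix[0][1], matrix[0][2],
    matrix[1][0], matrix[1][1], matrix[1][2],
  )

move = np.array([
  [1.0, 0.0, 30.0],  # sample 30px to the right ...
  [0.0, 1.0, 10.0],  # ... and 10px down
  [0.0, 0.0, 1.0],
])
image = Image.new("RGB", (200, 100), "white")
shifted = image.transform(
  size=image.size,
  method=Transform.AFFINE,
  data=to_pillow_matrix(move),
)
```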
doc_page_extractor/downloader.py
DELETED
@@ -1,16 +0,0 @@
-import os
-import requests
-from pathlib import Path
-
-
-def download(url: str, file_path: Path):
-  response = requests.get(url, stream=True, timeout=60)
-  if response.status_code != 200:
-    raise FileNotFoundError(f"Failed to download file from {url}: {response.status_code}")
-  try:
-    with open(file_path, "wb") as file:
-      file.write(response.content)
-  except Exception as e:
-    if os.path.exists(file_path):
-      os.remove(file_path)
-    raise e
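Note that the deleted helper requested with stream=True but still buffered the whole body via response.content. For contrast, a chunked variant would stream to disk; this is an illustration, not code from either release:

```python
# Hypothetical streamed download, shown only to contrast with the removed helper.
import requests
from pathlib import Path

def download(url: str, file_path: Path) -> None:
  try:
    with requests.get(url, stream=True, timeout=60) as response:
      response.raise_for_status()
      with open(file_path, "wb") as file:
        for chunk in response.iter_content(chunk_size=65536):
          file.write(chunk)
  except Exception:
    file_path.unlink(missing_ok=True)  # drop partial files, as the original did
    raise
```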
doc_page_extractor/latex.py
DELETED
@@ -1,31 +0,0 @@
-import os
-import torch
-
-from munch import Munch
-from pix2tex.cli import LatexOCR
-from PIL.Image import Image
-from typing import Literal
-from .utils import expand_image
-from .types import GetModelDir
-
-class LaTeX:
-  def __init__(self, device: Literal["cpu", "cuda"],get_model_dir: GetModelDir):
-    self._model_path: str = get_model_dir()
-    self._model: LatexOCR | None = None
-    self._device: Literal["cpu", "cuda"] = device
-
-  def extract(self, image: Image) -> str | None:
-    image = expand_image(image, 0.1) # add margins to improve recognition accuracy
-    model = self._get_model()
-    with torch.no_grad():
-      return model(image)
-
-  def _get_model(self) -> LatexOCR:
-    if self._model is None:
-      self._model = LatexOCR(Munch({
-        "config": os.path.join("settings", "config.yaml"),
-        "checkpoint": os.path.join(self._model_path, "checkpoints", "weights.pth"),
-        "no_cuda": self._device == "cpu",
-        "no_resize": False,
-      }))
-    return self._model
doc_page_extractor/layout_order.py
DELETED
@@ -1,237 +0,0 @@
-import torch
-
-from typing import Generator
-from dataclasses import dataclass
-from transformers import LayoutLMv3ForTokenClassification
-
-from .types import Layout, LayoutClass, GetModelDir
-from .layoutreader import prepare_inputs, boxes2inputs, parse_logits
-
-
-@dataclass
-class _BBox:
-  layout_index: int
-  fragment_index: int
-  virtual: bool
-  order: int
-  value: tuple[float, float, float, float]
-
-class LayoutOrder:
-  def __init__(self, get_model_dir: GetModelDir):
-    self._model_path: str = get_model_dir()
-    self._model: LayoutLMv3ForTokenClassification | None = None
-    self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-  def _get_model(self) -> LayoutLMv3ForTokenClassification:
-    if self._model is None:
-      self._model = LayoutLMv3ForTokenClassification.from_pretrained(
-        pretrained_model_name_or_path=self._model_path,
-        local_files_only=True,
-      ).to(device=self._device)
-    return self._model
-
-  def sort(self, layouts: list[Layout], size: tuple[int, int]) -> list[Layout]:
-    width, height = size
-    if width == 0 or height == 0:
-      return layouts
-
-    bbox_list = self._order_and_get_bbox_list(
-      layouts=layouts,
-      width=width,
-      height=height,
-    )
-    if bbox_list is None:
-      return layouts
-
-    return self._sort_layouts_and_fragments(layouts, bbox_list)
-
-  def _order_and_get_bbox_list(
-    self,
-    layouts: list[Layout],
-    width: int,
-    height: int,
-  ) -> list[_BBox] | None:
-
-    line_height = self._line_height(layouts)
-    bbox_list: list[_BBox] = []
-
-    for i, layout in enumerate(layouts):
-      if layout.cls == LayoutClass.PLAIN_TEXT and \
-         len(layout.fragments) > 0:
-        for j, fragment in enumerate(layout.fragments):
-          bbox_list.append(_BBox(
-            layout_index=i,
-            fragment_index=j,
-            virtual=False,
-            order=0,
-            value=fragment.rect.wrapper,
-          ))
-      else:
-        bbox_list.extend(
-          self._generate_virtual_lines(
            layout=layout,
-            layout_index=i,
-            line_height=line_height,
-            width=width,
-            height=height,
-          ),
-        )
-
-    if len(bbox_list) > 200:
-      # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L522
-      return None
-
-    layoutreader_size = 1000.0
-    x_scale = layoutreader_size / float(width)
-    y_scale = layoutreader_size / float(height)
-
-    for bbox in bbox_list:
-      x0, y0, x1, y1 = self._squeeze(bbox.value, width, height)
-      x0 = round(x0 * x_scale)
-      y0 = round(y0 * y_scale)
-      x1 = round(x1 * x_scale)
-      y1 = round(y1 * y_scale)
-      bbox.value = (x0, y0, x1, y1)
-
-    bbox_list.sort(key=lambda b: b.value) # must sort; layoutreader cannot recover the correct order from unsorted input
-    model = self._get_model()
-
-    with torch.no_grad():
-      inputs = boxes2inputs([list(bbox.value) for bbox in bbox_list])
-      inputs = prepare_inputs(inputs, model)
-      logits = model(**inputs).logits.cpu().squeeze(0)
-      orders = parse_logits(logits, len(bbox_list))
-
-    sorted_bbox_list = [bbox_list[i] for i in orders]
-    for i, bbox in enumerate(sorted_bbox_list):
-      bbox.order = i
-
-    return sorted_bbox_list
-
-  def _sort_layouts_and_fragments(self, layouts: list[Layout], bbox_list: list[_BBox]):
-    layout_bbox_list: list[list[_BBox]] = [[] for _ in range(len(layouts))]
-    for bbox in bbox_list:
-      layout_bbox_list[bbox.layout_index].append(bbox)
-
-    layouts_with_median_order: list[tuple[Layout, float]] = []
-    for layout_index, bbox_list in enumerate(layout_bbox_list):
-      layout = layouts[layout_index]
-      orders = [b.order for b in bbox_list] # virtual bboxes guarantee that orders is never empty
-      median_order = self._median(orders)
-      layouts_with_median_order.append((layout, median_order))
-
-    for layout, bbox_list in zip(layouts, layout_bbox_list):
-      for bbox in bbox_list:
-        if not bbox.virtual:
-          layout.fragments[bbox.fragment_index].order = bbox.order
-      if all(not bbox.virtual for bbox in bbox_list):
-        layout.fragments.sort(key=lambda f: f.order)
-
-    layouts_with_median_order.sort(key=lambda x: x[1])
-    layouts = [layout for layout, _ in layouts_with_median_order]
-    next_fragment_order: int = 0
-
-    for layout in layouts:
-      for fragment in layout.fragments:
-        fragment.order = next_fragment_order
-        next_fragment_order += 1
-
-    return layouts
-
-  def _line_height(self, layouts: list[Layout]) -> float:
-    line_height: float = 0.0
-    count: int = 0
-    for layout in layouts:
-      for fragment in layout.fragments:
-        _, height = fragment.rect.size
-        line_height += height
-        count += 1
-    if count == 0:
-      return 10.0
-    return line_height / float(count)
-
-  def _generate_virtual_lines(
-    self,
-    layout: Layout,
-    layout_index: int,
-    line_height: float,
-    width: int,
-    height: int,
-  ) -> Generator[_BBox, None, None]:
-
-    # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L451-L490
-    x0, y0, x1, y1 = layout.rect.wrapper
-    layout_height = y1 - y0
-    layout_weight = x1 - x0
-    lines = int(layout_height / line_height)
-
-    if layout_height <= line_height * 2:
-      yield _BBox(
-        layout_index=layout_index,
-        fragment_index=0,
-        virtual=True,
-        order=0,
-        value=(x0, y0, x1, y1),
-      )
-      return
-
-    elif layout_height <= height * 0.25 or \
-         width * 0.5 <= layout_weight or \
-         width * 0.25 < layout_weight:
-      if layout_weight > width * 0.4:
-        lines = 3
-      elif layout_weight <= width * 0.25:
-        if layout_height / layout_weight > 1.2: # long and narrow: do not split
-          yield _BBox(
-            layout_index=layout_index,
-            fragment_index=0,
-            virtual=True,
-            order=0,
-            value=(x0, y0, x1, y1),
-          )
-          return
-        else: # not that narrow: still split into two lines
-          lines = 2
-
-    lines = max(1, lines)
-    line_height = (y1 - y0) / lines
-    current_y = y0
-
-    for i in range(lines):
-      yield _BBox(
-        layout_index=layout_index,
-        fragment_index=i,
-        virtual=True,
-        order=0,
-        value=(x0, current_y, x1, current_y + line_height),
-      )
-      current_y += line_height
-
-  def _median(self, numbers: list[int]) -> float:
-    sorted_numbers = sorted(numbers)
-    n = len(sorted_numbers)
-
-    # odd or even number of elements?
-    if n % 2 == 1:
-      # odd count: take the middle number directly
-      return float(sorted_numbers[n // 2])
-    else:
-      # even count: average the two middle numbers
-      mid1 = sorted_numbers[n // 2 - 1]
-      mid2 = sorted_numbers[n // 2]
-      return float((mid1 + mid2) / 2)
-
-  def _squeeze(self, bbox: _BBox, width: int, height: int) -> _BBox:
-    x0, y0, x1, y1 = bbox
-    x0 = self._squeeze_value(x0, width)
-    x1 = self._squeeze_value(x1, width)
-    y0 = self._squeeze_value(y0, height)
-    y1 = self._squeeze_value(y1, height)
-    return x0, y0, x1, y1
-
-  def _squeeze_value(self, position: float, size: int) -> float:
-    if position < 0:
-      position = 0.0
-    if position > size:
-      position = float(size)
-    return position
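The core of the removed ordering step: LayoutLMv3 assigns a reading order to every real or virtual box, each fragment inherits its box's order, and whole layouts are then sorted by the median order of their boxes. The ranking idea in isolation, with made-up fragment orders standing in for the model output:

```python
# Sketch of the median-order ranking from the removed layout_order.py;
# the dicts and fragment orders below are invented stand-ins for LayoutLMv3 output.
from statistics import median

def sort_by_median_order(layouts: list[dict]) -> list[dict]:
  return sorted(layouts, key=lambda layout: median(layout["fragment_orders"]))

layouts = [
  {"name": "footnote", "fragment_orders": [7, 8]},
  {"name": "body",     "fragment_orders": [0, 1, 2, 3]},
  {"name": "caption",  "fragment_orders": [4]},
]
print([layout["name"] for layout in sort_by_median_order(layouts)])
# -> ['body', 'caption', 'footnote']
```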