doc-page-extractor 0.2.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. doc_page_extractor/__init__.py +5 -15
  2. doc_page_extractor/check_env.py +40 -0
  3. doc_page_extractor/extractor.py +88 -215
  4. doc_page_extractor/model.py +97 -0
  5. doc_page_extractor/parser.py +51 -0
  6. doc_page_extractor/plot.py +52 -79
  7. doc_page_extractor/redacter.py +111 -0
  8. doc_page_extractor-1.0.2.dist-info/METADATA +120 -0
  9. doc_page_extractor-1.0.2.dist-info/RECORD +11 -0
  10. {doc_page_extractor-0.2.0.dist-info → doc_page_extractor-1.0.2.dist-info}/WHEEL +1 -2
  11. doc_page_extractor-1.0.2.dist-info/licenses/LICENSE +21 -0
  12. doc_page_extractor/clipper.py +0 -119
  13. doc_page_extractor/downloader.py +0 -16
  14. doc_page_extractor/latex.py +0 -31
  15. doc_page_extractor/layout_order.py +0 -237
  16. doc_page_extractor/layoutreader.py +0 -126
  17. doc_page_extractor/models.py +0 -92
  18. doc_page_extractor/ocr.py +0 -200
  19. doc_page_extractor/ocr_corrector.py +0 -126
  20. doc_page_extractor/onnxocr/__init__.py +0 -1
  21. doc_page_extractor/onnxocr/cls_postprocess.py +0 -26
  22. doc_page_extractor/onnxocr/db_postprocess.py +0 -246
  23. doc_page_extractor/onnxocr/imaug.py +0 -32
  24. doc_page_extractor/onnxocr/operators.py +0 -187
  25. doc_page_extractor/onnxocr/predict_base.py +0 -57
  26. doc_page_extractor/onnxocr/predict_cls.py +0 -109
  27. doc_page_extractor/onnxocr/predict_det.py +0 -139
  28. doc_page_extractor/onnxocr/predict_rec.py +0 -344
  29. doc_page_extractor/onnxocr/predict_system.py +0 -97
  30. doc_page_extractor/onnxocr/rec_postprocess.py +0 -896
  31. doc_page_extractor/onnxocr/utils.py +0 -71
  32. doc_page_extractor/overlap.py +0 -167
  33. doc_page_extractor/raw_optimizer.py +0 -104
  34. doc_page_extractor/rectangle.py +0 -72
  35. doc_page_extractor/rotation.py +0 -158
  36. doc_page_extractor/struct_eqtable/__init__.py +0 -49
  37. doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
  38. doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
  39. doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
  40. doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
  41. doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
  42. doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
  43. doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
  44. doc_page_extractor/table.py +0 -70
  45. doc_page_extractor/types.py +0 -91
  46. doc_page_extractor/utils.py +0 -32
  47. doc_page_extractor-0.2.0.dist-info/METADATA +0 -85
  48. doc_page_extractor-0.2.0.dist-info/RECORD +0 -45
  49. doc_page_extractor-0.2.0.dist-info/licenses/LICENSE +0 -661
  50. doc_page_extractor-0.2.0.dist-info/top_level.txt +0 -2
  51. tests/__init__.py +0 -0
  52. tests/test_history_bus.py +0 -55
@@ -0,0 +1,111 @@
1
+ from typing import Any, Generator, Iterable, cast
2
+
3
+ from PIL import Image, ImageDraw
4
+
5
+
6
def redact(
    image: Image.Image,
    fill_color: tuple[int, int, int],
    rectangles: Iterable[tuple[int, int, int, int]],
) -> Image.Image:
    """Paint every rectangle onto ``image`` with ``fill_color`` and return it.

    Drawing happens in place: the returned object is the very ``image`` that
    was passed in, not a copy. Each rectangle is an ``(x1, y1, x2, y2)`` tuple.
    """
    canvas = ImageDraw.Draw(image)
    for left, top, right, bottom in rectangles:
        box = (left, top, right, bottom)
        canvas.rectangle(box, fill=fill_color)
    return image
15
+
16
+
17
+ class _AveragingColor:
18
+ def __init__(self) -> None:
19
+ self._r: float = 0.0
20
+ self._g: float = 0.0
21
+ self._b: float = 0.0
22
+ self._a: float = 0.0
23
+ self._count: int = 0
24
+
25
+ @property
26
+ def count(self) -> int:
27
+ return self._count
28
+
29
+ @property
30
+ def average(self) -> tuple[float, float, float, float]:
31
+ if self._count == 0:
32
+ return 1.0, 1.0, 1.0, 1.0
33
+ return (
34
+ self._r / self._count,
35
+ self._g / self._count,
36
+ self._b / self._count,
37
+ self._a / self._count,
38
+ )
39
+
40
+ def add_color(self, r: float, g: float, b: float, a: float) -> None:
41
+ self._r += r
42
+ self._g += g
43
+ self._b += b
44
+ self._a += a
45
+ self._count += 1
46
+
47
+
48
def background_color(image: Image.Image) -> tuple[int, int, int]:
    """Estimate the paper (background) color of ``image``.

    Pixels are grouped into 256 buckets by gray level. Walking the buckets
    from darkest to brightest, the first bucket at which more than half of
    the pixels have been seen holds the median pixel; the mean color of that
    bucket, composited over white, is returned as 8-bit RGB.

    An image without pixels yields pure white.
    """
    total_pixels = image.width * image.height
    if total_pixels == 0:
        return 255, 255, 255

    buckets: list[_AveragingColor | None] = [None] * 256
    for r, g, b, a in _iter_pixels(image):
        level = round(255 * _gray(r, g, b, a))
        slot = buckets[level]
        if slot is None:
            slot = buckets[level] = _AveragingColor()
        slot.add_color(r, g, b, a)

    half = total_pixels // 2
    seen: int = 0
    median_bucket: _AveragingColor | None = None

    for slot in buckets:
        if not slot:
            continue
        seen += slot.count
        if seen > half:
            median_bucket = slot
            break

    assert median_bucket is not None
    r, g, b, a = median_bucket.average

    # Composite the averaged color over a white background.
    white_share = 1.0 * (1.0 - a)
    r = r * a + white_share
    g = g * a + white_share
    b = b * a + white_share

    return round(r * 255), round(g * 255), round(b * 255)
83
+
84
+
85
+ def _gray(r: float, g: float, b: float, a: float) -> float:
86
+ # ITU-R BT.601 https://en.wikipedia.org/wiki/Rec._601
87
+ gray = 0.299 * r + 0.587 * g + 0.114 * b
88
+ return gray * a
89
+
90
+
91
+ def _iter_pixels(
92
+ image: Image.Image,
93
+ ) -> Generator[tuple[float, float, float, float], None, None]:
94
+ for pixel in cast(Any, image.getdata()):
95
+ pixel_len = len(cast(tuple, pixel)) if isinstance(pixel, tuple) else 1
96
+ if pixel_len == 4:
97
+ # RGBA 格式
98
+ r, g, b, a = cast(tuple[int, int, int, int], pixel)
99
+ elif pixel_len == 3:
100
+ # RGB 格式
101
+ r, g, b = cast(tuple[int, int, int], pixel)
102
+ a = 255
103
+ elif pixel_len == 2:
104
+ # LA 格式 (灰度 + alpha)
105
+ l, a = cast(tuple[int, int], pixel)
106
+ r = g = b = l
107
+ else:
108
+ # L 格式 (灰度)
109
+ r = g = b = cast(int, pixel)
110
+ a = 255
111
+ yield (r / 255.0, g / 255.0, b / 255.0, a / 255.0)
@@ -0,0 +1,120 @@
1
+ Metadata-Version: 2.4
2
+ Name: doc-page-extractor
3
+ Version: 1.0.2
4
+ Summary: Document page extraction tool powered by DeepSeek-OCR
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Author: Tao Zeyu
8
+ Author-email: i@taozeyu.com
9
+ Maintainer: Tao Zeyu
10
+ Maintainer-email: i@taozeyu.com
11
+ Requires-Python: >=3.10,<3.14
12
+ Classifier: Development Status :: 2 - Pre-Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Dist: addict (>=2.4.0)
21
+ Requires-Dist: easydict (>=1.13)
22
+ Requires-Dist: einops (>=0.8.0)
23
+ Requires-Dist: transformers (>=4.46.0,<4.48.0)
24
+ Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
25
+ Description-Content-Type: text/markdown
26
+
27
+ # doc-page-extractor
28
+
29
+ Document page extraction tool powered by DeepSeek-OCR.
30
+
31
+ ## Installation
32
+
33
+ > **⚠️ Important:** This package requires PyTorch with CUDA support (GPU Required). PyTorch is NOT automatically installed - you must install it manually first.
34
+
35
+ ### Step 1: Install PyTorch with CUDA
36
+
37
+ Choose the command that matches your CUDA version:
38
+
39
+ ```bash
40
+ # For CUDA 12.1 (recommended for most users)
41
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
42
+
43
+ # For CUDA 11.8
44
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
45
+
46
+ # For CUDA 12.6
47
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
48
+ ```
49
+
50
+ > 💡 **Don't know your CUDA version?** Run `nvidia-smi` to check, or just try CUDA 12.1 (works with most recent drivers).
51
+
52
+ ### Step 2: Install doc-page-extractor
53
+
54
+ ```bash
55
+ pip install doc-page-extractor
56
+ ```
57
+
58
+ ### Verify Installation
59
+
60
+ Check if everything is working:
61
+
62
+ ```bash
63
+ python -c "import doc_page_extractor; import torch; print('✓ Installation successful!'); print('✓ CUDA available:', torch.cuda.is_available())"
64
+ ```
65
+
66
+ Expected output:
67
+ ```
68
+ ✓ Installation successful!
69
+ ✓ CUDA available: True
70
+ ```
71
+
72
+ If CUDA shows `False`, see the troubleshooting section below.
73
+
74
+ ## Usage
75
+
76
+ ```python
77
+ from doc_page_extractor import PageExtractor
78
+
79
+ # Your code here
80
+ ```
81
+
82
+ ## Troubleshooting
83
+
84
+ ### "PyTorch is required but not installed!"
85
+
86
+ Install PyTorch first:
87
+ ```bash
88
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
89
+ ```
90
+
91
+ ### "CUDA is not available!"
92
+
93
+ **Check your GPU driver:**
94
+ ```bash
95
+ nvidia-smi
96
+ ```
97
+
98
+ **If the command fails**, you need to install NVIDIA drivers:
99
+ - Download from: https://www.nvidia.com/download/index.aspx
100
+
101
+ **If it succeeds**, you might have CPU-only PyTorch. Reinstall with CUDA:
102
+ ```bash
103
+ pip uninstall torch torchvision
104
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
105
+ ```
106
+
107
+ ## Requirements
108
+
109
+ - Python >= 3.10, < 3.14
110
+ - **NVIDIA GPU with CUDA 11.8, 12.1, or 12.6 support (Required)**
111
+ - Sufficient GPU memory (recommended: 4GB+ VRAM)
112
+
113
+ ## Development
114
+
115
+ For contributors and developers, see [Development Guide](docs/DEVELOPMENT.md) for:
116
+ - Running tests
117
+ - Running lint checks
118
+ - Building the package
119
+
120
+
@@ -0,0 +1,11 @@
1
+ doc_page_extractor/__init__.py,sha256=BCLTWrjj0r8HJGoUzrY4T630WiN-di33NWwYOLF7YXc,191
2
+ doc_page_extractor/check_env.py,sha256=pYk_58eqhSbe3GB0INYli6mATjCvZtHUTo3QX-ZExzw,1460
3
+ doc_page_extractor/extractor.py,sha256=XSP_LlIKjOgKS-25fxPouCJnWmh2KNF6Z79oc3b3QGs,3096
4
+ doc_page_extractor/model.py,sha256=I5RLt5GsoWeux5QHrKM80uVuN5kB6drsQx-Gp8X3wEk,3111
5
+ doc_page_extractor/parser.py,sha256=1PdDKQ6SOftoklVH5DnvJYUhJPHtVr0hclGxgBIj2LE,1652
6
+ doc_page_extractor/plot.py,sha256=3ZD-rw__7pu7EPMnxwHpHkhLolbOPJbdDK4949XsKKA,1647
7
+ doc_page_extractor/redacter.py,sha256=jVfH-XWmuq2IYn4g1tGSnZc6gUXrRhn7roSCPtoYbHQ,3227
8
+ doc_page_extractor-1.0.2.dist-info/METADATA,sha256=MAXKv7u4f4MNCak7-PArUTzUClTDCbyL34nn1iFAlMY,3182
9
+ doc_page_extractor-1.0.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
10
+ doc_page_extractor-1.0.2.dist-info/licenses/LICENSE,sha256=1Kv5XShR6SbZVHr1Z_2tBC8oFk_rfO6CBtmmygj3Jlo,1074
11
+ doc_page_extractor-1.0.2.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: poetry-core 2.2.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Tao Zeyu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -1,119 +0,0 @@
1
- import numpy as np
2
-
3
- from math import pi, ceil, sin, cos, sqrt
4
- from PIL.Image import Image, Transform
5
- from .types import Layout, ExtractedResult
6
- from .rectangle import Rectangle
7
- from .rotation import calculate_rotation_with_rect, normal_vertical_rotation
8
-
9
-
10
def clip(
    extracted_result: ExtractedResult,
    layout: Layout,
    wrapped_width: float = 0.0,
    wrapped_height: float = 0.0,
) -> Image:
    """Cut the region of ``layout`` out of an extraction result's image.

    The adjusted image is used when the result has one, otherwise the
    extracted image. The work itself is delegated to ``clip_from_image``
    with ``layout.rect`` and the two wrapping sizes.
    """
    source = extracted_result.adjusted_image
    if source is None:
        source = extracted_result.extracted_image
    return clip_from_image(source, layout.rect, wrapped_width, wrapped_height)
26
-
27
def clip_from_image(
    image: Image,
    rect: Rectangle,
    wrapped_width: float = 0.0,
    wrapped_height: float = 0.0,
) -> Image:
    """Return the part of ``image`` covered by ``rect`` as a new image.

    An affine matrix is assembled from a translation to ``rect.lt``, a
    rotation by the negated horizontal rotation of the rect, an optional
    shear, and an optional shift that centers the content when the output
    is larger than the rect. ``wrapped_width`` / ``wrapped_height`` are added
    to the output size.
    """
    horizontal, vertical = calculate_rotation_with_rect(rect)
    source = image.copy()

    left, top = rect.lt[0], rect.lt[1]
    translate = np.array(_get_move_matrix(left, top)).reshape(3, 3)
    rotate = np.array(_get_rotate_matrix(-horizontal)).reshape(3, 3)
    transform = np.dot(translate, rotate)

    y_axis_rotation = normal_vertical_rotation(vertical - horizontal)

    # NOTE(review): this test is true for virtually every input, so the shear
    # is applied almost always — confirm the intended reference angle.
    if abs(y_axis_rotation - 0.25 * pi) > 0.0:
        shear = np.array(
            _get_shear_matrix(cos(y_axis_rotation), sin(y_axis_rotation)),
        ).reshape(3, 3)
        transform = np.dot(transform, shear)

    width, height, out_width, out_height = _size_and_wrapper(rect)
    out_width += wrapped_width
    out_height += wrapped_height

    if not (out_width == width and out_height == height):
        # Shift by half of the extra space on each axis.
        pad_x = (out_width - width) / 2.0
        pad_y = (out_height - height) / 2.0
        recenter = np.array(_get_move_matrix(-pad_x, -pad_y)).reshape(3, 3)
        transform = np.dot(transform, recenter)

    return source.transform(
        size=(ceil(out_width), ceil(out_height)),
        method=Transform.AFFINE,
        data=_to_pillow_matrix(transform),
    )
62
-
63
- def _size_and_wrapper(rect: Rectangle):
64
- widths: list[float] = []
65
- heights: list[float] = []
66
-
67
- for i, (p1, p2) in enumerate(rect.segments):
68
- dx = p2[0] - p1[0]
69
- dy = p2[1] - p1[1]
70
- distance = sqrt(dx*dx + dy*dy)
71
- if i % 2 == 0:
72
- heights.append(distance)
73
- else:
74
- widths.append(distance)
75
-
76
- if len(widths) == 0 and len(heights) == 0:
77
- return 0.0, 0.0, 0.0, 0.0
78
-
79
- width: float = sum(widths) / len(widths)
80
- height: float = sum(heights) / len(heights)
81
- max_width: float = width
82
- max_height: float = height
83
-
84
- for width in widths:
85
- if width > max_width:
86
- max_width = width
87
-
88
- for height in heights:
89
- if height > max_height:
90
- max_height = height
91
-
92
- return width, height, max_width, max_height
93
-
94
- def _to_pillow_matrix(matrix: np.array):
95
- return (
96
- matrix[0][0], matrix[0][1], matrix[0][2],
97
- matrix[1][0], matrix[1][1], matrix[1][2],
98
- )
99
-
100
- def _get_move_matrix(dx: float, dy: float):
101
- return (
102
- 1.0, 0.0, dx,
103
- 0.0, 1.0, dy,
104
- 0.0, 0.0, 1.0,
105
- )
106
-
107
- def _get_rotate_matrix(rotation: float):
108
- return (
109
- cos(rotation), sin(rotation), 0.0,
110
- -sin(rotation), cos(rotation), 0.0,
111
- 0.0, 0.0, 1.0
112
- )
113
-
114
- def _get_shear_matrix(x0: float, y0: float):
115
- return (
116
- 1.0, 0.0, 0.0,
117
- x0, y0, 0.0,
118
- 0.0, 0.0, 1.0,
119
- )
@@ -1,16 +0,0 @@
1
- import os
2
- import requests
3
- from pathlib import Path
4
-
5
-
6
def download(url: str, file_path: Path):
    """Download ``url`` into ``file_path``.

    The body is streamed to disk in chunks, and the HTTP response is always
    closed, so the connection is released even when an error occurs.

    Raises:
        FileNotFoundError: the server answered with a status other than 200.
        Exception: any error raised while receiving or writing the body is
            re-raised after the partially written file has been removed.
    """
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            raise FileNotFoundError(f"Failed to download file from {url}: {response.status_code}")
        try:
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    file.write(chunk)
        except Exception:
            # Do not leave a truncated file behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            raise
@@ -1,31 +0,0 @@
1
- import os
2
- import torch
3
-
4
- from munch import Munch
5
- from pix2tex.cli import LatexOCR
6
- from PIL.Image import Image
7
- from typing import Literal
8
- from .utils import expand_image
9
- from .types import GetModelDir
10
-
11
class LaTeX:
    """Runs pix2tex's ``LatexOCR`` on images, creating the model on first use."""

    def __init__(self, device: Literal["cpu", "cuda"],get_model_dir: GetModelDir):
        self._model_path: str = get_model_dir()
        self._device: Literal["cpu", "cuda"] = device
        # Created lazily by _get_model().
        self._model: LatexOCR | None = None

    def extract(self, image: Image) -> str | None:
        """Return whatever the OCR model produces for ``image``."""
        # Add a margin around the image to improve recognition accuracy.
        padded = expand_image(image, 0.1)
        recognizer = self._get_model()
        with torch.no_grad():
            return recognizer(padded)

    def _get_model(self) -> LatexOCR:
        """Return the cached model, building it from the checkpoint the first time."""
        if self._model is not None:
            return self._model

        options = Munch({
            "config": os.path.join("settings", "config.yaml"),
            "checkpoint": os.path.join(self._model_path, "checkpoints", "weights.pth"),
            "no_cuda": self._device == "cpu",
            "no_resize": False,
        })
        self._model = LatexOCR(options)
        return self._model
@@ -1,237 +0,0 @@
1
- import torch
2
-
3
- from typing import Generator
4
- from dataclasses import dataclass
5
- from transformers import LayoutLMv3ForTokenClassification
6
-
7
- from .types import Layout, LayoutClass, GetModelDir
8
- from .layoutreader import prepare_inputs, boxes2inputs, parse_logits
9
-
10
-
11
- @dataclass
12
- class _BBox:
13
- layout_index: int
14
- fragment_index: int
15
- virtual: bool
16
- order: int
17
- value: tuple[float, float, float, float]
18
-
19
class LayoutOrder:
    """Sorts page layouts and their fragments into reading order.

    The order is predicted by a ``LayoutLMv3ForTokenClassification`` model
    loaded from a local directory; the model is created on first use.
    """

    def __init__(self, get_model_dir: GetModelDir):
        self._model_path: str = get_model_dir()
        self._model: LayoutLMv3ForTokenClassification | None = None
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _get_model(self) -> LayoutLMv3ForTokenClassification:
        """Return the cached model, loading it from local files the first time."""
        if self._model is None:
            self._model = LayoutLMv3ForTokenClassification.from_pretrained(
                pretrained_model_name_or_path=self._model_path,
                local_files_only=True,
            ).to(device=self._device)
        return self._model

    def sort(self, layouts: list[Layout], size: tuple[int, int]) -> list[Layout]:
        """Return ``layouts`` in reading order for a page of ``size`` (width, height).

        Fragment ``order`` values are rewritten in place. The input list is
        returned unchanged when it is empty, when the page has a zero
        dimension, or when there are too many boxes for the model.
        """
        width, height = size
        if not layouts or width == 0 or height == 0:
            # Nothing to order; also avoids loading the model for no reason.
            return layouts

        bbox_list = self._order_and_get_bbox_list(
            layouts=layouts,
            width=width,
            height=height,
        )
        if bbox_list is None:
            return layouts

        return self._sort_layouts_and_fragments(layouts, bbox_list)

    def _order_and_get_bbox_list(
        self,
        layouts: list[Layout],
        width: int,
        height: int,
    ) -> list[_BBox] | None:
        """Build one box per fragment or virtual line and order them with the model.

        Returns the boxes in predicted reading order with ``order`` filled in,
        or ``None`` when more than 200 boxes were produced.
        """
        line_height = self._line_height(layouts)
        bbox_list: list[_BBox] = []

        for i, layout in enumerate(layouts):
            if layout.cls == LayoutClass.PLAIN_TEXT and \
               len(layout.fragments) > 0:
                for j, fragment in enumerate(layout.fragments):
                    bbox_list.append(_BBox(
                        layout_index=i,
                        fragment_index=j,
                        virtual=False,
                        order=0,
                        value=fragment.rect.wrapper,
                    ))
            else:
                bbox_list.extend(
                    self._generate_virtual_lines(
                        layout=layout,
                        layout_index=i,
                        line_height=line_height,
                        width=width,
                        height=height,
                    ),
                )

        if len(bbox_list) > 200:
            # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L522
            return None

        # Boxes are clamped to the page and rescaled to a 1000 x 1000 grid.
        layoutreader_size = 1000.0
        x_scale = layoutreader_size / float(width)
        y_scale = layoutreader_size / float(height)

        for bbox in bbox_list:
            x0, y0, x1, y1 = self._squeeze(bbox.value, width, height)
            x0 = round(x0 * x_scale)
            y0 = round(y0 * y_scale)
            x1 = round(x1 * x_scale)
            y1 = round(y1 * y_scale)
            bbox.value = (x0, y0, x1, y1)

        # Sorting is mandatory: layoutreader cannot recover the correct order
        # when the boxes are passed in arbitrary order.
        bbox_list.sort(key=lambda b: b.value)
        model = self._get_model()

        with torch.no_grad():
            inputs = boxes2inputs([list(bbox.value) for bbox in bbox_list])
            inputs = prepare_inputs(inputs, model)
            logits = model(**inputs).logits.cpu().squeeze(0)
            orders = parse_logits(logits, len(bbox_list))

        sorted_bbox_list = [bbox_list[i] for i in orders]
        for i, bbox in enumerate(sorted_bbox_list):
            bbox.order = i

        return sorted_bbox_list

    def _sort_layouts_and_fragments(self, layouts: list[Layout], bbox_list: list[_BBox]):
        """Reorder layouts by the median order of their boxes and renumber fragments."""
        boxes_by_layout: list[list[_BBox]] = [[] for _ in range(len(layouts))]
        for bbox in bbox_list:
            boxes_by_layout[bbox.layout_index].append(bbox)

        layouts_with_median_order: list[tuple[Layout, float]] = []
        for layout, layout_boxes in zip(layouts, boxes_by_layout):
            # Virtual boxes guarantee that every layout has at least one order.
            orders = [b.order for b in layout_boxes]
            median_order = self._median(orders)
            layouts_with_median_order.append((layout, median_order))

        for layout, layout_boxes in zip(layouts, boxes_by_layout):
            for bbox in layout_boxes:
                if not bbox.virtual:
                    layout.fragments[bbox.fragment_index].order = bbox.order
            if all(not bbox.virtual for bbox in layout_boxes):
                layout.fragments.sort(key=lambda f: f.order)

        layouts_with_median_order.sort(key=lambda x: x[1])
        layouts = [layout for layout, _ in layouts_with_median_order]
        next_fragment_order: int = 0

        # Renumber fragments consecutively across the sorted layouts.
        for layout in layouts:
            for fragment in layout.fragments:
                fragment.order = next_fragment_order
                next_fragment_order += 1

        return layouts

    def _line_height(self, layouts: list[Layout]) -> float:
        """Return the mean fragment height over all layouts, or 10.0 without fragments."""
        line_height: float = 0.0
        count: int = 0
        for layout in layouts:
            for fragment in layout.fragments:
                _, height = fragment.rect.size
                line_height += height
                count += 1
        if count == 0:
            return 10.0
        return line_height / float(count)

    def _generate_virtual_lines(
        self,
        layout: Layout,
        layout_index: int,
        line_height: float,
        width: int,
        height: int,
    ) -> Generator[_BBox, None, None]:
        """Yield virtual boxes covering ``layout``: the whole box or equal horizontal slices."""
        # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L451-L490
        x0, y0, x1, y1 = layout.rect.wrapper
        lines = self._virtual_line_count(
            x1 - x0, y1 - y0, line_height, width, height,
        )
        if lines is None:
            yield _BBox(
                layout_index=layout_index,
                fragment_index=0,
                virtual=True,
                order=0,
                value=(x0, y0, x1, y1),
            )
            return

        line_height = (y1 - y0) / lines
        current_y = y0

        for i in range(lines):
            yield _BBox(
                layout_index=layout_index,
                fragment_index=i,
                virtual=True,
                order=0,
                value=(x0, current_y, x1, current_y + line_height),
            )
            current_y += line_height

    def _virtual_line_count(
        self,
        layout_width: float,
        layout_height: float,
        line_height: float,
        width: int,
        height: int,
    ) -> int | None:
        """Decide into how many lines a layout box is cut; ``None`` keeps it whole."""
        if layout_height <= line_height * 2:
            return None

        # The average line height is 0 when every fragment has zero height;
        # fall back to a single line instead of dividing by zero.
        lines = int(layout_height / line_height) if line_height > 0 else 1

        if layout_height <= height * 0.25 or \
           width * 0.5 <= layout_width or \
           width * 0.25 < layout_width:
            if layout_width > width * 0.4:
                lines = 3
            elif layout_width <= width * 0.25:
                # Slender boxes are not split; a zero-width box is treated as
                # slender so the ratio below cannot divide by zero.
                if layout_width == 0 or layout_height / layout_width > 1.2:
                    return None
                # Boxes that are not slender are still split into two lines.
                lines = 2

        return max(1, lines)

    def _median(self, numbers: list[int]) -> float:
        """Return the median of ``numbers`` as a float."""
        sorted_numbers = sorted(numbers)
        n = len(sorted_numbers)

        if n % 2 == 1:
            # Odd count: the middle element.
            return float(sorted_numbers[n // 2])
        else:
            # Even count: the mean of the two middle elements.
            mid1 = sorted_numbers[n // 2 - 1]
            mid2 = sorted_numbers[n // 2]
            return float((mid1 + mid2) / 2)

    def _squeeze(
        self,
        bbox: tuple[float, float, float, float],
        width: int,
        height: int,
    ) -> tuple[float, float, float, float]:
        """Clamp an ``(x0, y0, x1, y1)`` box to the page bounds."""
        x0, y0, x1, y1 = bbox
        x0 = self._squeeze_value(x0, width)
        x1 = self._squeeze_value(x1, width)
        y0 = self._squeeze_value(y0, height)
        y1 = self._squeeze_value(y1, height)
        return x0, y0, x1, y1

    def _squeeze_value(self, position: float, size: int) -> float:
        """Clamp ``position`` into ``[0, size]``."""
        if position < 0:
            position = 0.0
        if position > size:
            position = float(size)
        return position