doc-page-extractor 0.2.4: doc_page_extractor-0.2.4-cp310-cp310-macosx_15_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic.

Files changed (34)
  1. doc_page_extractor/__init__.py +16 -0
  2. doc_page_extractor/clipper.py +119 -0
  3. doc_page_extractor/downloader.py +16 -0
  4. doc_page_extractor/extractor.py +218 -0
  5. doc_page_extractor/latex.py +33 -0
  6. doc_page_extractor/layout_order.py +239 -0
  7. doc_page_extractor/layoutreader.py +126 -0
  8. doc_page_extractor/model.py +133 -0
  9. doc_page_extractor/ocr.py +196 -0
  10. doc_page_extractor/ocr_corrector.py +126 -0
  11. doc_page_extractor/onnxocr/__init__.py +1 -0
  12. doc_page_extractor/onnxocr/cls_postprocess.py +26 -0
  13. doc_page_extractor/onnxocr/db_postprocess.py +246 -0
  14. doc_page_extractor/onnxocr/imaug.py +32 -0
  15. doc_page_extractor/onnxocr/operators.py +187 -0
  16. doc_page_extractor/onnxocr/predict_base.py +57 -0
  17. doc_page_extractor/onnxocr/predict_cls.py +109 -0
  18. doc_page_extractor/onnxocr/predict_det.py +139 -0
  19. doc_page_extractor/onnxocr/predict_rec.py +344 -0
  20. doc_page_extractor/onnxocr/predict_system.py +97 -0
  21. doc_page_extractor/onnxocr/rec_postprocess.py +896 -0
  22. doc_page_extractor/onnxocr/utils.py +71 -0
  23. doc_page_extractor/overlap.py +167 -0
  24. doc_page_extractor/plot.py +93 -0
  25. doc_page_extractor/raw_optimizer.py +104 -0
  26. doc_page_extractor/rectangle.py +72 -0
  27. doc_page_extractor/rotation.py +158 -0
  28. doc_page_extractor/table.py +60 -0
  29. doc_page_extractor/types.py +68 -0
  30. doc_page_extractor/utils.py +32 -0
  31. doc_page_extractor-0.2.4.dist-info/LICENSE +661 -0
  32. doc_page_extractor-0.2.4.dist-info/METADATA +88 -0
  33. doc_page_extractor-0.2.4.dist-info/RECORD +34 -0
  34. doc_page_extractor-0.2.4.dist-info/WHEEL +4 -0
@@ -0,0 +1,16 @@
+ from .extractor import DocExtractor
+ from .clipper import clip, clip_from_image
+ from .plot import plot
+ from .rectangle import Point, Rectangle
+ from .model import Model, HuggingfaceModel
+ from .types import (
+   ExtractedResult,
+   OCRFragment,
+   LayoutClass,
+   TableLayoutParsedFormat,
+   Layout,
+   BaseLayout,
+   PlainLayout,
+   FormulaLayout,
+   TableLayout,
+ )
@@ -0,0 +1,119 @@
+ import numpy as np
+
+ from math import pi, ceil, sin, cos, sqrt
+ from PIL.Image import Image, Transform
+ from .types import Layout, ExtractedResult
+ from .rectangle import Rectangle
+ from .rotation import calculate_rotation_with_rect, normal_vertical_rotation
+
+
+ def clip(
+   extracted_result: ExtractedResult,
+   layout: Layout,
+   wrapped_width: float = 0.0,
+   wrapped_height: float = 0.0,
+ ) -> Image:
+   image: Image | None
+   if extracted_result.adjusted_image is None:
+     image = extracted_result.extracted_image
+   else:
+     image = extracted_result.adjusted_image
+   assert image is not None, "Image must not be None"
+   return clip_from_image(
+     image, layout.rect,
+     wrapped_width, wrapped_height,
+   )
+
+ def clip_from_image(
+   image: Image,
+   rect: Rectangle,
+   wrapped_width: float = 0.0,
+   wrapped_height: float = 0.0,
+ ) -> Image:
+   horizontal_rotation, vertical_rotation = calculate_rotation_with_rect(rect)
+   image = image.copy()
+   matrix_move = np.array(_get_move_matrix(rect.lt[0], rect.lt[1])).reshape(3, 3)
+   matrix_rotate = np.array(_get_rotate_matrix(-horizontal_rotation)).reshape(3, 3)
+   matrix = np.dot(matrix_move, matrix_rotate)
+
+   y_axis_rotation = normal_vertical_rotation(vertical_rotation - horizontal_rotation)
+
+   if abs(y_axis_rotation - 0.25 * pi) > 0.0:
+     x = cos(y_axis_rotation)
+     y = sin(y_axis_rotation)
+     matrix_shear = np.array(_get_shear_matrix(x, y)).reshape(3, 3)
+     matrix = np.dot(matrix, matrix_shear)
+
+   width, height, max_width, max_height = _size_and_wrapper(rect)
+   max_width += wrapped_width
+   max_height += wrapped_height
+
+   if max_width != width or max_height != height:
+     dx = (max_width - width) / 2.0
+     dy = (max_height - height) / 2.0
+     matrix_move = np.array(_get_move_matrix(-dx, -dy)).reshape(3, 3)
+     matrix = np.dot(matrix, matrix_move)
+
+   return image.transform(
+     size=(ceil(max_width), ceil(max_height)),
+     method=Transform.AFFINE,
+     data=_to_pillow_matrix(matrix),
+   )
+
+ def _size_and_wrapper(rect: Rectangle):
+   widths: list[float] = []
+   heights: list[float] = []
+
+   for i, (p1, p2) in enumerate(rect.segments):
+     dx = p2[0] - p1[0]
+     dy = p2[1] - p1[1]
+     distance = sqrt(dx*dx + dy*dy)
+     if i % 2 == 0:
+       heights.append(distance)
+     else:
+       widths.append(distance)
+
+   if len(widths) == 0 and len(heights) == 0:
+     return 0.0, 0.0, 0.0, 0.0
+
+   width: float = sum(widths) / len(widths)
+   height: float = sum(heights) / len(heights)
+   max_width: float = width
+   max_height: float = height
+
+   for width in widths:
+     if width > max_width:
+       max_width = width
+
+   for height in heights:
+     if height > max_height:
+       max_height = height
+
+   return width, height, max_width, max_height
+
+ def _to_pillow_matrix(matrix):
+   return (
+     matrix[0][0], matrix[0][1], matrix[0][2],
+     matrix[1][0], matrix[1][1], matrix[1][2],
+   )
+
+ def _get_move_matrix(dx: float, dy: float):
+   return (
+     1.0, 0.0, dx,
+     0.0, 1.0, dy,
+     0.0, 0.0, 1.0,
+   )
+
+ def _get_rotate_matrix(rotation: float):
+   return (
+     cos(rotation), sin(rotation), 0.0,
+     -sin(rotation), cos(rotation), 0.0,
+     0.0, 0.0, 1.0
+   )
+
+ def _get_shear_matrix(x0: float, y0: float):
+   return (
+     1.0, 0.0, 0.0,
+     x0, y0, 0.0,
+     0.0, 0.0, 1.0,
+   )
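
For reviewers, a minimal usage sketch of the clipping helpers introduced above. It is an illustration under assumptions, not part of the package: "example.png" and the coordinates are hypothetical, and the keyword form Rectangle(lt=..., rt=..., lb=..., rb=...) mirrors how extractor.py constructs rectangles later in this diff.

from PIL import Image
from doc_page_extractor import Rectangle, clip_from_image

# hypothetical page image and region coordinates
image = Image.open("example.png")
rect = Rectangle(
  lt=(100.0, 200.0), rt=(400.0, 200.0),
  lb=(100.0, 260.0), rb=(400.0, 260.0),
)
# clip_from_image warps the region upright and returns a new PIL image;
# wrapped_width / wrapped_height add padding around the detected box
clipped = clip_from_image(image, rect, wrapped_width=4.0, wrapped_height=4.0)
clipped.save("clipped.png")
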
@@ -0,0 +1,16 @@
+ import os
+ import requests
+ from pathlib import Path
+
+
+ def download(url: str, file_path: Path):
+   response = requests.get(url, stream=True, timeout=60)
+   if response.status_code != 200:
+     raise FileNotFoundError(f"Failed to download file from {url}: {response.status_code}")
+   try:
+     with open(file_path, "wb") as file:
+       file.write(response.content)
+   except Exception as e:
+     if os.path.exists(file_path):
+       os.remove(file_path)
+     raise e
@@ -0,0 +1,218 @@
+ import torch
+
+ from os import PathLike
+ from typing import cast, Any, Literal, Generator
+ from PIL.Image import Image
+ from doclayout_yolo import YOLOv10
+
+ from .model import Model, HuggingfaceModel
+ from .ocr import OCR
+ from .ocr_corrector import correct_fragments
+ from .raw_optimizer import RawOptimizer
+ from .rectangle import intersection_area, Rectangle
+ from .table import Table
+ from .latex import LaTeX
+ from .layout_order import LayoutOrder
+ from .overlap import merge_fragments_as_line, remove_overlap_layouts
+ from .clipper import clip_from_image
+ from .types import (
+   ExtractedResult,
+   OCRFragment,
+   Layout,
+   LayoutClass,
+   PlainLayout,
+   TableLayout,
+   FormulaLayout,
+   TableLayoutParsedFormat
+ )
+
+
+ class DocExtractor:
+   def __init__(
+     self,
+     model_cache_dir: PathLike | None = None,
+     device: Literal["cpu", "cuda"] = "cpu",
+     model: Model | None = None,
+   ) -> None:
+
+     if model is None:
+       if model_cache_dir is None:
+         raise ValueError("You must provide a model_cache_dir or a model instance.")
+       model = HuggingfaceModel(model_cache_dir)
+
+     if device == "cuda" and not torch.cuda.is_available():
+       device = "cpu"
+       print("CUDA is not available. Using CPU instead.")
+
+     self._device: Literal["cpu", "cuda"] = device
+     self._model: Model = model
+     self._yolo: YOLOv10 | None = None
+     self._ocr: OCR = OCR(device, model)
+     self._table: Table = Table(device, model)
+     self._latex: LaTeX = LaTeX(device, model)
+     self._layout_order: LayoutOrder = LayoutOrder(device, model)
+
+   def prepare_models(self):
+     self._model.get_onnx_ocr_path()
+     self._model.get_yolo_path()
+     self._model.get_layoutreader_path()
+     self._model.get_struct_eqtable_path()
+     self._model.get_latex_path()
+
+   def extract(
+     self,
+     image: Image,
+     extract_formula: bool,
+     extract_table_format: TableLayoutParsedFormat | None = None,
+     ocr_for_each_layouts: bool = False,
+     adjust_points: bool = False
+   ) -> ExtractedResult:
+
+     raw_optimizer = RawOptimizer(image, adjust_points)
+     fragments = list(self._ocr.search_fragments(raw_optimizer.image_np))
+     raw_optimizer.receive_raw_fragments(fragments)
+     layouts = list(self._yolo_extract_layouts(raw_optimizer.image))
+     layouts = self._layouts_matched_by_fragments(fragments, layouts)
+     layouts = remove_overlap_layouts(layouts)
+
+     if ocr_for_each_layouts:
+       self._correct_fragments_by_ocr_layouts(raw_optimizer.image, layouts)
+
+     layouts = self._layout_order.sort(layouts, raw_optimizer.image.size)
+     layouts = [layout for layout in layouts if self._should_keep_layout(layout)]
+
+     self._parse_table_and_formula_layouts(layouts, raw_optimizer, extract_formula=extract_formula, extract_table_format=extract_table_format)
+
+     for layout in layouts:
+       layout.fragments = merge_fragments_as_line(layout.fragments)
+
+     raw_optimizer.receive_raw_layouts(layouts)
+
+     return ExtractedResult(
+       rotation=raw_optimizer.rotation,
+       layouts=layouts,
+       extracted_image=image,
+       adjusted_image=raw_optimizer.adjusted_image,
+     )
+
+   def _yolo_extract_layouts(self, source: Image) -> Generator[Layout, None, None]:
+     # about source parameter to see:
+     # https://github.com/opendatalab/DocLayout-YOLO/blob/7c4be36bc61f11b67cf4a44ee47f3c41e9800a91/doclayout_yolo/data/build.py#L157-L175
+     det_res = self._get_yolo().predict(
+       source=cast(Any, source),
+       imgsz=1024,
+       conf=0.2,
+       device=self._device # Device to use (e.g., "cuda" or "cpu")
+     )
+     boxes = det_res[0].__dict__["boxes"]
+
+     for cls_id, rect in zip(boxes.cls, boxes.xyxy):
+       cls_id = cls_id.item()
+       cls = LayoutClass(round(cls_id))
+
+       x1, y1, x2, y2 = rect
+       x1 = x1.item()
+       y1 = y1.item()
+       x2 = x2.item()
+       y2 = y2.item()
+       rect = Rectangle(
+         lt=(x1, y1),
+         rt=(x2, y1),
+         lb=(x1, y2),
+         rb=(x2, y2),
+       )
+       if rect.is_valid:
+         if cls == LayoutClass.TABLE:
+           yield TableLayout(cls=cls, rect=rect, fragments=[], parsed=None)
+         elif cls == LayoutClass.ISOLATE_FORMULA:
+           yield FormulaLayout(cls=cls, rect=rect, fragments=[], latex=None)
+         else:
+           yield PlainLayout(cls=cls, rect=rect, fragments=[])
+
+   def _layouts_matched_by_fragments(self, fragments: list[OCRFragment], layouts: list[Layout]):
+     layouts_group = self._split_layouts_by_group(layouts)
+     for fragment in fragments:
+       for sub_layouts in layouts_group:
+         layout = self._find_matched_layout(fragment, sub_layouts)
+         if layout is not None:
+           layout.fragments.append(fragment)
+           break
+     return layouts
+
+   def _correct_fragments_by_ocr_layouts(self, source: Image, layouts: list[Layout]):
+     for layout in layouts:
+       correct_fragments(self._ocr, source, layout)
+
+   def _parse_table_and_formula_layouts(
+     self,
+     layouts: list[Layout],
+     raw_optimizer: RawOptimizer,
+     extract_formula: bool,
+     extract_table_format: TableLayoutParsedFormat | None,
+   ):
+     for layout in layouts:
+       if isinstance(layout, FormulaLayout) and extract_formula:
+         image = clip_from_image(raw_optimizer.image, layout.rect)
+         layout.latex = self._latex.extract(image)
+       elif isinstance(layout, TableLayout) and extract_table_format is not None:
+         image = clip_from_image(raw_optimizer.image, layout.rect)
+         parsed = self._table.predict(image, extract_table_format)
+         if parsed is not None:
+           layout.parsed = (parsed, extract_table_format)
+
+   def _split_layouts_by_group(self, layouts: list[Layout]):
+     texts_layouts: list[Layout] = []
+     abandon_layouts: list[Layout] = []
+
+     for layout in layouts:
+       cls = layout.cls
+       if cls == LayoutClass.TITLE or \
+          cls == LayoutClass.PLAIN_TEXT or \
+          cls == LayoutClass.FIGURE_CAPTION or \
+          cls == LayoutClass.TABLE_CAPTION or \
+          cls == LayoutClass.TABLE_FOOTNOTE or \
+          cls == LayoutClass.FORMULA_CAPTION:
+         texts_layouts.append(layout)
+       elif cls == LayoutClass.ABANDON:
+         abandon_layouts.append(layout)
+
+     return texts_layouts, abandon_layouts
+
+   def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
+     fragment_area = fragment.rect.area
+     primary_layouts: list[tuple[Layout, float]] = []
+
+     if fragment_area == 0.0:
+       return None
+
+     for layout in layouts:
+       area = intersection_area(fragment.rect, layout.rect)
+       if area / fragment_area > 0.85:
+         primary_layouts.append((layout, layout.rect.area))
+
+     min_area: float = float("inf")
+     min_layout: Layout | None = None
+
+     for layout, area in primary_layouts:
+       if area < min_area:
+         min_area = area
+         min_layout = layout
+
+     return min_layout
+
+   def _get_yolo(self) -> YOLOv10:
+     if self._yolo is None:
+       model_path = self._model.get_yolo_path()
+       self._yolo = YOLOv10(str(model_path))
+     return self._yolo
+
+   def _should_keep_layout(self, layout: Layout) -> bool:
+     if len(layout.fragments) > 0:
+       return True
+     cls = layout.cls
+     return (
+       cls == LayoutClass.FIGURE or
+       cls == LayoutClass.TABLE or
+       cls == LayoutClass.ISOLATE_FORMULA
+     )
+
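
A hedged end-to-end sketch of the public entry point defined above, based only on the DocExtractor.__init__ and extract signatures visible in this diff; the ./models directory and page.png are hypothetical paths.

from pathlib import Path
from PIL import Image
from doc_page_extractor import DocExtractor

# models are resolved through HuggingfaceModel under the given cache directory;
# device falls back to "cpu" automatically when CUDA is unavailable
extractor = DocExtractor(model_cache_dir=Path("./models"), device="cpu")
extractor.prepare_models()  # optional: resolve all model paths up front

with Image.open("page.png") as image:
  result = extractor.extract(
    image,
    extract_formula=True,        # run LaTeX extraction on isolated formula layouts
    extract_table_format=None,   # skip table parsing in this sketch
    ocr_for_each_layouts=False,
    adjust_points=False,
  )

# ExtractedResult carries the ordered layouts plus the original and adjusted images
for layout in result.layouts:
  print(layout.cls, len(layout.fragments))
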
@@ -0,0 +1,33 @@
+ import os
+ import torch
+
+ from munch import Munch
+ from pix2tex.cli import LatexOCR
+ from PIL.Image import Image
+ from typing import Literal
+ from .utils import expand_image
+ from .model import Model
+
+
+ class LaTeX:
+   def __init__(self, device: Literal["cpu", "cuda"], model: Model) -> None:
+     self._model: Model = model
+     self._latex_model: LatexOCR | None = None
+     self._device: Literal["cpu", "cuda"] = device
+
+   def extract(self, image: Image) -> str | None:
+     image = expand_image(image, 0.1)  # add margins to improve recognition accuracy
+     model = self._get_model()
+     with torch.no_grad():
+       return model(image)
+
+   def _get_model(self) -> LatexOCR:
+     if self._latex_model is None:
+       model_path = self._model.get_latex_path()
+       self._latex_model = LatexOCR(Munch({
+         "config": os.path.join("settings", "config.yaml"),
+         "checkpoint": os.path.join(model_path, "checkpoints", "weights.pth"),
+         "no_cuda": self._device == "cpu",
+         "no_resize": False,
+       }))
+     return self._latex_model
@@ -0,0 +1,239 @@
+ import torch
+
+ from typing import Generator, Literal
+ from dataclasses import dataclass
+ from transformers import LayoutLMv3ForTokenClassification
+
+ from .types import Layout, LayoutClass
+ from .model import Model
+ from .layoutreader import prepare_inputs, boxes2inputs, parse_logits
+
+
+ @dataclass
+ class _BBox:
+   layout_index: int
+   fragment_index: int
+   virtual: bool
+   order: int
+   value: tuple[float, float, float, float]
+
+ class LayoutOrder:
+   def __init__(self, device: Literal["cpu", "cuda"], model: Model):
+     self._model: Model = model
+     self._order_model: LayoutLMv3ForTokenClassification | None = None
+     self._device: Literal["cpu", "cuda"] = device
+
+   def _get_model(self) -> LayoutLMv3ForTokenClassification:
+     if self._order_model is None:
+       model_path = self._model.get_layoutreader_path()
+       self._order_model = LayoutLMv3ForTokenClassification.from_pretrained(
+         pretrained_model_name_or_path=model_path,
+         local_files_only=True,
+       ).to(device=self._device)
+     return self._order_model
+
+   def sort(self, layouts: list[Layout], size: tuple[int, int]) -> list[Layout]:
+     width, height = size
+     if width == 0 or height == 0:
+       return layouts
+
+     bbox_list = self._order_and_get_bbox_list(
+       layouts=layouts,
+       width=width,
+       height=height,
+     )
+     if bbox_list is None:
+       return layouts
+
+     return self._sort_layouts_and_fragments(layouts, bbox_list)
+
+   def _order_and_get_bbox_list(
+     self,
+     layouts: list[Layout],
+     width: int,
+     height: int,
+   ) -> list[_BBox] | None:
+
+     line_height = self._line_height(layouts)
+     bbox_list: list[_BBox] = []
+
+     for i, layout in enumerate(layouts):
+       if layout.cls == LayoutClass.PLAIN_TEXT and \
+          len(layout.fragments) > 0:
+         for j, fragment in enumerate(layout.fragments):
+           bbox_list.append(_BBox(
+             layout_index=i,
+             fragment_index=j,
+             virtual=False,
+             order=0,
+             value=fragment.rect.wrapper,
+           ))
+       else:
+         bbox_list.extend(
+           self._generate_virtual_lines(
+             layout=layout,
+             layout_index=i,
+             line_height=line_height,
+             width=width,
+             height=height,
+           ),
+         )
+
+     if len(bbox_list) > 200:
+       # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L522
+       return None
+
+     layoutreader_size = 1000.0
+     x_scale = layoutreader_size / float(width)
+     y_scale = layoutreader_size / float(height)
+
+     for bbox in bbox_list:
+       x0, y0, x1, y1 = self._squeeze(bbox, width, height)
+       x0 = round(x0 * x_scale)
+       y0 = round(y0 * y_scale)
+       x1 = round(x1 * x_scale)
+       y1 = round(y1 * y_scale)
+       bbox.value = (x0, y0, x1, y1)
+
+     bbox_list.sort(key=lambda b: b.value)  # must be sorted: boxes passed to layoutreader out of order keep it from recovering the correct reading order
+     model = self._get_model()
+
+     with torch.no_grad():
+       inputs = boxes2inputs([list(bbox.value) for bbox in bbox_list])
+       inputs = prepare_inputs(inputs, model)
+       logits = model(**inputs).logits.cpu().squeeze(0)
+       orders = parse_logits(logits, len(bbox_list))
+
+     sorted_bbox_list = [bbox_list[i] for i in orders]
+     for i, bbox in enumerate(sorted_bbox_list):
+       bbox.order = i
+
+     return sorted_bbox_list
+
+   def _sort_layouts_and_fragments(self, layouts: list[Layout], bbox_list: list[_BBox]):
+     layout_bbox_list: list[list[_BBox]] = [[] for _ in range(len(layouts))]
+     for bbox in bbox_list:
+       layout_bbox_list[bbox.layout_index].append(bbox)
+
+     layouts_with_median_order: list[tuple[Layout, float]] = []
+     for layout_index, bbox_list in enumerate(layout_bbox_list):
+       layout = layouts[layout_index]
+       orders = [b.order for b in bbox_list]  # the virtual bboxes guarantee that orders is never empty
+       median_order = self._median(orders)
+       layouts_with_median_order.append((layout, median_order))
+
+     for layout, bbox_list in zip(layouts, layout_bbox_list):
+       for bbox in bbox_list:
+         if not bbox.virtual:
+           layout.fragments[bbox.fragment_index].order = bbox.order
+       if all(not bbox.virtual for bbox in bbox_list):
+         layout.fragments.sort(key=lambda f: f.order)
+
+     layouts_with_median_order.sort(key=lambda x: x[1])
+     layouts = [layout for layout, _ in layouts_with_median_order]
+     next_fragment_order: int = 0
+
+     for layout in layouts:
+       for fragment in layout.fragments:
+         fragment.order = next_fragment_order
+         next_fragment_order += 1
+
+     return layouts
+
+   def _line_height(self, layouts: list[Layout]) -> float:
+     line_height: float = 0.0
+     count: int = 0
+     for layout in layouts:
+       for fragment in layout.fragments:
+         _, height = fragment.rect.size
+         line_height += height
+         count += 1
+     if count == 0:
+       return 10.0
+     return line_height / float(count)
+
+   def _generate_virtual_lines(
+     self,
+     layout: Layout,
+     layout_index: int,
+     line_height: float,
+     width: int,
+     height: int,
+   ) -> Generator[_BBox, None, None]:
+
+     # https://github.com/opendatalab/MinerU/blob/980f5c8cd70f22f8c0c9b7b40eaff6f4804e6524/magic_pdf/pdf_parse_union_core_v2.py#L451-L490
+     x0, y0, x1, y1 = layout.rect.wrapper
+     layout_height = y1 - y0
+     layout_weight = x1 - x0
+     lines = int(layout_height / line_height)
+
+     if layout_height <= line_height * 2:
+       yield _BBox(
+         layout_index=layout_index,
+         fragment_index=0,
+         virtual=True,
+         order=0,
+         value=(x0, y0, x1, y1),
+       )
+       return
+
+     elif layout_height <= height * 0.25 or \
+          width * 0.5 <= layout_weight or \
+          width * 0.25 < layout_weight:
+       if layout_weight > width * 0.4:
+         lines = 3
+       elif layout_weight <= width * 0.25:
+         if layout_height / layout_weight > 1.2:  # tall and narrow: do not split
+           yield _BBox(
+             layout_index=layout_index,
+             fragment_index=0,
+             virtual=True,
+             order=0,
+             value=(x0, y0, x1, y1),
+           )
+           return
+         else:  # not tall and narrow: still split into two lines
+           lines = 2
+
+     lines = max(1, lines)
+     line_height = (y1 - y0) / lines
+     current_y = y0
+
+     for i in range(lines):
+       yield _BBox(
+         layout_index=layout_index,
+         fragment_index=i,
+         virtual=True,
+         order=0,
+         value=(x0, current_y, x1, current_y + line_height),
+       )
+       current_y += line_height
+
+   def _median(self, numbers: list[int]) -> float:
+     sorted_numbers = sorted(numbers)
+     n = len(sorted_numbers)
+
+     # check whether the count of elements is odd or even
+     if n % 2 == 1:
+       # odd count: take the middle value directly
+       return float(sorted_numbers[n // 2])
+     else:
+       # even count: take the average of the two middle values
+       mid1 = sorted_numbers[n // 2 - 1]
+       mid2 = sorted_numbers[n // 2]
+       return float((mid1 + mid2) / 2)
+
+   def _squeeze(self, bbox: _BBox, width: int, height: int) -> tuple[float, float, float, float]:
+     x0, y0, x1, y1 = bbox.value
+     x0 = self._squeeze_value(x0, width)
+     x1 = self._squeeze_value(x1, width)
+     y0 = self._squeeze_value(y0, height)
+     y1 = self._squeeze_value(y1, height)
+     return x0, y0, x1, y1
+
+   def _squeeze_value(self, position: float, size: int) -> float:
+     if position < 0:
+       position = 0.0
+     if position > size:
+       position = float(size)
+ return position