docling 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +59 -0
- docling/backend/docling_parse_backend.py +207 -0
- docling/backend/pypdfium2_backend.py +233 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/base_models.py +312 -0
- docling/datamodel/document.py +363 -0
- docling/datamodel/settings.py +32 -0
- docling/document_converter.py +276 -0
- docling/models/__init__.py +0 -0
- docling/models/base_ocr_model.py +124 -0
- docling/models/ds_glm_model.py +82 -0
- docling/models/easyocr_model.py +70 -0
- docling/models/layout_model.py +328 -0
- docling/models/page_assemble_model.py +148 -0
- docling/models/table_structure_model.py +144 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/base_model_pipeline.py +17 -0
- docling/pipeline/standard_model_pipeline.py +38 -0
- docling/utils/__init__.py +0 -0
- docling/utils/layout_utils.py +806 -0
- docling/utils/utils.py +41 -0
- docling-1.6.2.dist-info/LICENSE +21 -0
- docling-1.6.2.dist-info/METADATA +192 -0
- docling-1.6.2.dist-info/RECORD +27 -0
- docling-1.6.2.dist-info/WHEEL +4 -0
@@ -0,0 +1,312 @@
|
|
1
|
+
import copy
|
2
|
+
import warnings
|
3
|
+
from enum import Enum, auto
|
4
|
+
from io import BytesIO
|
5
|
+
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
6
|
+
|
7
|
+
from PIL.Image import Image
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
9
|
+
from typing_extensions import Self
|
10
|
+
|
11
|
+
from docling.backend.abstract_backend import PdfPageBackend
|
12
|
+
|
13
|
+
|
14
|
+
class ConversionStatus(str, Enum):
    """Lifecycle states of a document conversion.

    Members are also ``str`` instances because of the ``str`` mixin; their
    values are generated by ``auto()`` in declaration order.
    """

    # Not started yet.
    PENDING = auto()
    # Conversion is running.
    STARTED = auto()
    # Conversion ended without a usable result.
    FAILURE = auto()
    # Conversion ended cleanly.
    SUCCESS = auto()
    # Conversion produced a result, but errors were recorded.
    SUCCESS_WITH_ERRORS = auto()
|
20
|
+
|
21
|
+
|
22
|
+
class DocInputType(str, Enum):
    """Kinds of input a document can be supplied as."""

    # Input given as a filesystem path.
    PATH = auto()
    # Input given as an in-memory stream.
    STREAM = auto()
|
25
|
+
|
26
|
+
|
27
|
+
class CoordOrigin(str, Enum):
    """Where the (0, 0) point of a coordinate system is located."""

    # Origin in the top-left corner.
    TOPLEFT = auto()
    # Origin in the bottom-left corner.
    BOTTOMLEFT = auto()
|
30
|
+
|
31
|
+
|
32
|
+
class PageSize(BaseModel):
    """Width and height of a page; both default to ``0.0``."""

    width: float = 0.0
    height: float = 0.0
|
35
|
+
|
36
|
+
|
37
|
+
class BoundingBox(BaseModel):
    """Axis-aligned rectangle with an explicit coordinate origin.

    With ``CoordOrigin.TOPLEFT`` the y axis grows downwards, so a normalised
    box has ``t <= b``. With ``CoordOrigin.BOTTOMLEFT`` it grows upwards, so a
    normalised box has ``b <= t`` (this is what ``from_tuple`` produces).
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self) -> float:
        """Horizontal extent (``r - l``)."""
        return self.r - self.l

    @property
    def height(self) -> float:
        """Vertical extent, independent of the coordinate origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a copy with all four coordinates multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def as_tuple(self) -> Tuple[float, float, float, float]:
        """Return the coordinates as a 4-tuple.

        The order is ``(l, t, r, b)`` for a top-left origin and
        ``(l, b, r, t)`` for a bottom-left origin.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)
        # CoordOrigin has only two members, so everything else is TOPLEFT.
        return (self.l, self.t, self.r, self.b)

    @classmethod
    def from_tuple(
        cls, coord: Tuple[float, ...], origin: CoordOrigin
    ) -> "BoundingBox":
        """Build a normalised box from the first four values of ``coord``.

        ``coord`` is read in the same order that ``as_tuple`` emits for the
        given ``origin``. Swapped left/right or top/bottom values are put
        back in order.

        Raises:
            ValueError: if ``origin`` is not a known ``CoordOrigin``.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b
        else:
            # Previously this fell through and silently returned None.
            raise ValueError(f"Unsupported coordinate origin: {origin!r}")

        return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the (non-negative) area of the box.

        The vertical extent is taken as an absolute value so that boxes with
        a bottom-left origin (where ``t > b``) do not yield a negative area.
        """
        return (self.r - self.l) * abs(self.b - self.t)

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area with ``other``, or ``0.0`` if disjoint.

        NOTE(review): both boxes are assumed to share this box's
        ``coord_origin``; mixed origins cannot be reconciled here because the
        page height is not known.
        """
        left = max(self.l, other.l)
        right = min(self.r, other.r)
        width = right - left

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # y grows upwards: the shared top is the lower of the two tops.
            height = min(self.t, other.t) - max(self.b, other.b)
        else:
            # y grows downwards: the shared top is the larger t value.
            height = min(self.b, other.b) - max(self.t, other.t)

        # If the bounding boxes do not overlap, width or height is not positive.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
        """Return this box expressed with a bottom-left origin.

        A box that already uses the bottom-left origin is returned as-is
        (same object); otherwise the y values are mirrored on ``page_height``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height) -> "BoundingBox":
        """Return this box expressed with a top-left origin.

        A box that already uses the top-left origin is returned as-is
        (same object); otherwise the y values are mirrored on ``page_height``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,  # self.b
                b=page_height - self.b,  # self.t
                coord_origin=CoordOrigin.TOPLEFT,
            )
|
130
|
+
|
131
|
+
|
132
|
+
class Cell(BaseModel):
    """A piece of text on a page together with its bounding box."""

    id: int
    text: str
    bbox: BoundingBox
|
136
|
+
|
137
|
+
|
138
|
+
class OcrCell(Cell):
    """A ``Cell`` that additionally carries a confidence value."""

    confidence: float
|
140
|
+
|
141
|
+
|
142
|
+
class Cluster(BaseModel):
    """A labelled region of a page and the cells assigned to it."""

    id: int
    label: str
    bbox: BoundingBox
    confidence: float = 1.0  # defaults to full confidence
    cells: List[Cell] = []  # empty unless cells are assigned
|
148
|
+
|
149
|
+
|
150
|
+
class BasePageElement(BaseModel):
    """Fields shared by all page elements (text, table, figure)."""

    label: str
    id: int
    page_no: int
    cluster: Cluster
    text: Optional[str] = None  # not every element has text
|
156
|
+
|
157
|
+
|
158
|
+
class LayoutPrediction(BaseModel):
    """Layout result for a page: a list of clusters, empty by default."""

    clusters: List[Cluster] = []
|
160
|
+
|
161
|
+
|
162
|
+
class TableCell(BaseModel):
    """A single cell of a predicted table structure."""

    bbox: BoundingBox
    row_span: int
    col_span: int
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Derive ``text`` from a dict-shaped ``bbox`` before validation.

        When ``data`` is a dict whose ``bbox`` entry is itself a dict, the
        cell text is taken from ``bbox["token"]``. If that is empty, the
        ``token`` values of ``text_cell_bboxes`` are joined with spaces
        instead (that key is removed from ``data``). ``data`` is modified in
        place and returned.

        Input whose ``bbox`` is missing or not a dict (for example an already
        built ``BoundingBox``) is returned untouched, so regular field
        validation applies and an explicitly supplied ``text`` is kept.
        """
        if not isinstance(data, dict):
            return data

        bbox = data.get("bbox")
        if not isinstance(bbox, dict):
            # Nothing to extract; previously this raised KeyError or
            # AttributeError instead of reaching field validation.
            return data

        text = bbox.get("token", "")
        if not text:
            text_cells = data.pop("text_cell_bboxes", None)
            if text_cells:
                text = " ".join(el["token"] for el in text_cells)
            text = (text or "").strip()

        data["text"] = text
        return data
|
190
|
+
|
191
|
+
|
192
|
+
class TableElement(BasePageElement):
    """Page element describing a table and its cells."""

    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: List[TableCell]
|
197
|
+
|
198
|
+
|
199
|
+
class TableStructurePrediction(BaseModel):
    """Table elements keyed by an integer id; empty by default."""

    table_map: Dict[int, TableElement] = {}
|
201
|
+
|
202
|
+
|
203
|
+
class TextElement(BasePageElement):
    """Page element for text; adds no fields to ``BasePageElement``."""
|
204
|
+
|
205
|
+
|
206
|
+
class FigureData(BaseModel):
    """Placeholder model for figure data; it currently declares no fields."""
|
208
|
+
|
209
|
+
|
210
|
+
class FigureElement(BasePageElement):
    """Page element for a figure; every extra field is optional."""

    data: Optional[FigureData] = None
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
|
215
|
+
|
216
|
+
|
217
|
+
class FigureClassificationPrediction(BaseModel):
    """Figure elements keyed by an integer id, plus a figure counter."""

    figure_count: int = 0
    figure_map: Dict[int, FigureElement] = {}
|
220
|
+
|
221
|
+
|
222
|
+
class EquationPrediction(BaseModel):
    """Equation text elements keyed by an integer id, plus a counter."""

    equation_count: int = 0
    equation_map: Dict[int, TextElement] = {}
|
225
|
+
|
226
|
+
|
227
|
+
class PagePredictions(BaseModel):
    """Container for the per-page model predictions.

    Every field starts as ``None``. The annotations are ``Optional`` so that
    they match those defaults and so that ``None`` can also be passed
    explicitly.
    """

    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
|
232
|
+
|
233
|
+
|
234
|
+
# Type alias covering every concrete page element kind defined in this module.
PageElement = Union[TextElement, TableElement, FigureElement]
|
235
|
+
|
236
|
+
|
237
|
+
class AssembledUnit(BaseModel):
    """Three lists of page elements: all elements, body and headers."""

    elements: List[PageElement]
    body: List[PageElement]
    headers: List[PageElement]
|
241
|
+
|
242
|
+
|
243
|
+
class Page(BaseModel):
    """A single page with its cells, predictions and cached images."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    page_hash: Optional[str] = None
    size: Optional[PageSize] = None
    cells: Optional[List[Cell]] = None  # Optional to match the None default
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    _backend: Optional[PdfPageBackend] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
    _image_cache: Dict[float, Image] = (
        {}
    )  # Cache of images in different scales. By default it is cleared during assembling.

    def get_image(self, scale: float = 1.0) -> Optional[Image]:
        """Return the page image at ``scale``.

        Without a backend only the cache is consulted, and ``None`` is
        returned on a miss. With a backend, a missing image is requested from
        the backend and cached before being returned.
        """
        if self._backend is None:
            return self._image_cache.get(scale, None)
        if scale not in self._image_cache:
            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
        return self._image_cache[scale]

    @property
    def image(self) -> Optional[Image]:
        """Page image at the default image scale (see ``get_image``)."""
        return self.get_image(scale=self._default_image_scale)
|
271
|
+
|
272
|
+
|
273
|
+
class DocumentStream(BaseModel):
    """An in-memory document: a ``BytesIO`` stream plus its filename."""

    # BytesIO is not a pydantic-native type, so arbitrary types are allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    filename: str
    stream: BytesIO
|
278
|
+
|
279
|
+
|
280
|
+
class TableStructureOptions(BaseModel):
    """Options for table structure extraction."""

    # True: Matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True
|
287
|
+
|
288
|
+
|
289
|
+
class PipelineOptions(BaseModel):
    """Switches for the individual stages of the conversion pipeline."""

    # True: perform table structure extraction
    do_table_structure: bool = True
    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    table_structure_options: TableStructureOptions = TableStructureOptions()
|
294
|
+
|
295
|
+
|
296
|
+
class AssembleOptions(BaseModel):
    """Options for the assemble step.

    ``keep_page_images`` is deprecated in favour of ``images_scale``. For
    backward compatibility, setting it without an explicit ``images_scale``
    results in an ``images_scale`` of ``1.0``.
    """

    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Translate the deprecated flag into an ``images_scale`` value."""
        # DeprecationWarnings are ignored while the deprecated field is read
        # here, so this internal access does not warn.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
|
@@ -0,0 +1,363 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path, PurePath
|
4
|
+
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
|
+
|
6
|
+
from docling_core.types import BaseCell, BaseText
|
7
|
+
from docling_core.types import BoundingBox as DsBoundingBox
|
8
|
+
from docling_core.types import Document as DsDocument
|
9
|
+
from docling_core.types import DocumentDescription as DsDocumentDescription
|
10
|
+
from docling_core.types import FileInfoObject as DsFileInfoObject
|
11
|
+
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
12
|
+
from docling_core.types import Table as DsSchemaTable
|
13
|
+
from docling_core.types import TableCell
|
14
|
+
from pydantic import BaseModel
|
15
|
+
|
16
|
+
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
|
+
from docling.datamodel.base_models import (
|
19
|
+
AssembledUnit,
|
20
|
+
ConversionStatus,
|
21
|
+
DocumentStream,
|
22
|
+
FigureElement,
|
23
|
+
Page,
|
24
|
+
PageElement,
|
25
|
+
TableElement,
|
26
|
+
TextElement,
|
27
|
+
)
|
28
|
+
from docling.datamodel.settings import DocumentLimits
|
29
|
+
from docling.utils.utils import create_file_hash
|
30
|
+
|
31
|
+
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
|
32
|
+
|
33
|
+
# Maps layout labels to the type strings written to the output document
# (passed as ``obj_type``).
layout_label_to_ds_type = {
    "Title": "title",
    # Was "table-of-path_or_stream", which looks like an accidental
    # search-and-replace of "contents".
    "Document Index": "table-of-contents",
    "Section-header": "subtitle-level-1",
    "Checkbox-Selected": "checkbox-selected",
    "Checkbox-Unselected": "checkbox-unselected",
    "Caption": "caption",
    "Page-header": "page-header",
    "Page-footer": "page-footer",
    "Footnote": "footnote",
    "Table": "table",
    "Formula": "equation",
    "List-item": "paragraph",
    "Code": "paragraph",
    "Picture": "figure",
    "Text": "paragraph",
}
|
50
|
+
|
51
|
+
|
52
|
+
class InputDocument(BaseModel):
    """A document to convert, opened and checked against ``DocumentLimits``.

    ``valid`` is only set to ``True`` when the input is within the file size
    limit, a hash could be computed, the backend reports at least one page
    and the page count is within the page limit.
    """

    file: Optional[PurePath] = None
    document_hash: Optional[str] = None
    valid: bool = False
    limits: DocumentLimits = DocumentLimits()

    filesize: Optional[int] = None
    page_count: Optional[int] = None

    _backend: Optional[PdfDocumentBackend] = None  # Internal PDF backend used

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
        pdf_backend=DoclingParseDocumentBackend,
    ):
        """Open ``path_or_stream`` and populate the document metadata.

        Args:
            path_or_stream: a ``Path`` to a file or a ``BytesIO`` stream.
            filename: name to use for a stream input (used to build ``file``).
            limits: size/page limits; defaults to ``DocumentLimits()``.
            pdf_backend: callable invoked as
                ``pdf_backend(path_or_stream=...)`` to create the backend.

        ``FileNotFoundError``, ``OSError`` and ``RuntimeError`` raised while
        opening are logged and not re-raised; the document then stays
        invalid.
        """
        super().__init__()

        self.limits = limits or DocumentLimits()

        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
            elif isinstance(path_or_stream, BytesIO):
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

            # Hash and open only supported inputs within the size limit;
            # otherwise the document keeps valid=False and no backend.
            if (
                self.filesize is not None
                and self.filesize <= self.limits.max_file_size
            ):
                self.document_hash = create_file_hash(path_or_stream)
                self._backend = pdf_backend(path_or_stream=path_or_stream)

            if self.document_hash and self._backend is not None:
                # Query the backend once and reuse the result.
                num_pages = self._backend.page_count()
                if num_pages > 0:
                    self.page_count = num_pages

                    if num_pages <= self.limits.max_num_pages:
                        self.valid = True

        except (FileNotFoundError, OSError) as e:
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
            _log.exception(
                f"An unexpected error occurred while opening the document {self.file.name}",
                exc_info=e,
            )
            # raise
|
111
|
+
|
112
|
+
|
113
|
+
class ConvertedDocument(BaseModel):
    """Result of converting one ``InputDocument``."""

    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
    errors: List[Dict] = []  # structure to keep errors

    pages: List[Page] = []
    assembled: Optional[AssembledUnit] = None

    output: Optional[DsDocument] = None

    def to_ds_document(self) -> DsDocument:
        """Build a ``DsDocument`` from the assembled elements.

        Text elements are added to the main text directly. Tables and
        figures are stored in their own lists and referenced from the main
        text via ``#/tables/<n>`` and ``#/figures/<n>``. Bounding boxes are
        converted to a bottom-left origin, and page numbers are emitted as
        ``page_no + 1``.
        """
        title = ""
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
            PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
            for p in self.pages
        ]

        file_info = DsFileInfoObject(
            filename=self.input.file.name,
            document_hash=self.input.document_hash,
            num_pages=self.input.page_count,
            page_hashes=page_hashes,
        )

        main_text = []
        tables = []
        figures = []

        page_no_to_page = {p.page_no: p for p in self.pages}

        for element in self.assembled.elements:
            page_height = page_no_to_page[element.page_no].size.height
            obj_type = layout_label_to_ds_type.get(element.label)

            # Convert bboxes to lower-left origin.
            target_bbox = DsBoundingBox(
                element.cluster.bbox.to_bottom_left_origin(page_height).as_tuple()
            )

            if isinstance(element, TextElement):
                main_text.append(
                    BaseText(
                        text=element.text,
                        obj_type=obj_type,
                        name=element.label,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no + 1,
                                span=[0, len(element.text)],
                            )
                        ],
                    )
                )
            elif isinstance(element, TableElement):
                index = len(tables)
                ref_str = f"#/tables/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=obj_type,
                        ref=ref_str,
                    ),
                )

                # Initialise empty table data grid (only empty cells)
                table_data = [
                    [
                        TableCell(
                            text="",
                            # bbox=[0,0,0,0],
                            spans=[[i, j]],
                            obj_type="body",
                        )
                        for j in range(element.num_cols)
                    ]
                    for i in range(element.num_rows)
                ]

                # Overwrite cells in table data for which there is actual cell content.
                for cell in element.table_cells:
                    # Clamp the cell's offsets to the declared table size.
                    row_range = range(
                        min(cell.start_row_offset_idx, element.num_rows),
                        min(cell.end_row_offset_idx, element.num_rows),
                    )
                    col_range = range(
                        min(cell.start_col_offset_idx, element.num_cols),
                        min(cell.end_col_offset_idx, element.num_cols),
                    )

                    # These values depend only on the cell, not on the grid
                    # position, so they are computed once per cell.
                    celltype = "body"
                    if cell.column_header:
                        celltype = "col_header"
                    elif cell.row_header:
                        celltype = "row_header"

                    spans = [[rspan, cspan] for rspan in row_range for cspan in col_range]
                    cell_bbox = cell.bbox.to_bottom_left_origin(page_height).as_tuple()

                    for i in row_range:
                        for j in col_range:
                            table_data[i][j] = TableCell(
                                text=cell.text,
                                bbox=cell_bbox,
                                # col=j,
                                # row=i,
                                spans=spans,
                                obj_type=celltype,
                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
                            )

                tables.append(
                    DsSchemaTable(
                        num_cols=element.num_cols,
                        num_rows=element.num_rows,
                        obj_type=obj_type,
                        data=table_data,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no + 1,
                                span=[0, 0],
                            )
                        ],
                    )
                )

            elif isinstance(element, FigureElement):
                index = len(figures)
                ref_str = f"#/figures/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=obj_type,
                        ref=ref_str,
                    ),
                )
                figures.append(
                    BaseCell(
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no + 1,
                                span=[0, 0],
                            )
                        ],
                        obj_type=obj_type,
                        # data=[[]],
                    )
                )

        page_dimensions = [
            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
            for p in self.pages
        ]

        ds_doc = DsDocument(
            name=title,
            description=desc,
            file_info=file_info,
            main_text=main_text,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
        )

        return ds_doc

    def render_as_dict(self):
        """Return ``output`` dumped to a dict, or ``{}`` when there is none."""
        if self.output:
            return self.output.model_dump(by_alias=True, exclude_none=True)
        else:
            return {}

    def render_as_markdown(self):
        """Return ``output`` exported as markdown, or ``""`` when there is none."""
        if self.output:
            return self.output.export_to_markdown()
        else:
            return ""

    def render_element_images(
        self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
    ):
        """Yield ``(element, cropped_image)`` for elements of the given types.

        The element's bounding box is scaled by the page's default image
        scale, converted to a top-left origin and used to crop the page
        image.

        NOTE(review): pages are looked up with ``self.pages[element.page_no]``,
        which assumes the list index equals ``page_no`` — confirm for partial
        conversions.
        """
        for element in self.assembled.elements:
            if isinstance(element, element_types):
                page_ix = element.page_no
                scale = self.pages[page_ix]._default_image_scale
                crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
                    page_height=self.pages[page_ix].size.height * scale
                )

                cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
                yield element, cropped_im
|
319
|
+
|
320
|
+
|
321
|
+
class DocumentConversionInput(BaseModel):
    """A batch of documents to convert, given as paths or streams.

    Instances are normally built with ``from_paths`` or ``from_streams``,
    which set the private iterator consumed by ``docs``.
    """

    _path_or_stream_iterator: Optional[Iterable[Union[Path, DocumentStream]]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend

    def docs(
        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
    ) -> Iterable[InputDocument]:
        """Yield an ``InputDocument`` for every path or stream in the batch.

        Args:
            pdf_backend: backend class to open documents with; falls back to
                ``DocumentConversionInput.DEFAULT_BACKEND`` when omitted.

        Items that are neither a ``Path`` nor a ``DocumentStream`` are
        skipped. Nothing is yielded when no iterator has been set.
        """
        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND

        sources = self._path_or_stream_iterator
        if sources is None:
            # Built without from_paths/from_streams: nothing to convert
            # (previously this raised TypeError on iteration).
            return

        for obj in sources:
            if isinstance(obj, Path):
                yield InputDocument(
                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
                    filename=obj.filename,
                    limits=self.limits,
                    pdf_backend=pdf_backend,
                )

    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        """Create an input batch from filesystem paths.

        Each entry is converted with ``Path(...)``; the paths are stored as a
        list.
        """
        paths = [Path(p) for p in paths]

        doc_input = cls(limits=limits)
        doc_input._path_or_stream_iterator = paths

        return doc_input

    @classmethod
    def from_streams(
        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
    ):
        """Create an input batch from ``DocumentStream`` objects.

        The iterable is stored as given (it is not copied).
        """
        doc_input = cls(limits=limits)
        doc_input._path_or_stream_iterator = streams

        return doc_input
|
@@ -0,0 +1,32 @@
|
|
1
|
+
import sys
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
from pydantic_settings import BaseSettings
|
5
|
+
|
6
|
+
|
7
|
+
class DocumentLimits(BaseModel):
    """Upper bounds for accepted documents.

    Both limits default to ``sys.maxsize``.
    """

    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize
|
10
|
+
|
11
|
+
|
12
|
+
class BatchConcurrencySettings(BaseModel):
    """Batch sizes and concurrency levels for documents and pages."""

    doc_batch_size: int = 2
    doc_batch_concurrency: int = 2
    page_batch_size: int = 4
    page_batch_concurrency: int = 2

    # Alternative values kept for reference:
    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
    # page_batch_size: int = 1
    # page_batch_concurrency: int = 1

    # model_concurrency: int = 2

    # To force models into single core: export OMP_NUM_THREADS=1
|
26
|
+
|
27
|
+
|
28
|
+
class AppSettings(BaseSettings):
    """Application settings; ``perf`` is required and has no default."""

    perf: BatchConcurrencySettings
|
30
|
+
|
31
|
+
|
32
|
+
# Module-level settings instance, created at import time with the default
# batch/concurrency values.
settings = AppSettings(perf=BatchConcurrencySettings())
|