docling 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +7 -5
- docling/cli/main.py +17 -21
- docling/datamodel/base_models.py +12 -12
- docling/datamodel/document.py +2 -253
- docling/datamodel/pipeline_options.py +27 -2
- docling/document_converter.py +5 -5
- docling/models/ds_glm_model.py +4 -7
- docling/models/rapid_ocr_model.py +18 -17
- docling/pipeline/base_pipeline.py +17 -3
- docling/pipeline/standard_pdf_pipeline.py +2 -0
- docling/utils/glm_utils.py +336 -0
- {docling-2.9.0.dist-info → docling-2.11.0.dist-info}/METADATA +4 -4
- {docling-2.9.0.dist-info → docling-2.11.0.dist-info}/RECORD +17 -16
- {docling-2.9.0.dist-info → docling-2.11.0.dist-info}/LICENSE +0 -0
- {docling-2.9.0.dist-info → docling-2.11.0.dist-info}/WHEEL +0 -0
- {docling-2.9.0.dist-info → docling-2.11.0.dist-info}/entry_points.txt +0 -0
docling/backend/docling_parse_backend.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.
+from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
docling/backend/docling_parse_v2_backend.py
CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_parse.
+from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
         self.parser = pdf_parser_v2("fatal")
 
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                self.document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(self.document_hash, str(path_or_stream))
 
         if not success:
             raise RuntimeError(
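Both backends now pull their parser classes from the `pdf_parsers` module introduced in docling-parse 3.x, and the v2 backend consistently reads `self.path_or_stream` instead of a stale local variable. A minimal sketch of the new import surface (illustrative only):

```python
# New import locations used by the two PDF backends (docling-parse >= 3.0).
from docling_parse.pdf_parsers import pdf_parser_v1, pdf_parser_v2

# The v2 backend constructs its parser with a log level, as in the diff above.
parser = pdf_parser_v2("fatal")
```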
docling/cli/main.py
CHANGED
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrEngine,
     OcrMacOptions,
     OcrOptions,
+    PdfBackend,
     PdfPipelineOptions,
     RapidOcrOptions,
     TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
     raise typer.Exit()
 
 
-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-
-
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
-
-
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -208,7 +194,7 @@ def convert(
     ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
-    ] = PdfBackend.
+    ] = PdfBackend.DLPARSE_V2,
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
@@ -264,6 +250,13 @@ def convert(
             help="Show version information.",
         ),
     ] = None,
+    document_timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            ...,
+            help="The timeout for processing each document, in seconds.",
+        ),
+    ] = None,
 ):
     if verbose == 0:
         logging.basicConfig(level=logging.WARNING)
@@ -347,6 +340,7 @@ def convert(
         do_ocr=ocr,
         ocr_options=ocr_options,
         do_table_structure=True,
+        document_timeout=document_timeout,
     )
     pipeline_options.table_structure_options.do_cell_matching = (
         True  # do_cell_matching
@@ -372,11 +366,13 @@ def convert(
     else:
         raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
 
+    pdf_format_option = PdfFormatOption(
+        pipeline_options=pipeline_options,
+        backend=backend,  # pdf_backend
+    )
     format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF:
-
-            backend=backend,  # pdf_backend
-        )
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
     }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
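The CLI no longer defines its own `PdfBackend` and `OcrEngine` enums (they are imported from `docling.datamodel.pipeline_options`), the default PDF backend becomes `PdfBackend.DLPARSE_V2`, and a single `PdfFormatOption` is now registered for both PDF and image input. A simplified sketch of the wiring the CLI performs (not the full `convert()` body):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(
    do_table_structure=True,
    document_timeout=60.0,  # newly surfaced by the CLI as a per-document limit
)

# One format option object is shared by PDF and image inputs.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: pdf_format_option,
        InputFormat.IMAGE: pdf_format_option,
    }
)
```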
docling/datamodel/base_models.py
CHANGED
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 
 
 class ConversionStatus(str, Enum):
-    PENDING =
-    STARTED =
-    FAILURE =
-    SUCCESS =
-    PARTIAL_SUCCESS =
-    SKIPPED =
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"
 
 
 class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
 
 
 class DocInputType(str, Enum):
-    PATH =
-    STREAM =
+    PATH = "path"
+    STREAM = "stream"
 
 
 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND =
-    MODEL =
-    DOC_ASSEMBLER =
-    USER_INPUT =
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"
 
 
 class ErrorItem(BaseModel):
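The enum members now carry explicit string values, which keeps their serialized form stable. A minimal check, assuming nothing beyond the definitions above:

```python
from docling.datamodel.base_models import ConversionStatus, DoclingComponentType

# str-based enums compare against their literal values.
assert ConversionStatus.PARTIAL_SUCCESS.value == "partial_success"
assert DoclingComponentType.DOCUMENT_BACKEND == "document_backend"
```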
docling/datamodel/document.py
CHANGED
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
     @property
     @deprecated("Use document instead.")
     def legacy_document(self):
-
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-
-                    # skip captions of they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-
-        return ds_doc
+        return docling_document_to_legacy(self.document)
 
 
 class _DummyBackend(AbstractDocumentBackend):
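The roughly 250-line hand-rolled conversion is gone; the deprecated `legacy_document` property now delegates to `docling_document_to_legacy` from docling-core. Callers keep the same surface, as in this usage sketch (file name is hypothetical):

```python
from docling.document_converter import DocumentConverter

result = DocumentConverter().convert("report.pdf")  # hypothetical input
doc = result.document            # DoclingDocument (preferred)
legacy = result.legacy_document  # deprecated; now docling_document_to_legacy(doc)
```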
docling/datamodel/pipeline_options.py
CHANGED
@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
     )
 
 
+# Define an enum for the backend options
+class PdfBackend(str, Enum):
+    """Enum of valid PDF backends."""
+
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+
+
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """Enum of valid OCR engines."""
+
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
+    RAPIDOCR = "rapidocr"
+
+
 class PipelineOptions(BaseModel):
     """Base pipeline options."""
 
     create_legacy_output: bool = (
-        True  # This
+        True  # This default will be set to False on a future version of docling
     )
+    document_timeout: Optional[float] = None
 
 
 class PdfPipelineOptions(PipelineOptions):
@@ -143,7 +164,11 @@ class PdfPipelineOptions(PipelineOptions):
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
-        EasyOcrOptions,
+        EasyOcrOptions,
+        TesseractCliOcrOptions,
+        TesseractOcrOptions,
+        OcrMacOptions,
+        RapidOcrOptions,
     ] = Field(EasyOcrOptions(), discriminator="kind")
 
     images_scale: float = 1.0
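With the enums relocated here and the `ocr_options` union widened, pipeline configuration can be done entirely against `docling.datamodel.pipeline_options`. A short sketch under those definitions:

```python
from docling.datamodel.pipeline_options import (
    OcrEngine,
    PdfBackend,
    PdfPipelineOptions,
    TesseractOcrOptions,
)

# Any member of the widened Union is accepted for ocr_options.
opts = PdfPipelineOptions(ocr_options=TesseractOcrOptions())

assert PdfBackend.DLPARSE_V2.value == "dlparse_v2"
assert OcrEngine.TESSERACT.value == "tesseract"
```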
docling/document_converter.py
CHANGED
@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
 
 class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] =
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
 
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] =
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
 
 def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
         InputFormat.IMAGE: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
         InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
     }
     if (options := format_to_default_options.get(format)) is not None:
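`DoclingParseV2DocumentBackend` is now the default backend for both PDF and image input. Overriding it still goes through the format options; a sketch, assuming the pypdfium2 backend keeps its usual class name and module:

```python
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend  # assumed name
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

# Defaults: PDF and IMAGE both use DoclingParseV2DocumentBackend.
converter = DocumentConverter()

# Explicit override per input format.
converter_pypdfium = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(backend=PyPdfiumDocumentBackend)}
)
```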
docling/models/ds_glm_model.py
CHANGED
@@ -3,9 +3,7 @@ import random
 from pathlib import Path
 from typing import List, Union
 
-from deepsearch_glm.
-from deepsearch_glm.utils.doc_utils import to_docling_document
-from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+from deepsearch_glm.andromeda_nlp import nlp_model
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import (
@@ -29,6 +27,7 @@ from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.datamodel.settings import settings
+from docling.utils.glm_utils import to_docling_document
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
 
@@ -43,9 +42,7 @@ class GlmModel:
     def __init__(self, options: GlmOptions):
         self.options = options
 
-
-        load_pretrained_nlp_models()
-        self.model = init_nlp_model(model_names=self.options.model_names)
+        self.model = nlp_model(loglevel="error", text_ordering=True)
 
     def _to_legacy_document(self, conv_res) -> DsDocument:
         title = ""
@@ -232,7 +229,7 @@ class GlmModel:
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
             ds_doc = self._to_legacy_document(conv_res)
-            ds_doc_dict = ds_doc.model_dump(by_alias=True)
+            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
 
             glm_doc = self.model.apply_on_doc(ds_doc_dict)
 
docling/models/rapid_ocr_model.py
CHANGED
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
                 del high_res_image
                 del im
 
-
-
-
-
-
-
-
-                            (
-
-
-
+                if result is not None:
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1],
+                            confidence=line[2],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
                             ),
-
-                    )
-
-
-                ]
-                all_ocr_cells.extend(cells)
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)
 
             # Post-process the cells
             page.cells = self.post_process_cells(all_ocr_cells, page.cells)
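The rewritten block only builds cells when RapidOCR actually returns a result, and it maps each recognized line's quad corners from the scaled OCR image back to page coordinates. A standalone sketch of that coordinate math (names are illustrative, not part of docling):

```python
# line[0] is a quad of four image-space corners; the diff uses corner 0 (top-left)
# and corner 2 (bottom-right), divides by the render scale, then offsets by the
# OCR rectangle's left/top edge.
def quad_to_page_bbox(quad, scale, rect_l, rect_t):
    (x0, y0), _, (x2, y2), _ = quad
    return (
        x0 / scale + rect_l,
        y0 / scale + rect_t,
        x2 / scale + rect_l,
        y2 / scale + rect_t,
    )

print(quad_to_page_bbox([(10, 8), (90, 8), (90, 30), (10, 30)], 2.0, 5.0, 7.0))
```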
docling/pipeline/base_pipeline.py
CHANGED
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 # conv_res.status = ConversionStatus.FAILURE
                 # return conv_res
 
+        total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
 
             for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
             for page_batch in chunkify(
                 conv_res.pages, settings.perf.page_batch_size
             ):
-
+                start_batch_time = time.monotonic()
 
                 # 1. Initialise the page resources
                 init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 for p in pipeline_pages:  # Must exhaust!
                     pass
 
-
-
+                end_batch_time = time.monotonic()
+                total_elapsed_time += end_batch_time - start_batch_time
+                if (
+                    self.pipeline_options.document_timeout is not None
+                    and total_elapsed_time > self.pipeline_options.document_timeout
+                ):
+                    _log.warning(
+                        f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
+                    )
+                    conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                    break
+
+                _log.debug(
+                    f"Finished converting page batch time={end_batch_time:.3f}"
+                )
 
         except Exception as e:
             conv_res.status = ConversionStatus.FAILURE
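The paginated pipeline now accumulates wall-clock time per page batch and stops with `ConversionStatus.PARTIAL_SUCCESS` once `document_timeout` is exceeded, instead of failing outright. Callers can detect the early stop; a sketch (file name hypothetical):

```python
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

opts = PdfPipelineOptions(document_timeout=30.0)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
)

result = converter.convert("large_scan.pdf")  # hypothetical input
if result.status == ConversionStatus.PARTIAL_SUCCESS:
    # The timeout hit between page batches; only the pages processed so far are kept.
    print("conversion stopped early after the document timeout")
```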
docling/pipeline/standard_pdf_pipeline.py
CHANGED
@@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
         local_dir: Optional[Path] = None, force: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
 
+        disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/docling-models",
             force_download=force,
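Model downloads now silence the Hugging Face progress bars before calling `snapshot_download`. A sketch of pre-fetching the models, assuming the helper shown above is the `download_models_hf` static method that docling 2.x exposes:

```python
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Assumed method name; the diff only shows its local_dir/force parameters.
artifacts_path = StandardPdfPipeline.download_models_hf(force=False)
print(artifacts_path)
```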
docling/utils/glm_utils.py
ADDED
@@ -0,0 +1,336 @@
+import re
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+
+
+def resolve_item(paths, obj):
+    """Find item in document from a reference path"""
+
+    if len(paths) == 0:
+        return obj
+
+    if paths[0] == "#":
+        return resolve_item(paths[1:], obj)
+
+    try:
+        key = int(paths[0])
+    except:
+        key = paths[0]
+
+    if len(paths) == 1:
+        if isinstance(key, str) and key in obj:
+            return obj[key]
+        elif isinstance(key, int) and key < len(obj):
+            return obj[key]
+        else:
+            return None
+
+    elif len(paths) > 1:
+        if isinstance(key, str) and key in obj:
+            return resolve_item(paths[1:], obj[key])
+        elif isinstance(key, int) and key < len(obj):
+            return resolve_item(paths[1:], obj[key])
+        else:
+            return None
+
+    else:
+        return None
+
+
+def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
+    unique_objects = []
+    seen_spans = set()
+
+    for sublist in grid:
+        for obj in sublist:
+            # Convert the spans list to a tuple of tuples for hashing
+            spans_tuple = tuple(tuple(span) for span in obj["spans"])
+            if spans_tuple not in seen_spans:
+                seen_spans.add(spans_tuple)
+                unique_objects.append(obj)
+
+    return unique_objects
+
+
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+    origin = DocumentOrigin(
+        mimetype="application/pdf",
+        filename=doc_glm["file-info"]["filename"],
+        binary_hash=doc_glm["file-info"]["document-hash"],
+    )
+    doc_name = Path(origin.filename).stem
+
+    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
+
+    for page_dim in doc_glm["page-dimensions"]:
+        page_no = int(page_dim["page"])
+        size = Size(width=page_dim["width"], height=page_dim["height"])
+
+        doc.add_page(page_no=page_no, size=size)
+
+    if "properties" in doc_glm:
+        props = pd.DataFrame(
+            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
+        )
+    else:
+        props = pd.DataFrame()
+
+    current_list = None
+
+    for ix, pelem in enumerate(doc_glm["page-elements"]):
+        ptype = pelem["type"]
+        span_i = pelem["span"][0]
+        span_j = pelem["span"][1]
+
+        if "iref" not in pelem:
+            # print(json.dumps(pelem, indent=2))
+            continue
+
+        iref = pelem["iref"]
+
+        if re.match("#/figures/(\\d+)/captions/(.+)", iref):
+            # print(f"skip {iref}")
+            continue
+
+        if re.match("#/tables/(\\d+)/captions/(.+)", iref):
+            # print(f"skip {iref}")
+            continue
+
+        path = iref.split("/")
+        obj = resolve_item(path, doc_glm)
+
+        if obj is None:
+            current_list = None
+            print(f"warning: undefined {path}")
+            continue
+
+        if ptype == "figure":
+            current_list = None
+            text = ""
+            caption_refs = []
+            for caption in obj["captions"]:
+                text += caption["text"]
+
+                for nprov in caption["prov"]:
+                    npaths = nprov["$ref"].split("/")
+                    nelem = resolve_item(npaths, doc_glm)
+
+                    if nelem is None:
+                        # print(f"warning: undefined caption {npaths}")
+                        continue
+
+                    span_i = nelem["span"][0]
+                    span_j = nelem["span"][1]
+
+                    cap_text = caption["text"][span_i:span_j]
+
+                    # doc_glm["page-elements"].remove(nelem)
+
+                    prov = ProvenanceItem(
+                        page_no=nelem["page"],
+                        charspan=tuple(nelem["span"]),
+                        bbox=BoundingBox.from_tuple(
+                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                        ),
+                    )
+
+                    caption_obj = doc.add_text(
+                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
+                    )
+                    caption_refs.append(caption_obj.get_ref())
+
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, len(text)),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+
+            pic = doc.add_picture(prov=prov)
+            pic.captions.extend(caption_refs)
+            _add_child_elements(pic, doc, obj, pelem)
+
+        elif ptype == "table":
+            current_list = None
+            text = ""
+            caption_refs = []
+            for caption in obj["captions"]:
+                text += caption["text"]
+
+                for nprov in caption["prov"]:
+                    npaths = nprov["$ref"].split("/")
+                    nelem = resolve_item(npaths, doc_glm)
+
+                    if nelem is None:
+                        # print(f"warning: undefined caption {npaths}")
+                        continue
+
+                    span_i = nelem["span"][0]
+                    span_j = nelem["span"][1]
+
+                    cap_text = caption["text"][span_i:span_j]
+
+                    # doc_glm["page-elements"].remove(nelem)
+
+                    prov = ProvenanceItem(
+                        page_no=nelem["page"],
+                        charspan=tuple(nelem["span"]),
+                        bbox=BoundingBox.from_tuple(
+                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                        ),
+                    )
+
+                    caption_obj = doc.add_text(
+                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
+                    )
+                    caption_refs.append(caption_obj.get_ref())
+
+            table_cells_glm = _flatten_table_grid(obj["data"])
+
+            table_cells = []
+            for tbl_cell_glm in table_cells_glm:
+                if tbl_cell_glm["bbox"] is not None:
+                    bbox = BoundingBox.from_tuple(
+                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    )
+                else:
+                    bbox = None
+
+                is_col_header = False
+                is_row_header = False
+                is_row_section = False
+
+                if tbl_cell_glm["type"] == "col_header":
+                    is_col_header = True
+                elif tbl_cell_glm["type"] == "row_header":
+                    is_row_header = True
+                elif tbl_cell_glm["type"] == "row_section":
+                    is_row_section = True
+
+                table_cells.append(
+                    TableCell(
+                        row_span=tbl_cell_glm["row-span"][1]
+                        - tbl_cell_glm["row-span"][0],
+                        col_span=tbl_cell_glm["col-span"][1]
+                        - tbl_cell_glm["col-span"][0],
+                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
+                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
+                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
+                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
+                        text=tbl_cell_glm["text"],
+                        bbox=bbox,
+                        column_header=is_col_header,
+                        row_header=is_row_header,
+                        row_section=is_row_section,
+                    )
+                )
+
+            tbl_data = TableData(
+                num_rows=obj.get("#-rows", 0),
+                num_cols=obj.get("#-cols", 0),
+                table_cells=table_cells,
+            )
+
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, 0),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+
+            tbl = doc.add_table(data=tbl_data, prov=prov)
+            tbl.captions.extend(caption_refs)
+
+        elif ptype in ["form", "key_value_region"]:
+            label = DocItemLabel(ptype)
+            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)
+
+            _add_child_elements(container_el, doc, obj, pelem)
+
+        elif "text" in obj:
+            text = obj["text"][span_i:span_j]
+
+            type_label = pelem["type"]
+            name_label = pelem["name"]
+            if update_name_label and len(props) > 0 and type_label == "paragraph":
+                prop = props[
+                    (props["type"] == "semantic") & (props["subj_path"] == iref)
+                ]
+                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
+                    name_label = prop.iloc[0]["label"]
+
+            prov = ProvenanceItem(
+                page_no=pelem["page"],
+                charspan=(0, len(text)),
+                bbox=BoundingBox.from_tuple(
+                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+            label = DocItemLabel(name_label)
+
+            if label == DocItemLabel.LIST_ITEM:
+                if current_list is None:
+                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
+
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(
+                    text=text, enumerated=False, prov=prov, parent=current_list
+                )
+            elif label == DocItemLabel.SECTION_HEADER:
+                current_list = None
+
+                doc.add_heading(text=text, prov=prov)
+            else:
+                current_list = None
+
+                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)
+
+    return doc
+
+
+def _add_child_elements(container_el, doc, obj, pelem):
+    payload = obj.get("payload")
+    if payload is not None:
+        children = payload.get("children", [])
+
+        for child in children:
+            c_label = DocItemLabel(child["label"])
+            c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
+                doc.pages[pelem["page"]].size.height
+            )
+            c_text = " ".join(
+                [
+                    cell["text"].replace("\x02", "-").strip()
+                    for cell in child["cells"]
+                    if len(cell["text"].strip()) > 0
+                ]
+            )
+
+            c_prov = ProvenanceItem(
+                page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
+            )
+            if c_label == DocItemLabel.LIST_ITEM:
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
+            elif c_label == DocItemLabel.SECTION_HEADER:
+                doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
+            else:
+                doc.add_text(
+                    parent=container_el, label=c_label, text=c_text, prov=c_prov
+                )
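`to_docling_document` rebuilds a `DoclingDocument` from the GLM output dict; it replaces the converter previously imported from `deepsearch_glm.utils.doc_utils`. A sketch of how the GLM stage uses it, mirroring `GlmModel.__call__` (helper name is illustrative):

```python
from docling.utils.glm_utils import to_docling_document

def assemble_document(nlp_model, ds_doc_dict: dict):
    # The legacy document dict is annotated by the GLM model, then rebuilt
    # into a DoclingDocument by this module.
    glm_doc = nlp_model.apply_on_doc(ds_doc_dict)
    return to_docling_document(glm_doc)
```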
{docling-2.9.0.dist-info → docling-2.11.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.
+Version: 2.11.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.
-Requires-Dist: docling-core[chunking] (>=2.
+Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
+Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
-Requires-Dist: docling-parse (>=
+Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
{docling-2.9.0.dist-info → docling-2.11.0.dist-info}/RECORD
CHANGED
@@ -2,8 +2,8 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
 docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
-docling/backend/docling_parse_backend.py,sha256=
-docling/backend/docling_parse_v2_backend.py,sha256=
+docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
+docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
 docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
 docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
 docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
@@ -13,39 +13,40 @@ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6
 docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=
+docling/cli/main.py,sha256=FFDUDADvK7QNW7xCs6dlsC7Bt_BMyrKdbZewKTEjm54,14624
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=
-docling/datamodel/document.py,sha256=
-docling/datamodel/pipeline_options.py,sha256=
+docling/datamodel/base_models.py,sha256=vwy59eDrkzCSaay24RlUvx4zEyuaUukOdOhw3622u2I,5616
+docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
+docling/datamodel/pipeline_options.py,sha256=1ouWNE5VhZolrWMb4RE6s_AxgNFr3_3PMtxB_YQ391A,5495
 docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
-docling/document_converter.py,sha256=
+docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
 docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
-docling/models/ds_glm_model.py,sha256=
+docling/models/ds_glm_model.py,sha256=YJkGxV46wh7G2Wr4vVzt9b8oewkUDPWpvI6AEaZDrs0,11872
 docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
 docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
-docling/models/rapid_ocr_model.py,sha256=
+docling/models/rapid_ocr_model.py,sha256=ui152cerv9b9OeWyyyefs8qMLwYn0qsE2DFE_gHmaCM,6124
 docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
 docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
 docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_pipeline.py,sha256=
+docling/pipeline/base_pipeline.py,sha256=hVvtk5E4DVZdl_SyNs_pYRUjN9C8PABhpVaeN5Z_fAY,7885
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
-docling/pipeline/standard_pdf_pipeline.py,sha256=
+docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
+docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11211
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
+docling-2.11.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.11.0.dist-info/METADATA,sha256=ajUVy5CuNDUp0x9tMCqO2px2M-ia-Vs7frIyb0_HxMo,7731
+docling-2.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.11.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.11.0.dist-info/RECORD,,
{docling-2.9.0.dist-info → docling-2.11.0.dist-info}/LICENSE: File without changes
{docling-2.9.0.dist-info → docling-2.11.0.dist-info}/WHEEL: File without changes
{docling-2.9.0.dist-info → docling-2.11.0.dist-info}/entry_points.txt: File without changes