docling 2.40.0__tar.gz → 2.42.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.40.0 → docling-2.42.0}/PKG-INFO +4 -3
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/latex/omml.py +9 -1
- {docling-2.40.0 → docling-2.42.0}/docling/backend/html_backend.py +25 -17
- {docling-2.40.0 → docling-2.42.0}/docling/backend/xml/jats_backend.py +12 -4
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/asr_model_specs.py +6 -6
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/base_models.py +23 -1
- docling-2.42.0/docling/datamodel/layout_model_specs.py +90 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/pipeline_options.py +14 -1
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/pipeline_options_vlm_model.py +11 -3
- {docling-2.40.0 → docling-2.42.0}/docling/document_converter.py +14 -11
- {docling-2.40.0 → docling-2.42.0}/docling/models/api_vlm_model.py +7 -5
- {docling-2.40.0 → docling-2.42.0}/docling/models/document_picture_classifier.py +12 -13
- {docling-2.40.0 → docling-2.42.0}/docling/models/layout_model.py +17 -15
- {docling-2.40.0 → docling-2.42.0}/docling/models/picture_description_vlm_model.py +2 -1
- {docling-2.40.0 → docling-2.42.0}/docling/models/vlm_models_inline/hf_transformers_model.py +39 -20
- {docling-2.40.0 → docling-2.42.0}/docling/models/vlm_models_inline/mlx_model.py +5 -3
- {docling-2.40.0 → docling-2.42.0}/docling/pipeline/standard_pdf_pipeline.py +2 -3
- {docling-2.40.0 → docling-2.42.0}/docling/pipeline/vlm_pipeline.py +1 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/layout_postprocessor.py +3 -2
- {docling-2.40.0 → docling-2.42.0}/docling/utils/model_downloader.py +2 -1
- {docling-2.40.0 → docling-2.42.0}/docling/utils/ocr_utils.py +1 -1
- {docling-2.40.0 → docling-2.42.0}/docling/utils/orientation.py +22 -28
- {docling-2.40.0 → docling-2.42.0}/docling.egg-info/PKG-INFO +4 -3
- {docling-2.40.0 → docling-2.42.0}/docling.egg-info/SOURCES.txt +2 -0
- {docling-2.40.0 → docling-2.42.0}/docling.egg-info/requires.txt +3 -2
- {docling-2.40.0 → docling-2.42.0}/pyproject.toml +5 -5
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_jats.py +14 -14
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_markdown.py +6 -1
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_patent_uspto.py +11 -3
- {docling-2.40.0 → docling-2.42.0}/tests/test_document_picture_classifier.py +2 -1
- {docling-2.40.0 → docling-2.42.0}/tests/test_e2e_conversion.py +2 -8
- {docling-2.40.0 → docling-2.42.0}/tests/test_e2e_ocr_conversion.py +5 -10
- {docling-2.40.0 → docling-2.42.0}/tests/test_interfaces.py +2 -9
- {docling-2.40.0 → docling-2.42.0}/tests/test_legacy_format_transform.py +1 -0
- docling-2.42.0/tests/test_ocr_utils.py +80 -0
- {docling-2.40.0 → docling-2.42.0}/LICENSE +0 -0
- {docling-2.40.0 → docling-2.42.0}/README.md +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/md_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/chunking/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/cli/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/cli/main.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/cli/models.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/cli/tools.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/document.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/settings.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/exceptions.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/base_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/py.typed +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/__init__.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/export.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/locks.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/profiling.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/utils.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling/utils/visualization.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.40.0 → docling-2.42.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.40.0 → docling-2.42.0}/setup.cfg +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_csv.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_html.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_msword.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_backend_webp.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_cli.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_code_formula.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_input_doc.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_invalid_input.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_options.py +0 -0
- {docling-2.40.0 → docling-2.42.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.40.0
|
3
|
+
Version: 2.42.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
30
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.6.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
|
|
50
50
|
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
51
51
|
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
52
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
53
|
+
Requires-Dist: accelerate<2,>=1.0.0
|
53
54
|
Provides-Extra: tesserocr
|
54
55
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
55
56
|
Provides-Extra: ocrmac
|
@@ -62,7 +63,7 @@ Provides-Extra: rapidocr
|
|
62
63
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
64
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
64
65
|
Provides-Extra: asr
|
65
|
-
Requires-Dist: openai-whisper>=
|
66
|
+
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
66
67
|
Dynamic: license-file
|
67
68
|
|
68
69
|
<p align="center">
|
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
|
|
260
260
|
the fraction object
|
261
261
|
"""
|
262
262
|
c_dict = self.process_children_dict(elm)
|
263
|
-
pr = c_dict
|
263
|
+
pr = c_dict.get("fPr")
|
264
|
+
if pr is None:
|
265
|
+
# Handle missing fPr element gracefully
|
266
|
+
_log.debug("Missing fPr element in fraction, using default formatting")
|
267
|
+
latex_s = F_DEFAULT
|
268
|
+
return latex_s.format(
|
269
|
+
num=c_dict.get("num"),
|
270
|
+
den=c_dict.get("den"),
|
271
|
+
)
|
264
272
|
latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
|
265
273
|
return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
|
266
274
|
|
@@ -379,6 +379,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
379
379
|
else:
|
380
380
|
_log.debug(f"list-item has no text: {element}")
|
381
381
|
|
382
|
+
@staticmethod
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
    """Extract colspan and rowspan values from a table cell tag.

    Reads the 'colspan' and 'rowspan' attributes of the given table cell tag.
    Each attribute is validated on its own: if it is missing or is not a plain
    decimal integer, that span defaults to 1.

    Args:
        cell: The table cell tag (``<td>`` or ``<th>``) to inspect.

    Returns:
        A ``(col_span, row_span)`` tuple of integers.
    """
    # str() normalizes whatever the attribute lookup returns before validating.
    raw_col_span = str(cell.get("colspan", "1"))
    raw_row_span = str(cell.get("rowspan", "1"))

    # isdecimal() only accepts digits that int() can parse; isnumeric() also
    # accepts characters such as superscripts, on which int() would raise.
    col_span = int(raw_col_span) if raw_col_span.isdecimal() else 1
    # The rowspan must be checked against its own raw value: testing the
    # colspan string here would crash on a non-numeric rowspan and would drop
    # a valid rowspan whenever the colspan is malformed.
    row_span = int(raw_row_span) if raw_row_span.isdecimal() else 1

    return col_span, row_span
|
400
|
+
|
382
401
|
@staticmethod
|
383
402
|
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
384
403
|
nested_tables = element.find("table")
|
@@ -398,10 +417,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
398
417
|
if not isinstance(row, Tag):
|
399
418
|
continue
|
400
419
|
cell_tag = cast(Tag, cell)
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
420
|
+
col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
|
421
|
+
col_count += col_span
|
422
|
+
if cell_tag.name == "td" or row_span == 1:
|
405
423
|
is_row_header = False
|
406
424
|
num_cols = max(num_cols, col_count)
|
407
425
|
if not is_row_header:
|
@@ -428,10 +446,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
428
446
|
row_header = True
|
429
447
|
for html_cell in cells:
|
430
448
|
if isinstance(html_cell, Tag):
|
449
|
+
_, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
431
450
|
if html_cell.name == "td":
|
432
451
|
col_header = False
|
433
452
|
row_header = False
|
434
|
-
elif
|
453
|
+
elif row_span == 1:
|
435
454
|
row_header = False
|
436
455
|
if not row_header:
|
437
456
|
row_idx += 1
|
@@ -456,18 +475,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
456
475
|
text = html_cell.text
|
457
476
|
|
458
477
|
# label = html_cell.name
|
459
|
-
|
460
|
-
col_span = (
|
461
|
-
int(col_val)
|
462
|
-
if isinstance(col_val, str) and col_val.isnumeric()
|
463
|
-
else 1
|
464
|
-
)
|
465
|
-
row_val = html_cell.get("rowspan", "1")
|
466
|
-
row_span = (
|
467
|
-
int(row_val)
|
468
|
-
if isinstance(row_val, str) and row_val.isnumeric()
|
469
|
-
else 1
|
470
|
-
)
|
478
|
+
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
471
479
|
if row_header:
|
472
480
|
row_span -= 1
|
473
481
|
while (
|
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
93
93
|
|
94
94
|
# Initialize the root of the document hierarchy
|
95
95
|
self.root: Optional[NodeItem] = None
|
96
|
-
|
97
|
-
self.valid = False
|
96
|
+
self.hlevel: int = 0
|
97
|
+
self.valid: bool = False
|
98
98
|
try:
|
99
99
|
if isinstance(self.path_or_stream, BytesIO):
|
100
100
|
self.path_or_stream.seek(0)
|
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
147
147
|
binary_hash=self.document_hash,
|
148
148
|
)
|
149
149
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
150
|
+
self.hlevel = 0
|
150
151
|
|
151
152
|
# Get metadata XML components
|
152
153
|
xml_components: XMLComponents = self._parse_metadata()
|
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
304
305
|
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
305
306
|
if not text:
|
306
307
|
continue
|
307
|
-
parent = doc.add_heading(
|
308
|
+
parent = doc.add_heading(
|
309
|
+
parent=self.root, text=title, level=self.hlevel + 1
|
310
|
+
)
|
308
311
|
doc.add_text(
|
309
312
|
parent=parent,
|
310
313
|
text=text,
|
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
637
640
|
elif child.tag == "ack":
|
638
641
|
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
|
639
642
|
if text:
|
640
|
-
|
643
|
+
self.hlevel += 1
|
644
|
+
new_parent = doc.add_heading(
|
645
|
+
text=text, parent=parent, level=self.hlevel
|
646
|
+
)
|
641
647
|
elif child.tag == "list":
|
642
648
|
new_parent = doc.add_group(
|
643
649
|
label=GroupLabel.LIST, name="list", parent=parent
|
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
694
700
|
new_text = self._walk_linear(doc, new_parent, child)
|
695
701
|
if not (node.getparent().tag == "p" and node.tag in flush_tags):
|
696
702
|
node_text += new_text
|
703
|
+
if child.tag in ("sec", "ack") and text:
|
704
|
+
self.hlevel -= 1
|
697
705
|
|
698
706
|
# pick up the tail text
|
699
707
|
node_text += child.tail.replace("\n", " ") if child.tail else ""
|
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
|
|
22
22
|
verbose=True,
|
23
23
|
timestamps=True,
|
24
24
|
word_timestamps=True,
|
25
|
-
|
25
|
+
temperature=0.0,
|
26
26
|
max_new_tokens=256,
|
27
27
|
max_time_chunk=30.0,
|
28
28
|
)
|
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
|
|
33
33
|
verbose=True,
|
34
34
|
timestamps=True,
|
35
35
|
word_timestamps=True,
|
36
|
-
|
36
|
+
temperature=0.0,
|
37
37
|
max_new_tokens=256,
|
38
38
|
max_time_chunk=30.0,
|
39
39
|
)
|
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
|
|
44
44
|
verbose=True,
|
45
45
|
timestamps=True,
|
46
46
|
word_timestamps=True,
|
47
|
-
|
47
|
+
temperature=0.0,
|
48
48
|
max_new_tokens=256,
|
49
49
|
max_time_chunk=30.0,
|
50
50
|
)
|
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
|
|
55
55
|
verbose=True,
|
56
56
|
timestamps=True,
|
57
57
|
word_timestamps=True,
|
58
|
-
|
58
|
+
temperature=0.0,
|
59
59
|
max_new_tokens=256,
|
60
60
|
max_time_chunk=30.0,
|
61
61
|
)
|
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
|
|
66
66
|
verbose=True,
|
67
67
|
timestamps=True,
|
68
68
|
word_timestamps=True,
|
69
|
-
|
69
|
+
temperature=0.0,
|
70
70
|
max_new_tokens=256,
|
71
71
|
max_time_chunk=30.0,
|
72
72
|
)
|
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
|
|
77
77
|
verbose=True,
|
78
78
|
timestamps=True,
|
79
79
|
word_timestamps=True,
|
80
|
-
|
80
|
+
temperature=0.0,
|
81
81
|
max_new_tokens=256,
|
82
82
|
max_time_chunk=30.0,
|
83
83
|
)
|
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
|
|
12
12
|
Size,
|
13
13
|
TableCell,
|
14
14
|
)
|
15
|
+
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
|
15
16
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
16
17
|
from docling_core.types.io import (
|
17
18
|
DocumentStream,
|
@@ -19,7 +20,14 @@ from docling_core.types.io import (
|
|
19
20
|
|
20
21
|
# DO NOT REMOVE; explicitly exposed from this location
|
21
22
|
from PIL.Image import Image
|
22
|
-
from pydantic import
|
23
|
+
from pydantic import (
|
24
|
+
BaseModel,
|
25
|
+
ConfigDict,
|
26
|
+
Field,
|
27
|
+
FieldSerializationInfo,
|
28
|
+
computed_field,
|
29
|
+
field_serializer,
|
30
|
+
)
|
23
31
|
|
24
32
|
if TYPE_CHECKING:
|
25
33
|
from docling.backend.pdf_backend import PdfPageBackend
|
@@ -142,6 +150,10 @@ class Cluster(BaseModel):
|
|
142
150
|
cells: List[TextCell] = []
|
143
151
|
children: List["Cluster"] = [] # Add child cluster support
|
144
152
|
|
153
|
+
@field_serializer("confidence")
|
154
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
155
|
+
return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
156
|
+
|
145
157
|
|
146
158
|
class BasePageElement(BaseModel):
|
147
159
|
label: DocItemLabel
|
@@ -194,6 +206,16 @@ class FigureElement(BasePageElement):
|
|
194
206
|
predicted_class: Optional[str] = None
|
195
207
|
confidence: Optional[float] = None
|
196
208
|
|
209
|
+
@field_serializer("confidence")
|
210
|
+
def _serialize(
|
211
|
+
self, value: Optional[float], info: FieldSerializationInfo
|
212
|
+
) -> Optional[float]:
|
213
|
+
return (
|
214
|
+
round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
215
|
+
if value is not None
|
216
|
+
else None
|
217
|
+
)
|
218
|
+
|
197
219
|
|
198
220
|
class FigureClassificationPrediction(BaseModel):
|
199
221
|
figure_count: int = 0
|
@@ -0,0 +1,90 @@
|
|
1
|
+
import logging
|
2
|
+
from enum import Enum
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from pydantic import BaseModel
|
7
|
+
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class LayoutModelConfig(BaseModel):
|
14
|
+
name: str
|
15
|
+
repo_id: str
|
16
|
+
revision: str
|
17
|
+
model_path: str
|
18
|
+
supported_devices: list[AcceleratorDevice] = [
|
19
|
+
AcceleratorDevice.CPU,
|
20
|
+
AcceleratorDevice.CUDA,
|
21
|
+
AcceleratorDevice.MPS,
|
22
|
+
]
|
23
|
+
|
24
|
+
@property
|
25
|
+
def model_repo_folder(self) -> str:
|
26
|
+
return self.repo_id.replace("/", "--")
|
27
|
+
|
28
|
+
|
29
|
+
# HuggingFace Layout Models
|
30
|
+
|
31
|
+
# Default Docling Layout Model
|
32
|
+
DOCLING_LAYOUT_V2 = LayoutModelConfig(
|
33
|
+
name="docling_layout_v2",
|
34
|
+
repo_id="ds4sd/docling-layout-old",
|
35
|
+
revision="main",
|
36
|
+
model_path="",
|
37
|
+
)
|
38
|
+
|
39
|
+
DOCLING_LAYOUT_HERON = LayoutModelConfig(
|
40
|
+
name="docling_layout_heron",
|
41
|
+
repo_id="ds4sd/docling-layout-heron",
|
42
|
+
revision="main",
|
43
|
+
model_path="",
|
44
|
+
)
|
45
|
+
|
46
|
+
DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
|
47
|
+
name="docling_layout_heron_101",
|
48
|
+
repo_id="ds4sd/docling-layout-heron-101",
|
49
|
+
revision="main",
|
50
|
+
model_path="",
|
51
|
+
)
|
52
|
+
|
53
|
+
DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
|
54
|
+
name="docling_layout_egret_medium",
|
55
|
+
repo_id="ds4sd/docling-layout-egret-medium",
|
56
|
+
revision="main",
|
57
|
+
model_path="",
|
58
|
+
)
|
59
|
+
|
60
|
+
DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
|
61
|
+
name="docling_layout_egret_large",
|
62
|
+
repo_id="ds4sd/docling-layout-egret-large",
|
63
|
+
revision="main",
|
64
|
+
model_path="",
|
65
|
+
)
|
66
|
+
|
67
|
+
DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
|
68
|
+
name="docling_layout_egret_xlarge",
|
69
|
+
repo_id="ds4sd/docling-layout-egret-xlarge",
|
70
|
+
revision="main",
|
71
|
+
model_path="",
|
72
|
+
)
|
73
|
+
|
74
|
+
# Example for a hypothetical alternative model
|
75
|
+
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
|
76
|
+
# name="alternative_layout",
|
77
|
+
# repo_id="someorg/alternative-layout",
|
78
|
+
# revision="main",
|
79
|
+
# model_path="model_artifacts/layout_alt",
|
80
|
+
# )
|
81
|
+
|
82
|
+
|
83
|
+
class LayoutModelType(str, Enum):
|
84
|
+
DOCLING_LAYOUT_V2 = "docling_layout_v2"
|
85
|
+
DOCLING_LAYOUT_HERON = "docling_layout_heron"
|
86
|
+
DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
|
87
|
+
DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
|
88
|
+
DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
|
89
|
+
DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
|
90
|
+
# ALTERNATIVE_LAYOUT = "alternative_layout"
|
@@ -16,6 +16,15 @@ from docling.datamodel import asr_model_specs
|
|
16
16
|
|
17
17
|
# Import the following for backwards compatibility
|
18
18
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
19
|
+
from docling.datamodel.layout_model_specs import (
|
20
|
+
DOCLING_LAYOUT_EGRET_LARGE,
|
21
|
+
DOCLING_LAYOUT_EGRET_MEDIUM,
|
22
|
+
DOCLING_LAYOUT_EGRET_XLARGE,
|
23
|
+
DOCLING_LAYOUT_HERON,
|
24
|
+
DOCLING_LAYOUT_HERON_101,
|
25
|
+
DOCLING_LAYOUT_V2,
|
26
|
+
LayoutModelConfig,
|
27
|
+
)
|
19
28
|
from docling.datamodel.pipeline_options_asr_model import (
|
20
29
|
InlineAsrOptions,
|
21
30
|
)
|
@@ -208,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
|
|
208
217
|
|
209
218
|
# GraniteVision
|
210
219
|
granite_picture_description = PictureDescriptionVlmOptions(
|
211
|
-
repo_id="ibm-granite/granite-vision-3.
|
220
|
+
repo_id="ibm-granite/granite-vision-3.3-2b",
|
212
221
|
prompt="What is shown in this image?",
|
213
222
|
)
|
214
223
|
|
@@ -270,6 +279,10 @@ class LayoutOptions(BaseModel):
|
|
270
279
|
"""Options for layout processing."""
|
271
280
|
|
272
281
|
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
282
|
+
keep_empty_clusters: bool = (
|
283
|
+
False # Whether to keep clusters that contain no text cells
|
284
|
+
)
|
285
|
+
model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
|
273
286
|
|
274
287
|
|
275
288
|
class AsrPipelineOptions(PipelineOptions):
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any, Dict, List, Literal, Optional, Union
|
2
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
3
3
|
|
4
|
+
from docling_core.types.doc.page import SegmentedPage
|
4
5
|
from pydantic import AnyUrl, BaseModel
|
5
6
|
from typing_extensions import deprecated
|
6
7
|
|
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
9
10
|
|
10
11
|
class BaseVlmOptions(BaseModel):
|
11
12
|
kind: str
|
12
|
-
prompt: str
|
13
|
+
prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
|
13
14
|
scale: float = 2.0
|
14
15
|
max_size: Optional[int] = None
|
16
|
+
temperature: float = 0.0
|
15
17
|
|
16
18
|
|
17
19
|
class ResponseFormat(str, Enum):
|
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
|
|
29
31
|
AUTOMODEL = "automodel"
|
30
32
|
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
31
33
|
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
34
|
+
AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
|
35
|
+
|
36
|
+
|
37
|
+
class TransformersPromptStyle(str, Enum):
|
38
|
+
CHAT = "chat"
|
39
|
+
RAW = "raw"
|
32
40
|
|
33
41
|
|
34
42
|
class InlineVlmOptions(BaseVlmOptions):
|
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
42
50
|
|
43
51
|
inference_framework: InferenceFramework
|
44
52
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
53
|
+
transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
|
45
54
|
response_format: ResponseFormat
|
46
55
|
|
47
56
|
torch_dtype: Optional[str] = None
|
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
51
60
|
AcceleratorDevice.MPS,
|
52
61
|
]
|
53
62
|
|
54
|
-
temperature: float = 0.0
|
55
63
|
stop_strings: List[str] = []
|
56
64
|
extra_generation_config: Dict[str, Any] = {}
|
57
65
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import hashlib
|
2
2
|
import logging
|
3
3
|
import sys
|
4
|
+
import threading
|
4
5
|
import time
|
5
6
|
from collections.abc import Iterable, Iterator
|
6
7
|
from functools import partial
|
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
49
50
|
from docling.utils.utils import chunkify
|
50
51
|
|
51
52
|
_log = logging.getLogger(__name__)
|
53
|
+
_PIPELINE_CACHE_LOCK = threading.Lock()
|
52
54
|
|
53
55
|
|
54
56
|
class FormatOption(BaseModel):
|
@@ -315,17 +317,18 @@ class DocumentConverter:
|
|
315
317
|
# Use a composite key to cache pipelines
|
316
318
|
cache_key = (pipeline_class, options_hash)
|
317
319
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
320
|
+
with _PIPELINE_CACHE_LOCK:
|
321
|
+
if cache_key not in self.initialized_pipelines:
|
322
|
+
_log.info(
|
323
|
+
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
324
|
+
)
|
325
|
+
self.initialized_pipelines[cache_key] = pipeline_class(
|
326
|
+
pipeline_options=pipeline_options
|
327
|
+
)
|
328
|
+
else:
|
329
|
+
_log.debug(
|
330
|
+
f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
331
|
+
)
|
329
332
|
|
330
333
|
return self.initialized_pipelines[cache_key]
|
331
334
|
|
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
|
|
29
29
|
|
30
30
|
self.timeout = self.vlm_options.timeout
|
31
31
|
self.concurrency = self.vlm_options.concurrency
|
32
|
-
self.prompt_content = (
|
33
|
-
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
34
|
-
)
|
35
32
|
self.params = {
|
36
33
|
**self.vlm_options.params,
|
37
|
-
"temperature":
|
34
|
+
"temperature": self.vlm_options.temperature,
|
38
35
|
}
|
39
36
|
|
40
37
|
def __call__(
|
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
|
|
56
53
|
if hi_res_image.mode != "RGB":
|
57
54
|
hi_res_image = hi_res_image.convert("RGB")
|
58
55
|
|
56
|
+
if callable(self.vlm_options.prompt):
|
57
|
+
prompt = self.vlm_options.prompt(page.parsed_page)
|
58
|
+
else:
|
59
|
+
prompt = self.vlm_options.prompt
|
60
|
+
|
59
61
|
page_tags = api_image_request(
|
60
62
|
image=hi_res_image,
|
61
|
-
prompt=
|
63
|
+
prompt=prompt,
|
62
64
|
url=self.vlm_options.url,
|
63
65
|
timeout=self.timeout,
|
64
66
|
headers=self.vlm_options.headers,
|
@@ -14,7 +14,8 @@ from PIL import Image
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
17
|
-
from docling.
|
17
|
+
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
18
|
+
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
18
19
|
from docling.models.utils.hf_model_download import download_hf_model
|
19
20
|
from docling.utils.accelerator_utils import decide_device
|
20
21
|
|
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
|
|
32
33
|
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
33
34
|
|
34
35
|
|
35
|
-
class DocumentPictureClassifier(BaseEnrichmentModel):
|
36
|
+
class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
|
36
37
|
"""
|
37
38
|
A model for classifying pictures in documents.
|
38
39
|
|
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
135
136
|
def __call__(
|
136
137
|
self,
|
137
138
|
doc: DoclingDocument,
|
138
|
-
element_batch: Iterable[
|
139
|
+
element_batch: Iterable[ItemAndImageEnrichmentElement],
|
139
140
|
) -> Iterable[NodeItem]:
|
140
141
|
"""
|
141
142
|
Processes a batch of elements and enriches them with classification predictions.
|
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
144
145
|
----------
|
145
146
|
doc : DoclingDocument
|
146
147
|
The document containing the elements to be processed.
|
147
|
-
element_batch : Iterable[
|
148
|
+
element_batch : Iterable[ItemAndImageEnrichmentElement]
|
148
149
|
A batch of pictures to classify.
|
149
150
|
|
150
151
|
Returns
|
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
155
156
|
"""
|
156
157
|
if not self.enabled:
|
157
158
|
for element in element_batch:
|
158
|
-
yield element
|
159
|
+
yield element.item
|
159
160
|
return
|
160
161
|
|
161
162
|
images: List[Union[Image.Image, np.ndarray]] = []
|
162
163
|
elements: List[PictureItem] = []
|
163
164
|
for el in element_batch:
|
164
|
-
assert isinstance(el, PictureItem)
|
165
|
-
elements.append(el)
|
166
|
-
|
167
|
-
assert img is not None
|
168
|
-
images.append(img)
|
165
|
+
assert isinstance(el.item, PictureItem)
|
166
|
+
elements.append(el.item)
|
167
|
+
images.append(el.image)
|
169
168
|
|
170
169
|
outputs = self.document_picture_classifier.predict(images)
|
171
170
|
|
172
|
-
for
|
173
|
-
|
171
|
+
for item, output in zip(elements, outputs):
|
172
|
+
item.annotations.append(
|
174
173
|
PictureClassificationData(
|
175
174
|
provenance="DocumentPictureClassifier",
|
176
175
|
predicted_classes=[
|
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
183
182
|
)
|
184
183
|
)
|
185
184
|
|
186
|
-
yield
|
185
|
+
yield item
|
@@ -12,6 +12,7 @@ from PIL import Image
|
|
12
12
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
13
13
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
14
14
|
from docling.datamodel.document import ConversionResult
|
15
|
+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
|
15
16
|
from docling.datamodel.pipeline_options import LayoutOptions
|
16
17
|
from docling.datamodel.settings import settings
|
17
18
|
from docling.models.base_model import BasePageModel
|
@@ -25,9 +26,6 @@ _log = logging.getLogger(__name__)
|
|
25
26
|
|
26
27
|
|
27
28
|
class LayoutModel(BasePageModel):
|
28
|
-
_model_repo_folder = "ds4sd--docling-models"
|
29
|
-
_model_path = "model_artifacts/layout"
|
30
|
-
|
31
29
|
TEXT_ELEM_LABELS = [
|
32
30
|
DocItemLabel.TEXT,
|
33
31
|
DocItemLabel.FOOTNOTE,
|
@@ -59,25 +57,28 @@ class LayoutModel(BasePageModel):
|
|
59
57
|
self.options = options
|
60
58
|
|
61
59
|
device = decide_device(accelerator_options.device)
|
60
|
+
layout_model_config = options.model_spec
|
61
|
+
model_repo_folder = layout_model_config.model_repo_folder
|
62
|
+
model_path = layout_model_config.model_path
|
62
63
|
|
63
64
|
if artifacts_path is None:
|
64
|
-
artifacts_path =
|
65
|
+
artifacts_path = (
|
66
|
+
self.download_models(layout_model_config=layout_model_config)
|
67
|
+
/ model_path
|
68
|
+
)
|
65
69
|
else:
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
artifacts_path / self._model_repo_folder / self._model_path
|
70
|
-
)
|
71
|
-
elif (artifacts_path / self._model_path).exists():
|
70
|
+
if (artifacts_path / model_repo_folder).exists():
|
71
|
+
artifacts_path = artifacts_path / model_repo_folder / model_path
|
72
|
+
elif (artifacts_path / model_path).exists():
|
72
73
|
warnings.warn(
|
73
74
|
"The usage of artifacts_path containing directly "
|
74
|
-
f"{
|
75
|
+
f"{model_path} is deprecated. Please point "
|
75
76
|
"the artifacts_path to the parent containing "
|
76
|
-
f"the {
|
77
|
+
f"the {model_repo_folder} folder.",
|
77
78
|
DeprecationWarning,
|
78
79
|
stacklevel=3,
|
79
80
|
)
|
80
|
-
artifacts_path = artifacts_path /
|
81
|
+
artifacts_path = artifacts_path / model_path
|
81
82
|
|
82
83
|
self.layout_predictor = LayoutPredictor(
|
83
84
|
artifact_path=str(artifacts_path),
|
@@ -90,10 +91,11 @@ class LayoutModel(BasePageModel):
|
|
90
91
|
local_dir: Optional[Path] = None,
|
91
92
|
force: bool = False,
|
92
93
|
progress: bool = False,
|
94
|
+
layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
|
93
95
|
) -> Path:
|
94
96
|
return download_hf_model(
|
95
|
-
repo_id=
|
96
|
-
revision=
|
97
|
+
repo_id=layout_model_config.repo_id,
|
98
|
+
revision=layout_model_config.revision,
|
97
99
|
local_dir=local_dir,
|
98
100
|
force=force,
|
99
101
|
progress=progress,
|