docling 2.39.0__tar.gz → 2.41.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.39.0 → docling-2.41.0}/PKG-INFO +5 -5
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docling_parse_v4_backend.py +14 -4
- {docling-2.39.0 → docling-2.41.0}/docling/backend/msexcel_backend.py +33 -14
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/asr_model_specs.py +6 -6
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/base_models.py +23 -1
- docling-2.41.0/docling/datamodel/layout_model_specs.py +90 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/pipeline_options.py +18 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/pipeline_options_vlm_model.py +11 -3
- {docling-2.39.0 → docling-2.41.0}/docling/models/api_vlm_model.py +7 -5
- {docling-2.39.0 → docling-2.41.0}/docling/models/base_ocr_model.py +6 -2
- {docling-2.39.0 → docling-2.41.0}/docling/models/document_picture_classifier.py +12 -13
- {docling-2.39.0 → docling-2.41.0}/docling/models/layout_model.py +27 -18
- {docling-2.39.0 → docling-2.41.0}/docling/models/picture_description_vlm_model.py +16 -11
- docling-2.41.0/docling/models/plugins/defaults.py +28 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/readingorder_model.py +8 -1
- {docling-2.39.0 → docling-2.41.0}/docling/models/table_structure_model.py +3 -1
- {docling-2.39.0 → docling-2.41.0}/docling/models/tesseract_ocr_model.py +10 -4
- {docling-2.39.0 → docling-2.41.0}/docling/models/vlm_models_inline/hf_transformers_model.py +39 -20
- {docling-2.39.0 → docling-2.41.0}/docling/models/vlm_models_inline/mlx_model.py +5 -3
- {docling-2.39.0 → docling-2.41.0}/docling/pipeline/standard_pdf_pipeline.py +3 -3
- {docling-2.39.0 → docling-2.41.0}/docling/pipeline/vlm_pipeline.py +1 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/accelerator_utils.py +2 -2
- {docling-2.39.0 → docling-2.41.0}/docling/utils/layout_postprocessor.py +7 -2
- {docling-2.39.0 → docling-2.41.0}/docling/utils/model_downloader.py +2 -1
- {docling-2.39.0 → docling-2.41.0}/docling/utils/ocr_utils.py +1 -1
- {docling-2.39.0 → docling-2.41.0}/docling/utils/orientation.py +22 -28
- {docling-2.39.0 → docling-2.41.0}/docling.egg-info/PKG-INFO +5 -5
- {docling-2.39.0 → docling-2.41.0}/docling.egg-info/SOURCES.txt +2 -0
- {docling-2.39.0 → docling-2.41.0}/docling.egg-info/requires.txt +4 -4
- {docling-2.39.0 → docling-2.41.0}/pyproject.toml +5 -6
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_docling_parse_v4.py +17 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_markdown.py +6 -1
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_patent_uspto.py +11 -3
- {docling-2.39.0 → docling-2.41.0}/tests/test_document_picture_classifier.py +2 -1
- {docling-2.39.0 → docling-2.41.0}/tests/test_e2e_conversion.py +2 -8
- {docling-2.39.0 → docling-2.41.0}/tests/test_e2e_ocr_conversion.py +5 -10
- {docling-2.39.0 → docling-2.41.0}/tests/test_interfaces.py +2 -9
- {docling-2.39.0 → docling-2.41.0}/tests/test_legacy_format_transform.py +1 -0
- docling-2.41.0/tests/test_ocr_utils.py +80 -0
- docling-2.39.0/docling/models/plugins/defaults.py +0 -28
- {docling-2.39.0 → docling-2.41.0}/LICENSE +0 -0
- {docling-2.39.0 → docling-2.41.0}/README.md +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/html_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/md_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/chunking/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/cli/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/cli/main.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/cli/models.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/cli/tools.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/document.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/settings.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/document_converter.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/exceptions.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/base_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/py.typed +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/__init__.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/export.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/locks.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/profiling.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/utils.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling/utils/visualization.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.39.0 → docling-2.41.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.39.0 → docling-2.41.0}/setup.cfg +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_csv.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_html.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_jats.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_msword.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_backend_webp.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_cli.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_code_formula.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_input_doc.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_invalid_input.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_options.py +0 -0
- {docling-2.39.0 → docling-2.41.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.39.0
|
3
|
+
Version: 2.41.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,9 +26,9 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
30
|
-
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
31
30
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
31
|
+
Requires-Dist: docling-ibm-models<4,>=3.6.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2<5.0.0,>=4.30.0
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
@@ -57,12 +57,12 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
57
57
|
Provides-Extra: vlm
|
58
58
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
59
59
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
60
|
-
Requires-Dist: mlx-vlm
|
60
|
+
Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
61
61
|
Provides-Extra: rapidocr
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
64
64
|
Provides-Extra: asr
|
65
|
-
Requires-Dist: openai-whisper>=
|
65
|
+
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
66
66
|
Dynamic: license-file
|
67
67
|
|
68
68
|
<p align="center">
|
@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
187
187
|
|
188
188
|
def unload(self):
|
189
189
|
super().unload()
|
190
|
-
|
191
|
-
|
192
|
-
self.
|
193
|
-
|
190
|
+
# Unload docling-parse document first
|
191
|
+
if self.dp_doc is not None:
|
192
|
+
self.dp_doc.unload()
|
193
|
+
self.dp_doc = None
|
194
|
+
|
195
|
+
# Then close pypdfium2 document with proper locking
|
196
|
+
if self._pdoc is not None:
|
197
|
+
with pypdfium2_lock:
|
198
|
+
try:
|
199
|
+
self._pdoc.close()
|
200
|
+
except Exception:
|
201
|
+
# Ignore cleanup errors
|
202
|
+
pass
|
203
|
+
self._pdoc = None
|
@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
337
337
|
# Collect the data within the bounds
|
338
338
|
data = []
|
339
339
|
visited_cells: set[tuple[int, int]] = set()
|
340
|
-
for ri in
|
341
|
-
|
342
|
-
|
343
|
-
|
340
|
+
for ri, row in enumerate(
|
341
|
+
sheet.iter_rows(
|
342
|
+
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
|
343
|
+
max_row=max_row + 1,
|
344
|
+
min_col=start_col + 1,
|
345
|
+
max_col=max_col + 1,
|
346
|
+
values_only=False,
|
347
|
+
),
|
348
|
+
start_row,
|
349
|
+
):
|
350
|
+
for rj, cell in enumerate(row, start_col):
|
344
351
|
# Check if the cell belongs to a merged range
|
345
352
|
row_span = 1
|
346
353
|
col_span = 1
|
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
397
404
|
"""
|
398
405
|
max_row: int = start_row
|
399
406
|
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
407
|
+
for ri, (cell,) in enumerate(
|
408
|
+
sheet.iter_rows(
|
409
|
+
min_row=start_row + 2,
|
410
|
+
max_row=sheet.max_row,
|
411
|
+
min_col=start_col + 1,
|
412
|
+
max_col=start_col + 1,
|
413
|
+
values_only=False,
|
414
|
+
),
|
415
|
+
start_row + 1,
|
416
|
+
):
|
404
417
|
# Check if the cell is part of a merged range
|
405
418
|
merged_range = next(
|
406
419
|
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
414
427
|
if merged_range:
|
415
428
|
max_row = max(max_row, merged_range.max_row - 1)
|
416
429
|
else:
|
417
|
-
max_row
|
430
|
+
max_row = ri
|
418
431
|
|
419
432
|
return max_row
|
420
433
|
|
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
433
446
|
"""
|
434
447
|
max_col: int = start_col
|
435
448
|
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
449
|
+
for rj, (cell,) in enumerate(
|
450
|
+
sheet.iter_cols(
|
451
|
+
min_row=start_row + 1,
|
452
|
+
max_row=start_row + 1,
|
453
|
+
min_col=start_col + 2,
|
454
|
+
max_col=sheet.max_column,
|
455
|
+
values_only=False,
|
456
|
+
),
|
457
|
+
start_col + 1,
|
458
|
+
):
|
440
459
|
# Check if the cell is part of a merged range
|
441
460
|
merged_range = next(
|
442
461
|
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
450
469
|
if merged_range:
|
451
470
|
max_col = max(max_col, merged_range.max_col - 1)
|
452
471
|
else:
|
453
|
-
max_col
|
472
|
+
max_col = rj
|
454
473
|
|
455
474
|
return max_col
|
456
475
|
|
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
|
|
22
22
|
verbose=True,
|
23
23
|
timestamps=True,
|
24
24
|
word_timestamps=True,
|
25
|
-
|
25
|
+
temperature=0.0,
|
26
26
|
max_new_tokens=256,
|
27
27
|
max_time_chunk=30.0,
|
28
28
|
)
|
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
|
|
33
33
|
verbose=True,
|
34
34
|
timestamps=True,
|
35
35
|
word_timestamps=True,
|
36
|
-
|
36
|
+
temperature=0.0,
|
37
37
|
max_new_tokens=256,
|
38
38
|
max_time_chunk=30.0,
|
39
39
|
)
|
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
|
|
44
44
|
verbose=True,
|
45
45
|
timestamps=True,
|
46
46
|
word_timestamps=True,
|
47
|
-
|
47
|
+
temperature=0.0,
|
48
48
|
max_new_tokens=256,
|
49
49
|
max_time_chunk=30.0,
|
50
50
|
)
|
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
|
|
55
55
|
verbose=True,
|
56
56
|
timestamps=True,
|
57
57
|
word_timestamps=True,
|
58
|
-
|
58
|
+
temperature=0.0,
|
59
59
|
max_new_tokens=256,
|
60
60
|
max_time_chunk=30.0,
|
61
61
|
)
|
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
|
|
66
66
|
verbose=True,
|
67
67
|
timestamps=True,
|
68
68
|
word_timestamps=True,
|
69
|
-
|
69
|
+
temperature=0.0,
|
70
70
|
max_new_tokens=256,
|
71
71
|
max_time_chunk=30.0,
|
72
72
|
)
|
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
|
|
77
77
|
verbose=True,
|
78
78
|
timestamps=True,
|
79
79
|
word_timestamps=True,
|
80
|
-
|
80
|
+
temperature=0.0,
|
81
81
|
max_new_tokens=256,
|
82
82
|
max_time_chunk=30.0,
|
83
83
|
)
|
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
|
|
12
12
|
Size,
|
13
13
|
TableCell,
|
14
14
|
)
|
15
|
+
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
|
15
16
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
16
17
|
from docling_core.types.io import (
|
17
18
|
DocumentStream,
|
@@ -19,7 +20,14 @@ from docling_core.types.io import (
|
|
19
20
|
|
20
21
|
# DO NOT REMOVE; explicitly exposed from this location
|
21
22
|
from PIL.Image import Image
|
22
|
-
from pydantic import
|
23
|
+
from pydantic import (
|
24
|
+
BaseModel,
|
25
|
+
ConfigDict,
|
26
|
+
Field,
|
27
|
+
FieldSerializationInfo,
|
28
|
+
computed_field,
|
29
|
+
field_serializer,
|
30
|
+
)
|
23
31
|
|
24
32
|
if TYPE_CHECKING:
|
25
33
|
from docling.backend.pdf_backend import PdfPageBackend
|
@@ -142,6 +150,10 @@ class Cluster(BaseModel):
|
|
142
150
|
cells: List[TextCell] = []
|
143
151
|
children: List["Cluster"] = [] # Add child cluster support
|
144
152
|
|
153
|
+
@field_serializer("confidence")
|
154
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
155
|
+
return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
156
|
+
|
145
157
|
|
146
158
|
class BasePageElement(BaseModel):
|
147
159
|
label: DocItemLabel
|
@@ -194,6 +206,16 @@ class FigureElement(BasePageElement):
|
|
194
206
|
predicted_class: Optional[str] = None
|
195
207
|
confidence: Optional[float] = None
|
196
208
|
|
209
|
+
@field_serializer("confidence")
|
210
|
+
def _serialize(
|
211
|
+
self, value: Optional[float], info: FieldSerializationInfo
|
212
|
+
) -> Optional[float]:
|
213
|
+
return (
|
214
|
+
round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
215
|
+
if value is not None
|
216
|
+
else None
|
217
|
+
)
|
218
|
+
|
197
219
|
|
198
220
|
class FigureClassificationPrediction(BaseModel):
|
199
221
|
figure_count: int = 0
|
@@ -0,0 +1,90 @@
|
|
1
|
+
import logging
|
2
|
+
from enum import Enum
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from pydantic import BaseModel
|
7
|
+
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class LayoutModelConfig(BaseModel):
|
14
|
+
name: str
|
15
|
+
repo_id: str
|
16
|
+
revision: str
|
17
|
+
model_path: str
|
18
|
+
supported_devices: list[AcceleratorDevice] = [
|
19
|
+
AcceleratorDevice.CPU,
|
20
|
+
AcceleratorDevice.CUDA,
|
21
|
+
AcceleratorDevice.MPS,
|
22
|
+
]
|
23
|
+
|
24
|
+
@property
|
25
|
+
def model_repo_folder(self) -> str:
|
26
|
+
return self.repo_id.replace("/", "--")
|
27
|
+
|
28
|
+
|
29
|
+
# HuggingFace Layout Models
|
30
|
+
|
31
|
+
# Default Docling Layout Model
|
32
|
+
DOCLING_LAYOUT_V2 = LayoutModelConfig(
|
33
|
+
name="docling_layout_v2",
|
34
|
+
repo_id="ds4sd/docling-layout-old",
|
35
|
+
revision="main",
|
36
|
+
model_path="",
|
37
|
+
)
|
38
|
+
|
39
|
+
DOCLING_LAYOUT_HERON = LayoutModelConfig(
|
40
|
+
name="docling_layout_heron",
|
41
|
+
repo_id="ds4sd/docling-layout-heron",
|
42
|
+
revision="main",
|
43
|
+
model_path="",
|
44
|
+
)
|
45
|
+
|
46
|
+
DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
|
47
|
+
name="docling_layout_heron_101",
|
48
|
+
repo_id="ds4sd/docling-layout-heron-101",
|
49
|
+
revision="main",
|
50
|
+
model_path="",
|
51
|
+
)
|
52
|
+
|
53
|
+
DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
|
54
|
+
name="docling_layout_egret_medium",
|
55
|
+
repo_id="ds4sd/docling-layout-egret-medium",
|
56
|
+
revision="main",
|
57
|
+
model_path="",
|
58
|
+
)
|
59
|
+
|
60
|
+
DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
|
61
|
+
name="docling_layout_egret_large",
|
62
|
+
repo_id="ds4sd/docling-layout-egret-large",
|
63
|
+
revision="main",
|
64
|
+
model_path="",
|
65
|
+
)
|
66
|
+
|
67
|
+
DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
|
68
|
+
name="docling_layout_egret_xlarge",
|
69
|
+
repo_id="ds4sd/docling-layout-egret-xlarge",
|
70
|
+
revision="main",
|
71
|
+
model_path="",
|
72
|
+
)
|
73
|
+
|
74
|
+
# Example for a hypothetical alternative model
|
75
|
+
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
|
76
|
+
# name="alternative_layout",
|
77
|
+
# repo_id="someorg/alternative-layout",
|
78
|
+
# revision="main",
|
79
|
+
# model_path="model_artifacts/layout_alt",
|
80
|
+
# )
|
81
|
+
|
82
|
+
|
83
|
+
class LayoutModelType(str, Enum):
|
84
|
+
DOCLING_LAYOUT_V2 = "docling_layout_v2"
|
85
|
+
DOCLING_LAYOUT_HERON = "docling_layout_heron"
|
86
|
+
DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
|
87
|
+
DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
|
88
|
+
DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
|
89
|
+
DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
|
90
|
+
# ALTERNATIVE_LAYOUT = "alternative_layout"
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
from datetime import datetime
|
2
3
|
from enum import Enum
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
@@ -15,6 +16,15 @@ from docling.datamodel import asr_model_specs
|
|
15
16
|
|
16
17
|
# Import the following for backwards compatibility
|
17
18
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
19
|
+
from docling.datamodel.layout_model_specs import (
|
20
|
+
DOCLING_LAYOUT_EGRET_LARGE,
|
21
|
+
DOCLING_LAYOUT_EGRET_MEDIUM,
|
22
|
+
DOCLING_LAYOUT_EGRET_XLARGE,
|
23
|
+
DOCLING_LAYOUT_HERON,
|
24
|
+
DOCLING_LAYOUT_HERON_101,
|
25
|
+
DOCLING_LAYOUT_V2,
|
26
|
+
LayoutModelConfig,
|
27
|
+
)
|
18
28
|
from docling.datamodel.pipeline_options_asr_model import (
|
19
29
|
InlineAsrOptions,
|
20
30
|
)
|
@@ -265,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
265
275
|
)
|
266
276
|
|
267
277
|
|
278
|
+
class LayoutOptions(BaseModel):
|
279
|
+
"""Options for layout processing."""
|
280
|
+
|
281
|
+
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
282
|
+
model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
|
283
|
+
|
284
|
+
|
268
285
|
class AsrPipelineOptions(PipelineOptions):
|
269
286
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
270
287
|
artifacts_path: Optional[Union[Path, str]] = None
|
@@ -289,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
289
306
|
picture_description_options: PictureDescriptionBaseOptions = (
|
290
307
|
smolvlm_picture_description
|
291
308
|
)
|
309
|
+
layout_options: LayoutOptions = LayoutOptions()
|
292
310
|
|
293
311
|
images_scale: float = 1.0
|
294
312
|
generate_page_images: bool = False
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any, Dict, List, Literal, Optional, Union
|
2
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
3
3
|
|
4
|
+
from docling_core.types.doc.page import SegmentedPage
|
4
5
|
from pydantic import AnyUrl, BaseModel
|
5
6
|
from typing_extensions import deprecated
|
6
7
|
|
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
9
10
|
|
10
11
|
class BaseVlmOptions(BaseModel):
|
11
12
|
kind: str
|
12
|
-
prompt: str
|
13
|
+
prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
|
13
14
|
scale: float = 2.0
|
14
15
|
max_size: Optional[int] = None
|
16
|
+
temperature: float = 0.0
|
15
17
|
|
16
18
|
|
17
19
|
class ResponseFormat(str, Enum):
|
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
|
|
29
31
|
AUTOMODEL = "automodel"
|
30
32
|
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
31
33
|
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
34
|
+
AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
|
35
|
+
|
36
|
+
|
37
|
+
class TransformersPromptStyle(str, Enum):
|
38
|
+
CHAT = "chat"
|
39
|
+
RAW = "raw"
|
32
40
|
|
33
41
|
|
34
42
|
class InlineVlmOptions(BaseVlmOptions):
|
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
42
50
|
|
43
51
|
inference_framework: InferenceFramework
|
44
52
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
53
|
+
transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
|
45
54
|
response_format: ResponseFormat
|
46
55
|
|
47
56
|
torch_dtype: Optional[str] = None
|
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
51
60
|
AcceleratorDevice.MPS,
|
52
61
|
]
|
53
62
|
|
54
|
-
temperature: float = 0.0
|
55
63
|
stop_strings: List[str] = []
|
56
64
|
extra_generation_config: Dict[str, Any] = {}
|
57
65
|
|
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
|
|
29
29
|
|
30
30
|
self.timeout = self.vlm_options.timeout
|
31
31
|
self.concurrency = self.vlm_options.concurrency
|
32
|
-
self.prompt_content = (
|
33
|
-
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
34
|
-
)
|
35
32
|
self.params = {
|
36
33
|
**self.vlm_options.params,
|
37
|
-
"temperature":
|
34
|
+
"temperature": self.vlm_options.temperature,
|
38
35
|
}
|
39
36
|
|
40
37
|
def __call__(
|
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
|
|
56
53
|
if hi_res_image.mode != "RGB":
|
57
54
|
hi_res_image = hi_res_image.convert("RGB")
|
58
55
|
|
56
|
+
if callable(self.vlm_options.prompt):
|
57
|
+
prompt = self.vlm_options.prompt(page.parsed_page)
|
58
|
+
else:
|
59
|
+
prompt = self.vlm_options.prompt
|
60
|
+
|
59
61
|
page_tags = api_image_request(
|
60
62
|
image=hi_res_image,
|
61
|
-
prompt=
|
63
|
+
prompt=prompt,
|
62
64
|
url=self.vlm_options.url,
|
63
65
|
timeout=self.timeout,
|
64
66
|
headers=self.vlm_options.headers,
|
@@ -3,14 +3,13 @@ import logging
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
from collections.abc import Iterable
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import List, Optional, Type
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Type
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
10
10
|
from docling_core.types.doc.page import TextCell
|
11
11
|
from PIL import Image, ImageDraw
|
12
12
|
from rtree import index
|
13
|
-
from scipy.ndimage import binary_dilation, find_objects, label
|
14
13
|
|
15
14
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
16
15
|
from docling.datamodel.base_models import Page
|
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
31
30
|
options: OcrOptions,
|
32
31
|
accelerator_options: AcceleratorOptions,
|
33
32
|
):
|
33
|
+
# Make sure any delay/error from import occurs on ocr model init and not first use
|
34
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
35
|
+
|
34
36
|
self.enabled = enabled
|
35
37
|
self.options = options
|
36
38
|
|
37
39
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
38
40
|
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
|
41
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
42
|
+
|
39
43
|
BITMAP_COVERAGE_TRESHOLD = 0.75
|
40
44
|
assert page.size is not None
|
41
45
|
|
@@ -14,7 +14,8 @@ from PIL import Image
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
17
|
-
from docling.
|
17
|
+
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
18
|
+
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
18
19
|
from docling.models.utils.hf_model_download import download_hf_model
|
19
20
|
from docling.utils.accelerator_utils import decide_device
|
20
21
|
|
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
|
|
32
33
|
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
33
34
|
|
34
35
|
|
35
|
-
class DocumentPictureClassifier(BaseEnrichmentModel):
|
36
|
+
class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
|
36
37
|
"""
|
37
38
|
A model for classifying pictures in documents.
|
38
39
|
|
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
135
136
|
def __call__(
|
136
137
|
self,
|
137
138
|
doc: DoclingDocument,
|
138
|
-
element_batch: Iterable[
|
139
|
+
element_batch: Iterable[ItemAndImageEnrichmentElement],
|
139
140
|
) -> Iterable[NodeItem]:
|
140
141
|
"""
|
141
142
|
Processes a batch of elements and enriches them with classification predictions.
|
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
144
145
|
----------
|
145
146
|
doc : DoclingDocument
|
146
147
|
The document containing the elements to be processed.
|
147
|
-
element_batch : Iterable[
|
148
|
+
element_batch : Iterable[ItemAndImageEnrichmentElement]
|
148
149
|
A batch of pictures to classify.
|
149
150
|
|
150
151
|
Returns
|
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
155
156
|
"""
|
156
157
|
if not self.enabled:
|
157
158
|
for element in element_batch:
|
158
|
-
yield element
|
159
|
+
yield element.item
|
159
160
|
return
|
160
161
|
|
161
162
|
images: List[Union[Image.Image, np.ndarray]] = []
|
162
163
|
elements: List[PictureItem] = []
|
163
164
|
for el in element_batch:
|
164
|
-
assert isinstance(el, PictureItem)
|
165
|
-
elements.append(el)
|
166
|
-
|
167
|
-
assert img is not None
|
168
|
-
images.append(img)
|
165
|
+
assert isinstance(el.item, PictureItem)
|
166
|
+
elements.append(el.item)
|
167
|
+
images.append(el.image)
|
169
168
|
|
170
169
|
outputs = self.document_picture_classifier.predict(images)
|
171
170
|
|
172
|
-
for
|
173
|
-
|
171
|
+
for item, output in zip(elements, outputs):
|
172
|
+
item.annotations.append(
|
174
173
|
PictureClassificationData(
|
175
174
|
provenance="DocumentPictureClassifier",
|
176
175
|
predicted_classes=[
|
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|
183
182
|
)
|
184
183
|
)
|
185
184
|
|
186
|
-
yield
|
185
|
+
yield item
|
@@ -7,12 +7,13 @@ from typing import Optional
|
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import DocItemLabel
|
10
|
-
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
11
10
|
from PIL import Image
|
12
11
|
|
13
12
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
14
13
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
15
14
|
from docling.datamodel.document import ConversionResult
|
15
|
+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
|
16
|
+
from docling.datamodel.pipeline_options import LayoutOptions
|
16
17
|
from docling.datamodel.settings import settings
|
17
18
|
from docling.models.base_model import BasePageModel
|
18
19
|
from docling.models.utils.hf_model_download import download_hf_model
|
@@ -25,9 +26,6 @@ _log = logging.getLogger(__name__)
|
|
25
26
|
|
26
27
|
|
27
28
|
class LayoutModel(BasePageModel):
|
28
|
-
_model_repo_folder = "ds4sd--docling-models"
|
29
|
-
_model_path = "model_artifacts/layout"
|
30
|
-
|
31
29
|
TEXT_ELEM_LABELS = [
|
32
30
|
DocItemLabel.TEXT,
|
33
31
|
DocItemLabel.FOOTNOTE,
|
@@ -49,28 +47,38 @@ class LayoutModel(BasePageModel):
|
|
49
47
|
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
50
48
|
|
51
49
|
def __init__(
|
52
|
-
self,
|
50
|
+
self,
|
51
|
+
artifacts_path: Optional[Path],
|
52
|
+
accelerator_options: AcceleratorOptions,
|
53
|
+
options: LayoutOptions,
|
53
54
|
):
|
55
|
+
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
56
|
+
|
57
|
+
self.options = options
|
58
|
+
|
54
59
|
device = decide_device(accelerator_options.device)
|
60
|
+
layout_model_config = options.model_spec
|
61
|
+
model_repo_folder = layout_model_config.model_repo_folder
|
62
|
+
model_path = layout_model_config.model_path
|
55
63
|
|
56
64
|
if artifacts_path is None:
|
57
|
-
artifacts_path =
|
65
|
+
artifacts_path = (
|
66
|
+
self.download_models(layout_model_config=layout_model_config)
|
67
|
+
/ model_path
|
68
|
+
)
|
58
69
|
else:
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
artifacts_path / self._model_repo_folder / self._model_path
|
63
|
-
)
|
64
|
-
elif (artifacts_path / self._model_path).exists():
|
70
|
+
if (artifacts_path / model_repo_folder).exists():
|
71
|
+
artifacts_path = artifacts_path / model_repo_folder / model_path
|
72
|
+
elif (artifacts_path / model_path).exists():
|
65
73
|
warnings.warn(
|
66
74
|
"The usage of artifacts_path containing directly "
|
67
|
-
f"{
|
75
|
+
f"{model_path} is deprecated. Please point "
|
68
76
|
"the artifacts_path to the parent containing "
|
69
|
-
f"the {
|
77
|
+
f"the {model_repo_folder} folder.",
|
70
78
|
DeprecationWarning,
|
71
79
|
stacklevel=3,
|
72
80
|
)
|
73
|
-
artifacts_path = artifacts_path /
|
81
|
+
artifacts_path = artifacts_path / model_path
|
74
82
|
|
75
83
|
self.layout_predictor = LayoutPredictor(
|
76
84
|
artifact_path=str(artifacts_path),
|
@@ -83,10 +91,11 @@ class LayoutModel(BasePageModel):
|
|
83
91
|
local_dir: Optional[Path] = None,
|
84
92
|
force: bool = False,
|
85
93
|
progress: bool = False,
|
94
|
+
layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
|
86
95
|
) -> Path:
|
87
96
|
return download_hf_model(
|
88
|
-
repo_id=
|
89
|
-
revision=
|
97
|
+
repo_id=layout_model_config.repo_id,
|
98
|
+
revision=layout_model_config.revision,
|
90
99
|
local_dir=local_dir,
|
91
100
|
force=force,
|
92
101
|
progress=progress,
|
@@ -176,7 +185,7 @@ class LayoutModel(BasePageModel):
|
|
176
185
|
# Apply postprocessing
|
177
186
|
|
178
187
|
processed_clusters, processed_cells = LayoutPostprocessor(
|
179
|
-
page, clusters
|
188
|
+
page, clusters, self.options
|
180
189
|
).postprocess()
|
181
190
|
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
182
191
|
|