docling 2.40.0__tar.gz → 2.42.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. {docling-2.40.0 → docling-2.42.0}/PKG-INFO +4 -3
  2. {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/latex/omml.py +9 -1
  3. {docling-2.40.0 → docling-2.42.0}/docling/backend/html_backend.py +25 -17
  4. {docling-2.40.0 → docling-2.42.0}/docling/backend/xml/jats_backend.py +12 -4
  5. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/asr_model_specs.py +6 -6
  6. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/base_models.py +23 -1
  7. docling-2.42.0/docling/datamodel/layout_model_specs.py +90 -0
  8. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/pipeline_options.py +14 -1
  9. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/pipeline_options_vlm_model.py +11 -3
  10. {docling-2.40.0 → docling-2.42.0}/docling/document_converter.py +14 -11
  11. {docling-2.40.0 → docling-2.42.0}/docling/models/api_vlm_model.py +7 -5
  12. {docling-2.40.0 → docling-2.42.0}/docling/models/document_picture_classifier.py +12 -13
  13. {docling-2.40.0 → docling-2.42.0}/docling/models/layout_model.py +17 -15
  14. {docling-2.40.0 → docling-2.42.0}/docling/models/picture_description_vlm_model.py +2 -1
  15. {docling-2.40.0 → docling-2.42.0}/docling/models/vlm_models_inline/hf_transformers_model.py +39 -20
  16. {docling-2.40.0 → docling-2.42.0}/docling/models/vlm_models_inline/mlx_model.py +5 -3
  17. {docling-2.40.0 → docling-2.42.0}/docling/pipeline/standard_pdf_pipeline.py +2 -3
  18. {docling-2.40.0 → docling-2.42.0}/docling/pipeline/vlm_pipeline.py +1 -0
  19. {docling-2.40.0 → docling-2.42.0}/docling/utils/layout_postprocessor.py +3 -2
  20. {docling-2.40.0 → docling-2.42.0}/docling/utils/model_downloader.py +2 -1
  21. {docling-2.40.0 → docling-2.42.0}/docling/utils/ocr_utils.py +1 -1
  22. {docling-2.40.0 → docling-2.42.0}/docling/utils/orientation.py +22 -28
  23. {docling-2.40.0 → docling-2.42.0}/docling.egg-info/PKG-INFO +4 -3
  24. {docling-2.40.0 → docling-2.42.0}/docling.egg-info/SOURCES.txt +2 -0
  25. {docling-2.40.0 → docling-2.42.0}/docling.egg-info/requires.txt +3 -2
  26. {docling-2.40.0 → docling-2.42.0}/pyproject.toml +5 -5
  27. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_jats.py +14 -14
  28. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_markdown.py +6 -1
  29. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_patent_uspto.py +11 -3
  30. {docling-2.40.0 → docling-2.42.0}/tests/test_document_picture_classifier.py +2 -1
  31. {docling-2.40.0 → docling-2.42.0}/tests/test_e2e_conversion.py +2 -8
  32. {docling-2.40.0 → docling-2.42.0}/tests/test_e2e_ocr_conversion.py +5 -10
  33. {docling-2.40.0 → docling-2.42.0}/tests/test_interfaces.py +2 -9
  34. {docling-2.40.0 → docling-2.42.0}/tests/test_legacy_format_transform.py +1 -0
  35. docling-2.42.0/tests/test_ocr_utils.py +80 -0
  36. {docling-2.40.0 → docling-2.42.0}/LICENSE +0 -0
  37. {docling-2.40.0 → docling-2.42.0}/README.md +0 -0
  38. {docling-2.40.0 → docling-2.42.0}/docling/__init__.py +0 -0
  39. {docling-2.40.0 → docling-2.42.0}/docling/backend/__init__.py +0 -0
  40. {docling-2.40.0 → docling-2.42.0}/docling/backend/abstract_backend.py +0 -0
  41. {docling-2.40.0 → docling-2.42.0}/docling/backend/asciidoc_backend.py +0 -0
  42. {docling-2.40.0 → docling-2.42.0}/docling/backend/csv_backend.py +0 -0
  43. {docling-2.40.0 → docling-2.42.0}/docling/backend/docling_parse_backend.py +0 -0
  44. {docling-2.40.0 → docling-2.42.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  45. {docling-2.40.0 → docling-2.42.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  46. {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/__init__.py +0 -0
  47. {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/latex/__init__.py +0 -0
  48. {docling-2.40.0 → docling-2.42.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  49. {docling-2.40.0 → docling-2.42.0}/docling/backend/json/__init__.py +0 -0
  50. {docling-2.40.0 → docling-2.42.0}/docling/backend/json/docling_json_backend.py +0 -0
  51. {docling-2.40.0 → docling-2.42.0}/docling/backend/md_backend.py +0 -0
  52. {docling-2.40.0 → docling-2.42.0}/docling/backend/msexcel_backend.py +0 -0
  53. {docling-2.40.0 → docling-2.42.0}/docling/backend/mspowerpoint_backend.py +0 -0
  54. {docling-2.40.0 → docling-2.42.0}/docling/backend/msword_backend.py +0 -0
  55. {docling-2.40.0 → docling-2.42.0}/docling/backend/noop_backend.py +0 -0
  56. {docling-2.40.0 → docling-2.42.0}/docling/backend/pdf_backend.py +0 -0
  57. {docling-2.40.0 → docling-2.42.0}/docling/backend/pypdfium2_backend.py +0 -0
  58. {docling-2.40.0 → docling-2.42.0}/docling/backend/xml/__init__.py +0 -0
  59. {docling-2.40.0 → docling-2.42.0}/docling/backend/xml/uspto_backend.py +0 -0
  60. {docling-2.40.0 → docling-2.42.0}/docling/chunking/__init__.py +0 -0
  61. {docling-2.40.0 → docling-2.42.0}/docling/cli/__init__.py +0 -0
  62. {docling-2.40.0 → docling-2.42.0}/docling/cli/main.py +0 -0
  63. {docling-2.40.0 → docling-2.42.0}/docling/cli/models.py +0 -0
  64. {docling-2.40.0 → docling-2.42.0}/docling/cli/tools.py +0 -0
  65. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/__init__.py +0 -0
  66. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/accelerator_options.py +0 -0
  67. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/document.py +0 -0
  68. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  69. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/settings.py +0 -0
  70. {docling-2.40.0 → docling-2.42.0}/docling/datamodel/vlm_model_specs.py +0 -0
  71. {docling-2.40.0 → docling-2.42.0}/docling/exceptions.py +0 -0
  72. {docling-2.40.0 → docling-2.42.0}/docling/models/__init__.py +0 -0
  73. {docling-2.40.0 → docling-2.42.0}/docling/models/base_model.py +0 -0
  74. {docling-2.40.0 → docling-2.42.0}/docling/models/base_ocr_model.py +0 -0
  75. {docling-2.40.0 → docling-2.42.0}/docling/models/code_formula_model.py +0 -0
  76. {docling-2.40.0 → docling-2.42.0}/docling/models/easyocr_model.py +0 -0
  77. {docling-2.40.0 → docling-2.42.0}/docling/models/factories/__init__.py +0 -0
  78. {docling-2.40.0 → docling-2.42.0}/docling/models/factories/base_factory.py +0 -0
  79. {docling-2.40.0 → docling-2.42.0}/docling/models/factories/ocr_factory.py +0 -0
  80. {docling-2.40.0 → docling-2.42.0}/docling/models/factories/picture_description_factory.py +0 -0
  81. {docling-2.40.0 → docling-2.42.0}/docling/models/ocr_mac_model.py +0 -0
  82. {docling-2.40.0 → docling-2.42.0}/docling/models/page_assemble_model.py +0 -0
  83. {docling-2.40.0 → docling-2.42.0}/docling/models/page_preprocessing_model.py +0 -0
  84. {docling-2.40.0 → docling-2.42.0}/docling/models/picture_description_api_model.py +0 -0
  85. {docling-2.40.0 → docling-2.42.0}/docling/models/picture_description_base_model.py +0 -0
  86. {docling-2.40.0 → docling-2.42.0}/docling/models/plugins/__init__.py +0 -0
  87. {docling-2.40.0 → docling-2.42.0}/docling/models/plugins/defaults.py +0 -0
  88. {docling-2.40.0 → docling-2.42.0}/docling/models/rapid_ocr_model.py +0 -0
  89. {docling-2.40.0 → docling-2.42.0}/docling/models/readingorder_model.py +0 -0
  90. {docling-2.40.0 → docling-2.42.0}/docling/models/table_structure_model.py +0 -0
  91. {docling-2.40.0 → docling-2.42.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  92. {docling-2.40.0 → docling-2.42.0}/docling/models/tesseract_ocr_model.py +0 -0
  93. {docling-2.40.0 → docling-2.42.0}/docling/models/utils/__init__.py +0 -0
  94. {docling-2.40.0 → docling-2.42.0}/docling/models/utils/hf_model_download.py +0 -0
  95. {docling-2.40.0 → docling-2.42.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  96. {docling-2.40.0 → docling-2.42.0}/docling/pipeline/__init__.py +0 -0
  97. {docling-2.40.0 → docling-2.42.0}/docling/pipeline/asr_pipeline.py +0 -0
  98. {docling-2.40.0 → docling-2.42.0}/docling/pipeline/base_pipeline.py +0 -0
  99. {docling-2.40.0 → docling-2.42.0}/docling/pipeline/simple_pipeline.py +0 -0
  100. {docling-2.40.0 → docling-2.42.0}/docling/py.typed +0 -0
  101. {docling-2.40.0 → docling-2.42.0}/docling/utils/__init__.py +0 -0
  102. {docling-2.40.0 → docling-2.42.0}/docling/utils/accelerator_utils.py +0 -0
  103. {docling-2.40.0 → docling-2.42.0}/docling/utils/api_image_request.py +0 -0
  104. {docling-2.40.0 → docling-2.42.0}/docling/utils/export.py +0 -0
  105. {docling-2.40.0 → docling-2.42.0}/docling/utils/glm_utils.py +0 -0
  106. {docling-2.40.0 → docling-2.42.0}/docling/utils/locks.py +0 -0
  107. {docling-2.40.0 → docling-2.42.0}/docling/utils/profiling.py +0 -0
  108. {docling-2.40.0 → docling-2.42.0}/docling/utils/utils.py +0 -0
  109. {docling-2.40.0 → docling-2.42.0}/docling/utils/visualization.py +0 -0
  110. {docling-2.40.0 → docling-2.42.0}/docling.egg-info/dependency_links.txt +0 -0
  111. {docling-2.40.0 → docling-2.42.0}/docling.egg-info/entry_points.txt +0 -0
  112. {docling-2.40.0 → docling-2.42.0}/docling.egg-info/top_level.txt +0 -0
  113. {docling-2.40.0 → docling-2.42.0}/setup.cfg +0 -0
  114. {docling-2.40.0 → docling-2.42.0}/tests/test_asr_pipeline.py +0 -0
  115. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_asciidoc.py +0 -0
  116. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_csv.py +0 -0
  117. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_json.py +0 -0
  118. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_parse.py +0 -0
  119. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_parse_v2.py +0 -0
  120. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_docling_parse_v4.py +0 -0
  121. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_html.py +0 -0
  122. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_msexcel.py +0 -0
  123. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_msword.py +0 -0
  124. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_pdfium.py +0 -0
  125. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_pptx.py +0 -0
  126. {docling-2.40.0 → docling-2.42.0}/tests/test_backend_webp.py +0 -0
  127. {docling-2.40.0 → docling-2.42.0}/tests/test_cli.py +0 -0
  128. {docling-2.40.0 → docling-2.42.0}/tests/test_code_formula.py +0 -0
  129. {docling-2.40.0 → docling-2.42.0}/tests/test_data_gen_flag.py +0 -0
  130. {docling-2.40.0 → docling-2.42.0}/tests/test_input_doc.py +0 -0
  131. {docling-2.40.0 → docling-2.42.0}/tests/test_invalid_input.py +0 -0
  132. {docling-2.40.0 → docling-2.42.0}/tests/test_options.py +0 -0
  133. {docling-2.40.0 → docling-2.42.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.40.0
3
+ Version: 2.42.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
30
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.6.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
50
50
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
+ Requires-Dist: accelerate<2,>=1.0.0
53
54
  Provides-Extra: tesserocr
54
55
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
55
56
  Provides-Extra: ocrmac
@@ -62,7 +63,7 @@ Provides-Extra: rapidocr
62
63
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
64
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
64
65
  Provides-Extra: asr
65
- Requires-Dist: openai-whisper>=20240930; extra == "asr"
66
+ Requires-Dist: openai-whisper>=20250625; extra == "asr"
66
67
  Dynamic: license-file
67
68
 
68
69
  <p align="center">
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
260
260
  the fraction object
261
261
  """
262
262
  c_dict = self.process_children_dict(elm)
263
- pr = c_dict["fPr"]
263
+ pr = c_dict.get("fPr")
264
+ if pr is None:
265
+ # Handle missing fPr element gracefully
266
+ _log.debug("Missing fPr element in fraction, using default formatting")
267
+ latex_s = F_DEFAULT
268
+ return latex_s.format(
269
+ num=c_dict.get("num"),
270
+ den=c_dict.get("den"),
271
+ )
264
272
  latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
265
273
  return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
266
274
 
@@ -379,6 +379,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
379
379
  else:
380
380
  _log.debug(f"list-item has no text: {element}")
381
381
 
382
+ @staticmethod
383
+ def _get_cell_spans(cell: Tag) -> tuple[int, int]:
384
+ """Extract colspan and rowspan values from a table cell tag.
385
+
386
+ This function retrieves the 'colspan' and 'rowspan' attributes from a given
387
+ table cell tag.
388
+ If the attribute does not exist or it is not numeric, it defaults to 1.
389
+ """
390
+ raw_spans: tuple[str, str] = (
391
+ str(cell.get("colspan", "1")),
392
+ str(cell.get("rowspan", "1")),
393
+ )
394
+ int_spans: tuple[int, int] = (
395
+ int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
396
+ int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
397
+ )
398
+
399
+ return int_spans
400
+
382
401
  @staticmethod
383
402
  def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
384
403
  nested_tables = element.find("table")
@@ -398,10 +417,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
398
417
  if not isinstance(row, Tag):
399
418
  continue
400
419
  cell_tag = cast(Tag, cell)
401
- val = cell_tag.get("colspan", "1")
402
- colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
403
- col_count += colspan
404
- if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
420
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
421
+ col_count += col_span
422
+ if cell_tag.name == "td" or row_span == 1:
405
423
  is_row_header = False
406
424
  num_cols = max(num_cols, col_count)
407
425
  if not is_row_header:
@@ -428,10 +446,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
428
446
  row_header = True
429
447
  for html_cell in cells:
430
448
  if isinstance(html_cell, Tag):
449
+ _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
431
450
  if html_cell.name == "td":
432
451
  col_header = False
433
452
  row_header = False
434
- elif html_cell.get("rowspan") is None:
453
+ elif row_span == 1:
435
454
  row_header = False
436
455
  if not row_header:
437
456
  row_idx += 1
@@ -456,18 +475,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
456
475
  text = html_cell.text
457
476
 
458
477
  # label = html_cell.name
459
- col_val = html_cell.get("colspan", "1")
460
- col_span = (
461
- int(col_val)
462
- if isinstance(col_val, str) and col_val.isnumeric()
463
- else 1
464
- )
465
- row_val = html_cell.get("rowspan", "1")
466
- row_span = (
467
- int(row_val)
468
- if isinstance(row_val, str) and row_val.isnumeric()
469
- else 1
470
- )
478
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
471
479
  if row_header:
472
480
  row_span -= 1
473
481
  while (
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
93
93
 
94
94
  # Initialize the root of the document hierarchy
95
95
  self.root: Optional[NodeItem] = None
96
-
97
- self.valid = False
96
+ self.hlevel: int = 0
97
+ self.valid: bool = False
98
98
  try:
99
99
  if isinstance(self.path_or_stream, BytesIO):
100
100
  self.path_or_stream.seek(0)
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
147
147
  binary_hash=self.document_hash,
148
148
  )
149
149
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
150
+ self.hlevel = 0
150
151
 
151
152
  # Get metadata XML components
152
153
  xml_components: XMLComponents = self._parse_metadata()
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
304
305
  title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
305
306
  if not text:
306
307
  continue
307
- parent = doc.add_heading(parent=self.root, text=title)
308
+ parent = doc.add_heading(
309
+ parent=self.root, text=title, level=self.hlevel + 1
310
+ )
308
311
  doc.add_text(
309
312
  parent=parent,
310
313
  text=text,
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
637
640
  elif child.tag == "ack":
638
641
  text = DEFAULT_HEADER_ACKNOWLEDGMENTS
639
642
  if text:
640
- new_parent = doc.add_heading(text=text, parent=parent)
643
+ self.hlevel += 1
644
+ new_parent = doc.add_heading(
645
+ text=text, parent=parent, level=self.hlevel
646
+ )
641
647
  elif child.tag == "list":
642
648
  new_parent = doc.add_group(
643
649
  label=GroupLabel.LIST, name="list", parent=parent
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
694
700
  new_text = self._walk_linear(doc, new_parent, child)
695
701
  if not (node.getparent().tag == "p" and node.tag in flush_tags):
696
702
  node_text += new_text
703
+ if child.tag in ("sec", "ack") and text:
704
+ self.hlevel -= 1
697
705
 
698
706
  # pick up the tail text
699
707
  node_text += child.tail.replace("\n", " ") if child.tail else ""
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
22
22
  verbose=True,
23
23
  timestamps=True,
24
24
  word_timestamps=True,
25
- temperatue=0.0,
25
+ temperature=0.0,
26
26
  max_new_tokens=256,
27
27
  max_time_chunk=30.0,
28
28
  )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
33
33
  verbose=True,
34
34
  timestamps=True,
35
35
  word_timestamps=True,
36
- temperatue=0.0,
36
+ temperature=0.0,
37
37
  max_new_tokens=256,
38
38
  max_time_chunk=30.0,
39
39
  )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
44
44
  verbose=True,
45
45
  timestamps=True,
46
46
  word_timestamps=True,
47
- temperatue=0.0,
47
+ temperature=0.0,
48
48
  max_new_tokens=256,
49
49
  max_time_chunk=30.0,
50
50
  )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
55
55
  verbose=True,
56
56
  timestamps=True,
57
57
  word_timestamps=True,
58
- temperatue=0.0,
58
+ temperature=0.0,
59
59
  max_new_tokens=256,
60
60
  max_time_chunk=30.0,
61
61
  )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
66
66
  verbose=True,
67
67
  timestamps=True,
68
68
  word_timestamps=True,
69
- temperatue=0.0,
69
+ temperature=0.0,
70
70
  max_new_tokens=256,
71
71
  max_time_chunk=30.0,
72
72
  )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
77
77
  verbose=True,
78
78
  timestamps=True,
79
79
  word_timestamps=True,
80
- temperatue=0.0,
80
+ temperature=0.0,
81
81
  max_new_tokens=256,
82
82
  max_time_chunk=30.0,
83
83
  )
@@ -12,6 +12,7 @@ from docling_core.types.doc import (
12
12
  Size,
13
13
  TableCell,
14
14
  )
15
+ from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
15
16
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
16
17
  from docling_core.types.io import (
17
18
  DocumentStream,
@@ -19,7 +20,14 @@ from docling_core.types.io import (
19
20
 
20
21
  # DO NOT REMOVE; explicitly exposed from this location
21
22
  from PIL.Image import Image
22
- from pydantic import BaseModel, ConfigDict, Field, computed_field
23
+ from pydantic import (
24
+ BaseModel,
25
+ ConfigDict,
26
+ Field,
27
+ FieldSerializationInfo,
28
+ computed_field,
29
+ field_serializer,
30
+ )
23
31
 
24
32
  if TYPE_CHECKING:
25
33
  from docling.backend.pdf_backend import PdfPageBackend
@@ -142,6 +150,10 @@ class Cluster(BaseModel):
142
150
  cells: List[TextCell] = []
143
151
  children: List["Cluster"] = [] # Add child cluster support
144
152
 
153
+ @field_serializer("confidence")
154
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
155
+ return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
156
+
145
157
 
146
158
  class BasePageElement(BaseModel):
147
159
  label: DocItemLabel
@@ -194,6 +206,16 @@ class FigureElement(BasePageElement):
194
206
  predicted_class: Optional[str] = None
195
207
  confidence: Optional[float] = None
196
208
 
209
+ @field_serializer("confidence")
210
+ def _serialize(
211
+ self, value: Optional[float], info: FieldSerializationInfo
212
+ ) -> Optional[float]:
213
+ return (
214
+ round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
215
+ if value is not None
216
+ else None
217
+ )
218
+
197
219
 
198
220
  class FigureClassificationPrediction(BaseModel):
199
221
  figure_count: int = 0
@@ -0,0 +1,90 @@
1
+ import logging
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from docling.datamodel.accelerator_options import AcceleratorDevice
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
+ class LayoutModelConfig(BaseModel):
14
+ name: str
15
+ repo_id: str
16
+ revision: str
17
+ model_path: str
18
+ supported_devices: list[AcceleratorDevice] = [
19
+ AcceleratorDevice.CPU,
20
+ AcceleratorDevice.CUDA,
21
+ AcceleratorDevice.MPS,
22
+ ]
23
+
24
+ @property
25
+ def model_repo_folder(self) -> str:
26
+ return self.repo_id.replace("/", "--")
27
+
28
+
29
+ # HuggingFace Layout Models
30
+
31
+ # Default Docling Layout Model
32
+ DOCLING_LAYOUT_V2 = LayoutModelConfig(
33
+ name="docling_layout_v2",
34
+ repo_id="ds4sd/docling-layout-old",
35
+ revision="main",
36
+ model_path="",
37
+ )
38
+
39
+ DOCLING_LAYOUT_HERON = LayoutModelConfig(
40
+ name="docling_layout_heron",
41
+ repo_id="ds4sd/docling-layout-heron",
42
+ revision="main",
43
+ model_path="",
44
+ )
45
+
46
+ DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
47
+ name="docling_layout_heron_101",
48
+ repo_id="ds4sd/docling-layout-heron-101",
49
+ revision="main",
50
+ model_path="",
51
+ )
52
+
53
+ DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
54
+ name="docling_layout_egret_medium",
55
+ repo_id="ds4sd/docling-layout-egret-medium",
56
+ revision="main",
57
+ model_path="",
58
+ )
59
+
60
+ DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
61
+ name="docling_layout_egret_large",
62
+ repo_id="ds4sd/docling-layout-egret-large",
63
+ revision="main",
64
+ model_path="",
65
+ )
66
+
67
+ DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
68
+ name="docling_layout_egret_xlarge",
69
+ repo_id="ds4sd/docling-layout-egret-xlarge",
70
+ revision="main",
71
+ model_path="",
72
+ )
73
+
74
+ # Example for a hypothetical alternative model
75
+ # ALTERNATIVE_LAYOUT = LayoutModelConfig(
76
+ # name="alternative_layout",
77
+ # repo_id="someorg/alternative-layout",
78
+ # revision="main",
79
+ # model_path="model_artifacts/layout_alt",
80
+ # )
81
+
82
+
83
+ class LayoutModelType(str, Enum):
84
+ DOCLING_LAYOUT_V2 = "docling_layout_v2"
85
+ DOCLING_LAYOUT_HERON = "docling_layout_heron"
86
+ DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
87
+ DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
88
+ DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
89
+ DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
90
+ # ALTERNATIVE_LAYOUT = "alternative_layout"
@@ -16,6 +16,15 @@ from docling.datamodel import asr_model_specs
16
16
 
17
17
  # Import the following for backwards compatibility
18
18
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
19
+ from docling.datamodel.layout_model_specs import (
20
+ DOCLING_LAYOUT_EGRET_LARGE,
21
+ DOCLING_LAYOUT_EGRET_MEDIUM,
22
+ DOCLING_LAYOUT_EGRET_XLARGE,
23
+ DOCLING_LAYOUT_HERON,
24
+ DOCLING_LAYOUT_HERON_101,
25
+ DOCLING_LAYOUT_V2,
26
+ LayoutModelConfig,
27
+ )
19
28
  from docling.datamodel.pipeline_options_asr_model import (
20
29
  InlineAsrOptions,
21
30
  )
@@ -208,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
208
217
 
209
218
  # GraniteVision
210
219
  granite_picture_description = PictureDescriptionVlmOptions(
211
- repo_id="ibm-granite/granite-vision-3.2-2b-preview",
220
+ repo_id="ibm-granite/granite-vision-3.3-2b",
212
221
  prompt="What is shown in this image?",
213
222
  )
214
223
 
@@ -270,6 +279,10 @@ class LayoutOptions(BaseModel):
270
279
  """Options for layout processing."""
271
280
 
272
281
  create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
282
+ keep_empty_clusters: bool = (
283
+ False # Whether to keep clusters that contain no text cells
284
+ )
285
+ model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
273
286
 
274
287
 
275
288
  class AsrPipelineOptions(PipelineOptions):
@@ -1,6 +1,7 @@
1
1
  from enum import Enum
2
- from typing import Any, Dict, List, Literal, Optional, Union
2
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
3
3
 
4
+ from docling_core.types.doc.page import SegmentedPage
4
5
  from pydantic import AnyUrl, BaseModel
5
6
  from typing_extensions import deprecated
6
7
 
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
9
10
 
10
11
  class BaseVlmOptions(BaseModel):
11
12
  kind: str
12
- prompt: str
13
+ prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
13
14
  scale: float = 2.0
14
15
  max_size: Optional[int] = None
16
+ temperature: float = 0.0
15
17
 
16
18
 
17
19
  class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
29
31
  AUTOMODEL = "automodel"
30
32
  AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
31
33
  AUTOMODEL_CAUSALLM = "automodel-causallm"
34
+ AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
35
+
36
+
37
+ class TransformersPromptStyle(str, Enum):
38
+ CHAT = "chat"
39
+ RAW = "raw"
32
40
 
33
41
 
34
42
  class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
42
50
 
43
51
  inference_framework: InferenceFramework
44
52
  transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
53
+ transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
45
54
  response_format: ResponseFormat
46
55
 
47
56
  torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
51
60
  AcceleratorDevice.MPS,
52
61
  ]
53
62
 
54
- temperature: float = 0.0
55
63
  stop_strings: List[str] = []
56
64
  extra_generation_config: Dict[str, Any] = {}
57
65
 
@@ -1,6 +1,7 @@
1
1
  import hashlib
2
2
  import logging
3
3
  import sys
4
+ import threading
4
5
  import time
5
6
  from collections.abc import Iterable, Iterator
6
7
  from functools import partial
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
49
50
  from docling.utils.utils import chunkify
50
51
 
51
52
  _log = logging.getLogger(__name__)
53
+ _PIPELINE_CACHE_LOCK = threading.Lock()
52
54
 
53
55
 
54
56
  class FormatOption(BaseModel):
@@ -315,17 +317,18 @@ class DocumentConverter:
315
317
  # Use a composite key to cache pipelines
316
318
  cache_key = (pipeline_class, options_hash)
317
319
 
318
- if cache_key not in self.initialized_pipelines:
319
- _log.info(
320
- f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
321
- )
322
- self.initialized_pipelines[cache_key] = pipeline_class(
323
- pipeline_options=pipeline_options
324
- )
325
- else:
326
- _log.debug(
327
- f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
328
- )
320
+ with _PIPELINE_CACHE_LOCK:
321
+ if cache_key not in self.initialized_pipelines:
322
+ _log.info(
323
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
324
+ )
325
+ self.initialized_pipelines[cache_key] = pipeline_class(
326
+ pipeline_options=pipeline_options
327
+ )
328
+ else:
329
+ _log.debug(
330
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
331
+ )
329
332
 
330
333
  return self.initialized_pipelines[cache_key]
331
334
 
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
29
29
 
30
30
  self.timeout = self.vlm_options.timeout
31
31
  self.concurrency = self.vlm_options.concurrency
32
- self.prompt_content = (
33
- f"This is a page from a document.\n{self.vlm_options.prompt}"
34
- )
35
32
  self.params = {
36
33
  **self.vlm_options.params,
37
- "temperature": 0,
34
+ "temperature": self.vlm_options.temperature,
38
35
  }
39
36
 
40
37
  def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
56
53
  if hi_res_image.mode != "RGB":
57
54
  hi_res_image = hi_res_image.convert("RGB")
58
55
 
56
+ if callable(self.vlm_options.prompt):
57
+ prompt = self.vlm_options.prompt(page.parsed_page)
58
+ else:
59
+ prompt = self.vlm_options.prompt
60
+
59
61
  page_tags = api_image_request(
60
62
  image=hi_res_image,
61
- prompt=self.prompt_content,
63
+ prompt=prompt,
62
64
  url=self.vlm_options.url,
63
65
  timeout=self.timeout,
64
66
  headers=self.vlm_options.headers,
@@ -14,7 +14,8 @@ from PIL import Image
14
14
  from pydantic import BaseModel
15
15
 
16
16
  from docling.datamodel.accelerator_options import AcceleratorOptions
17
- from docling.models.base_model import BaseEnrichmentModel
17
+ from docling.datamodel.base_models import ItemAndImageEnrichmentElement
18
+ from docling.models.base_model import BaseItemAndImageEnrichmentModel
18
19
  from docling.models.utils.hf_model_download import download_hf_model
19
20
  from docling.utils.accelerator_utils import decide_device
20
21
 
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
32
33
  kind: Literal["document_picture_classifier"] = "document_picture_classifier"
33
34
 
34
35
 
35
- class DocumentPictureClassifier(BaseEnrichmentModel):
36
+ class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
36
37
  """
37
38
  A model for classifying pictures in documents.
38
39
 
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
135
136
  def __call__(
136
137
  self,
137
138
  doc: DoclingDocument,
138
- element_batch: Iterable[NodeItem],
139
+ element_batch: Iterable[ItemAndImageEnrichmentElement],
139
140
  ) -> Iterable[NodeItem]:
140
141
  """
141
142
  Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
144
145
  ----------
145
146
  doc : DoclingDocument
146
147
  The document containing the elements to be processed.
147
- element_batch : Iterable[NodeItem]
148
+ element_batch : Iterable[ItemAndImageEnrichmentElement]
148
149
  A batch of pictures to classify.
149
150
 
150
151
  Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
155
156
  """
156
157
  if not self.enabled:
157
158
  for element in element_batch:
158
- yield element
159
+ yield element.item
159
160
  return
160
161
 
161
162
  images: List[Union[Image.Image, np.ndarray]] = []
162
163
  elements: List[PictureItem] = []
163
164
  for el in element_batch:
164
- assert isinstance(el, PictureItem)
165
- elements.append(el)
166
- img = el.get_image(doc)
167
- assert img is not None
168
- images.append(img)
165
+ assert isinstance(el.item, PictureItem)
166
+ elements.append(el.item)
167
+ images.append(el.image)
169
168
 
170
169
  outputs = self.document_picture_classifier.predict(images)
171
170
 
172
- for element, output in zip(elements, outputs):
173
- element.annotations.append(
171
+ for item, output in zip(elements, outputs):
172
+ item.annotations.append(
174
173
  PictureClassificationData(
175
174
  provenance="DocumentPictureClassifier",
176
175
  predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
183
182
  )
184
183
  )
185
184
 
186
- yield element
185
+ yield item
@@ -12,6 +12,7 @@ from PIL import Image
12
12
  from docling.datamodel.accelerator_options import AcceleratorOptions
13
13
  from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
14
14
  from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
15
16
  from docling.datamodel.pipeline_options import LayoutOptions
16
17
  from docling.datamodel.settings import settings
17
18
  from docling.models.base_model import BasePageModel
@@ -25,9 +26,6 @@ _log = logging.getLogger(__name__)
25
26
 
26
27
 
27
28
  class LayoutModel(BasePageModel):
28
- _model_repo_folder = "ds4sd--docling-models"
29
- _model_path = "model_artifacts/layout"
30
-
31
29
  TEXT_ELEM_LABELS = [
32
30
  DocItemLabel.TEXT,
33
31
  DocItemLabel.FOOTNOTE,
@@ -59,25 +57,28 @@ class LayoutModel(BasePageModel):
59
57
  self.options = options
60
58
 
61
59
  device = decide_device(accelerator_options.device)
60
+ layout_model_config = options.model_spec
61
+ model_repo_folder = layout_model_config.model_repo_folder
62
+ model_path = layout_model_config.model_path
62
63
 
63
64
  if artifacts_path is None:
64
- artifacts_path = self.download_models() / self._model_path
65
+ artifacts_path = (
66
+ self.download_models(layout_model_config=layout_model_config)
67
+ / model_path
68
+ )
65
69
  else:
66
- # will become the default in the future
67
- if (artifacts_path / self._model_repo_folder).exists():
68
- artifacts_path = (
69
- artifacts_path / self._model_repo_folder / self._model_path
70
- )
71
- elif (artifacts_path / self._model_path).exists():
70
+ if (artifacts_path / model_repo_folder).exists():
71
+ artifacts_path = artifacts_path / model_repo_folder / model_path
72
+ elif (artifacts_path / model_path).exists():
72
73
  warnings.warn(
73
74
  "The usage of artifacts_path containing directly "
74
- f"{self._model_path} is deprecated. Please point "
75
+ f"{model_path} is deprecated. Please point "
75
76
  "the artifacts_path to the parent containing "
76
- f"the {self._model_repo_folder} folder.",
77
+ f"the {model_repo_folder} folder.",
77
78
  DeprecationWarning,
78
79
  stacklevel=3,
79
80
  )
80
- artifacts_path = artifacts_path / self._model_path
81
+ artifacts_path = artifacts_path / model_path
81
82
 
82
83
  self.layout_predictor = LayoutPredictor(
83
84
  artifact_path=str(artifacts_path),
@@ -90,10 +91,11 @@ class LayoutModel(BasePageModel):
90
91
  local_dir: Optional[Path] = None,
91
92
  force: bool = False,
92
93
  progress: bool = False,
94
+ layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
93
95
  ) -> Path:
94
96
  return download_hf_model(
95
- repo_id="ds4sd/docling-models",
96
- revision="v2.2.0",
97
+ repo_id=layout_model_config.repo_id,
98
+ revision=layout_model_config.revision,
97
99
  local_dir=local_dir,
98
100
  force=force,
99
101
  progress=progress,