docling 2.48.0__tar.gz → 2.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {docling-2.48.0 → docling-2.50.0}/PKG-INFO +4 -2
  2. {docling-2.48.0 → docling-2.50.0}/docling/backend/html_backend.py +3 -2
  3. {docling-2.48.0 → docling-2.50.0}/docling/backend/msexcel_backend.py +15 -1
  4. {docling-2.48.0 → docling-2.50.0}/docling/backend/pypdfium2_backend.py +24 -2
  5. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/base_models.py +13 -1
  6. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/document.py +5 -3
  7. docling-2.50.0/docling/datamodel/extraction.py +39 -0
  8. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/pipeline_options.py +11 -5
  9. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/vlm_model_specs.py +17 -0
  10. {docling-2.48.0 → docling-2.50.0}/docling/document_converter.py +3 -6
  11. docling-2.50.0/docling/document_extractor.py +325 -0
  12. {docling-2.48.0 → docling-2.50.0}/docling/models/layout_model.py +3 -3
  13. {docling-2.48.0 → docling-2.50.0}/docling/models/page_preprocessing_model.py +1 -1
  14. {docling-2.48.0 → docling-2.50.0}/docling/models/rapid_ocr_model.py +1 -0
  15. {docling-2.48.0 → docling-2.50.0}/docling/models/table_structure_model.py +1 -1
  16. docling-2.50.0/docling/models/vlm_models_inline/nuextract_transformers_model.py +290 -0
  17. docling-2.50.0/docling/pipeline/base_extraction_pipeline.py +58 -0
  18. docling-2.50.0/docling/pipeline/extraction_vlm_pipeline.py +204 -0
  19. {docling-2.48.0 → docling-2.50.0}/docling/utils/model_downloader.py +2 -1
  20. {docling-2.48.0 → docling-2.50.0}/docling.egg-info/PKG-INFO +4 -2
  21. {docling-2.48.0 → docling-2.50.0}/docling.egg-info/SOURCES.txt +6 -0
  22. {docling-2.48.0 → docling-2.50.0}/docling.egg-info/requires.txt +3 -1
  23. {docling-2.48.0 → docling-2.50.0}/pyproject.toml +5 -2
  24. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_msexcel.py +3 -2
  25. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_pdfium.py +19 -0
  26. {docling-2.48.0 → docling-2.50.0}/tests/test_e2e_conversion.py +9 -1
  27. {docling-2.48.0 → docling-2.50.0}/tests/test_e2e_ocr_conversion.py +2 -1
  28. docling-2.50.0/tests/test_extraction.py +108 -0
  29. {docling-2.48.0 → docling-2.50.0}/LICENSE +0 -0
  30. {docling-2.48.0 → docling-2.50.0}/README.md +0 -0
  31. {docling-2.48.0 → docling-2.50.0}/docling/__init__.py +0 -0
  32. {docling-2.48.0 → docling-2.50.0}/docling/backend/__init__.py +0 -0
  33. {docling-2.48.0 → docling-2.50.0}/docling/backend/abstract_backend.py +0 -0
  34. {docling-2.48.0 → docling-2.50.0}/docling/backend/asciidoc_backend.py +0 -0
  35. {docling-2.48.0 → docling-2.50.0}/docling/backend/csv_backend.py +0 -0
  36. {docling-2.48.0 → docling-2.50.0}/docling/backend/docling_parse_backend.py +0 -0
  37. {docling-2.48.0 → docling-2.50.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  38. {docling-2.48.0 → docling-2.50.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  39. {docling-2.48.0 → docling-2.50.0}/docling/backend/docx/__init__.py +0 -0
  40. {docling-2.48.0 → docling-2.50.0}/docling/backend/docx/latex/__init__.py +0 -0
  41. {docling-2.48.0 → docling-2.50.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  42. {docling-2.48.0 → docling-2.50.0}/docling/backend/docx/latex/omml.py +0 -0
  43. {docling-2.48.0 → docling-2.50.0}/docling/backend/json/__init__.py +0 -0
  44. {docling-2.48.0 → docling-2.50.0}/docling/backend/json/docling_json_backend.py +0 -0
  45. {docling-2.48.0 → docling-2.50.0}/docling/backend/md_backend.py +0 -0
  46. {docling-2.48.0 → docling-2.50.0}/docling/backend/mets_gbs_backend.py +0 -0
  47. {docling-2.48.0 → docling-2.50.0}/docling/backend/mspowerpoint_backend.py +0 -0
  48. {docling-2.48.0 → docling-2.50.0}/docling/backend/msword_backend.py +0 -0
  49. {docling-2.48.0 → docling-2.50.0}/docling/backend/noop_backend.py +0 -0
  50. {docling-2.48.0 → docling-2.50.0}/docling/backend/pdf_backend.py +0 -0
  51. {docling-2.48.0 → docling-2.50.0}/docling/backend/xml/__init__.py +0 -0
  52. {docling-2.48.0 → docling-2.50.0}/docling/backend/xml/jats_backend.py +0 -0
  53. {docling-2.48.0 → docling-2.50.0}/docling/backend/xml/uspto_backend.py +0 -0
  54. {docling-2.48.0 → docling-2.50.0}/docling/chunking/__init__.py +0 -0
  55. {docling-2.48.0 → docling-2.50.0}/docling/cli/__init__.py +0 -0
  56. {docling-2.48.0 → docling-2.50.0}/docling/cli/main.py +0 -0
  57. {docling-2.48.0 → docling-2.50.0}/docling/cli/models.py +0 -0
  58. {docling-2.48.0 → docling-2.50.0}/docling/cli/tools.py +0 -0
  59. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/__init__.py +0 -0
  60. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/accelerator_options.py +0 -0
  61. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/asr_model_specs.py +0 -0
  62. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/layout_model_specs.py +0 -0
  63. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  64. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  65. {docling-2.48.0 → docling-2.50.0}/docling/datamodel/settings.py +0 -0
  66. {docling-2.48.0 → docling-2.50.0}/docling/exceptions.py +0 -0
  67. {docling-2.48.0 → docling-2.50.0}/docling/models/__init__.py +0 -0
  68. {docling-2.48.0 → docling-2.50.0}/docling/models/api_vlm_model.py +0 -0
  69. {docling-2.48.0 → docling-2.50.0}/docling/models/base_model.py +0 -0
  70. {docling-2.48.0 → docling-2.50.0}/docling/models/base_ocr_model.py +0 -0
  71. {docling-2.48.0 → docling-2.50.0}/docling/models/code_formula_model.py +0 -0
  72. {docling-2.48.0 → docling-2.50.0}/docling/models/document_picture_classifier.py +0 -0
  73. {docling-2.48.0 → docling-2.50.0}/docling/models/easyocr_model.py +0 -0
  74. {docling-2.48.0 → docling-2.50.0}/docling/models/factories/__init__.py +0 -0
  75. {docling-2.48.0 → docling-2.50.0}/docling/models/factories/base_factory.py +0 -0
  76. {docling-2.48.0 → docling-2.50.0}/docling/models/factories/ocr_factory.py +0 -0
  77. {docling-2.48.0 → docling-2.50.0}/docling/models/factories/picture_description_factory.py +0 -0
  78. {docling-2.48.0 → docling-2.50.0}/docling/models/ocr_mac_model.py +0 -0
  79. {docling-2.48.0 → docling-2.50.0}/docling/models/page_assemble_model.py +0 -0
  80. {docling-2.48.0 → docling-2.50.0}/docling/models/picture_description_api_model.py +0 -0
  81. {docling-2.48.0 → docling-2.50.0}/docling/models/picture_description_base_model.py +0 -0
  82. {docling-2.48.0 → docling-2.50.0}/docling/models/picture_description_vlm_model.py +0 -0
  83. {docling-2.48.0 → docling-2.50.0}/docling/models/plugins/__init__.py +0 -0
  84. {docling-2.48.0 → docling-2.50.0}/docling/models/plugins/defaults.py +0 -0
  85. {docling-2.48.0 → docling-2.50.0}/docling/models/readingorder_model.py +0 -0
  86. {docling-2.48.0 → docling-2.50.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  87. {docling-2.48.0 → docling-2.50.0}/docling/models/tesseract_ocr_model.py +0 -0
  88. {docling-2.48.0 → docling-2.50.0}/docling/models/utils/__init__.py +0 -0
  89. {docling-2.48.0 → docling-2.50.0}/docling/models/utils/hf_model_download.py +0 -0
  90. {docling-2.48.0 → docling-2.50.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  91. {docling-2.48.0 → docling-2.50.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  92. {docling-2.48.0 → docling-2.50.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  93. {docling-2.48.0 → docling-2.50.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  94. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/__init__.py +0 -0
  95. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/asr_pipeline.py +0 -0
  96. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/base_pipeline.py +0 -0
  97. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/simple_pipeline.py +0 -0
  98. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  99. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  100. {docling-2.48.0 → docling-2.50.0}/docling/pipeline/vlm_pipeline.py +0 -0
  101. {docling-2.48.0 → docling-2.50.0}/docling/py.typed +0 -0
  102. {docling-2.48.0 → docling-2.50.0}/docling/utils/__init__.py +0 -0
  103. {docling-2.48.0 → docling-2.50.0}/docling/utils/accelerator_utils.py +0 -0
  104. {docling-2.48.0 → docling-2.50.0}/docling/utils/api_image_request.py +0 -0
  105. {docling-2.48.0 → docling-2.50.0}/docling/utils/export.py +0 -0
  106. {docling-2.48.0 → docling-2.50.0}/docling/utils/glm_utils.py +0 -0
  107. {docling-2.48.0 → docling-2.50.0}/docling/utils/layout_postprocessor.py +0 -0
  108. {docling-2.48.0 → docling-2.50.0}/docling/utils/locks.py +0 -0
  109. {docling-2.48.0 → docling-2.50.0}/docling/utils/ocr_utils.py +0 -0
  110. {docling-2.48.0 → docling-2.50.0}/docling/utils/orientation.py +0 -0
  111. {docling-2.48.0 → docling-2.50.0}/docling/utils/profiling.py +0 -0
  112. {docling-2.48.0 → docling-2.50.0}/docling/utils/utils.py +0 -0
  113. {docling-2.48.0 → docling-2.50.0}/docling/utils/visualization.py +0 -0
  114. {docling-2.48.0 → docling-2.50.0}/docling.egg-info/dependency_links.txt +0 -0
  115. {docling-2.48.0 → docling-2.50.0}/docling.egg-info/entry_points.txt +0 -0
  116. {docling-2.48.0 → docling-2.50.0}/docling.egg-info/top_level.txt +0 -0
  117. {docling-2.48.0 → docling-2.50.0}/setup.cfg +0 -0
  118. {docling-2.48.0 → docling-2.50.0}/tests/test_asr_pipeline.py +0 -0
  119. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_asciidoc.py +0 -0
  120. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_csv.py +0 -0
  121. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_docling_json.py +0 -0
  122. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_docling_parse.py +0 -0
  123. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_docling_parse_v2.py +0 -0
  124. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_docling_parse_v4.py +0 -0
  125. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_html.py +0 -0
  126. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_jats.py +0 -0
  127. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_markdown.py +0 -0
  128. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_mets_gbs.py +0 -0
  129. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_msword.py +0 -0
  130. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_patent_uspto.py +0 -0
  131. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_pptx.py +0 -0
  132. {docling-2.48.0 → docling-2.50.0}/tests/test_backend_webp.py +0 -0
  133. {docling-2.48.0 → docling-2.50.0}/tests/test_cli.py +0 -0
  134. {docling-2.48.0 → docling-2.50.0}/tests/test_code_formula.py +0 -0
  135. {docling-2.48.0 → docling-2.50.0}/tests/test_data_gen_flag.py +0 -0
  136. {docling-2.48.0 → docling-2.50.0}/tests/test_document_picture_classifier.py +0 -0
  137. {docling-2.48.0 → docling-2.50.0}/tests/test_input_doc.py +0 -0
  138. {docling-2.48.0 → docling-2.50.0}/tests/test_interfaces.py +0 -0
  139. {docling-2.48.0 → docling-2.50.0}/tests/test_invalid_input.py +0 -0
  140. {docling-2.48.0 → docling-2.50.0}/tests/test_legacy_format_transform.py +0 -0
  141. {docling-2.48.0 → docling-2.50.0}/tests/test_ocr_utils.py +0 -0
  142. {docling-2.48.0 → docling-2.50.0}/tests/test_options.py +0 -0
  143. {docling-2.48.0 → docling-2.50.0}/tests/test_settings_load.py +0 -0
  144. {docling-2.48.0 → docling-2.50.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.48.0
3
+ Version: 2.50.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -28,7 +28,7 @@ License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
30
  Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
- Requires-Dist: docling-ibm-models<4,>=3.9.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -51,6 +51,7 @@ Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
53
  Requires-Dist: accelerate<2,>=1.0.0
54
+ Requires-Dist: polyfactory>=2.22.2
54
55
  Provides-Extra: tesserocr
55
56
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
56
57
  Provides-Extra: ocrmac
@@ -60,6 +61,7 @@ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
61
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
62
  Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
63
  Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
64
+ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
63
65
  Provides-Extra: rapidocr
64
66
  Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
65
67
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -467,13 +467,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
467
467
 
468
468
  @contextmanager
469
469
  def _use_hyperlink(self, tag: Tag):
470
+ old_hyperlink: Union[AnyUrl, Path, None] = None
471
+ new_hyperlink: Union[AnyUrl, Path, None] = None
470
472
  this_href = tag.get("href")
471
473
  if this_href is None:
472
474
  yield None
473
475
  else:
474
476
  if isinstance(this_href, str) and this_href:
475
- old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
476
- new_hyperlink: Union[AnyUrl, Path, None] = None
477
+ old_hyperlink = self.hyperlink
477
478
  if self.original_url is not None:
478
479
  this_href = urljoin(str(self.original_url), str(this_href))
479
480
  # ugly fix for relative links since pydantic does not support them.
@@ -1,10 +1,11 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Union, cast
4
+ from typing import Any, Optional, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
8
+ ContentLayer,
8
9
  CoordOrigin,
9
10
  DocItem,
10
11
  DoclingDocument,
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
197
198
  parent=None,
198
199
  label=GroupLabel.SECTION,
199
200
  name=f"sheet: {sheet_name}",
201
+ content_layer=self._get_sheet_content_layer(sheet),
200
202
  )
201
203
  doc = self._convert_sheet(doc, sheet)
202
204
  width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
237
239
  """
238
240
 
239
241
  if self.workbook is not None:
242
+ content_layer = self._get_sheet_content_layer(sheet)
240
243
  tables = self._find_data_tables(sheet)
241
244
 
242
245
  for excel_table in tables:
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
282
285
  origin=CoordOrigin.TOPLEFT,
283
286
  ),
284
287
  ),
288
+ content_layer=content_layer,
285
289
  )
286
290
 
287
291
  return doc
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
486
490
  The updated DoclingDocument.
487
491
  """
488
492
  if self.workbook is not None:
493
+ content_layer = self._get_sheet_content_layer(sheet)
489
494
  # Iterate over byte images in the sheet
490
495
  for item in sheet._images: # type: ignore[attr-defined]
491
496
  try:
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
511
516
  anchor, origin=CoordOrigin.TOPLEFT
512
517
  ),
513
518
  ),
519
+ content_layer=content_layer,
514
520
  )
515
521
  except Exception:
516
522
  _log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
536
542
  bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
537
543
 
538
544
  return (right - left, bottom - top)
545
+
546
+ @staticmethod
547
+ def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
548
+ return (
549
+ None
550
+ if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
551
+ else ContentLayer.INVISIBLE
552
+ )
@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
254
254
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
255
255
  AREA_THRESHOLD = 0 # 32 * 32
256
256
  page_size = self.get_size()
257
+ rotation = self._ppage.get_rotation()
258
+
257
259
  with pypdfium2_lock:
258
260
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
259
261
  pos = obj.get_pos()
262
+ if rotation == 90:
263
+ pos = (
264
+ pos[1],
265
+ page_size.height - pos[2],
266
+ pos[3],
267
+ page_size.height - pos[0],
268
+ )
269
+ elif rotation == 180:
270
+ pos = (
271
+ page_size.width - pos[2],
272
+ page_size.height - pos[3],
273
+ page_size.width - pos[0],
274
+ page_size.height - pos[1],
275
+ )
276
+ elif rotation == 270:
277
+ pos = (
278
+ page_size.width - pos[3],
279
+ pos[0],
280
+ page_size.width - pos[1],
281
+ pos[2],
282
+ )
283
+
260
284
  cropbox = BoundingBox.from_tuple(
261
285
  pos, origin=CoordOrigin.BOTTOMLEFT
262
286
  ).to_top_left_origin(page_height=page_size.height)
263
-
264
287
  if cropbox.area() > AREA_THRESHOLD:
265
288
  cropbox = cropbox.scaled(scale=scale)
266
-
267
289
  yield cropbox
268
290
 
269
291
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
@@ -1,7 +1,7 @@
1
1
  import math
2
2
  from collections import defaultdict
3
3
  from enum import Enum
4
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+ from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
5
5
 
6
6
  import numpy as np
7
7
  from docling_core.types.doc import (
@@ -32,6 +32,18 @@ from pydantic import (
32
32
  if TYPE_CHECKING:
33
33
  from docling.backend.pdf_backend import PdfPageBackend
34
34
 
35
+ from docling.backend.abstract_backend import AbstractDocumentBackend
36
+ from docling.datamodel.pipeline_options import PipelineOptions
37
+
38
+
39
+ class BaseFormatOption(BaseModel):
40
+ """Base class for format options used by _DocumentConversionInput."""
41
+
42
+ pipeline_options: Optional[PipelineOptions] = None
43
+ backend: Type[AbstractDocumentBackend]
44
+
45
+ model_config = ConfigDict(arbitrary_types_allowed=True)
46
+
35
47
 
36
48
  class ConversionStatus(str, Enum):
37
49
  PENDING = "pending"
@@ -2,12 +2,13 @@ import csv
2
2
  import logging
3
3
  import re
4
4
  import tarfile
5
- from collections.abc import Iterable
5
+ from collections.abc import Iterable, Mapping
6
6
  from enum import Enum
7
7
  from io import BytesIO
8
8
  from pathlib import Path, PurePath
9
9
  from typing import (
10
10
  TYPE_CHECKING,
11
+ Any,
11
12
  Dict,
12
13
  List,
13
14
  Literal,
@@ -72,7 +73,7 @@ from docling.utils.profiling import ProfilingItem
72
73
  from docling.utils.utils import create_file_hash
73
74
 
74
75
  if TYPE_CHECKING:
75
- from docling.document_converter import FormatOption
76
+ from docling.datamodel.base_models import BaseFormatOption
76
77
 
77
78
  _log = logging.getLogger(__name__)
78
79
 
@@ -238,7 +239,8 @@ class _DocumentConversionInput(BaseModel):
238
239
  limits: Optional[DocumentLimits] = DocumentLimits()
239
240
 
240
241
  def docs(
241
- self, format_options: Dict[InputFormat, "FormatOption"]
242
+ self,
243
+ format_options: Mapping[InputFormat, "BaseFormatOption"],
242
244
  ) -> Iterable[InputDocument]:
243
245
  for item in self.path_or_stream_iterator:
244
246
  obj = (
@@ -0,0 +1,39 @@
1
+ """Data models for document extraction functionality."""
2
+
3
+ from typing import Any, Dict, List, Optional, Type, Union
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem
8
+ from docling.datamodel.document import InputDocument
9
+
10
+
11
+ class ExtractedPageData(BaseModel):
12
+ """Data model for extracted content from a single page."""
13
+
14
+ page_no: int = Field(..., description="1-indexed page number")
15
+ extracted_data: Optional[Dict[str, Any]] = Field(
16
+ None, description="Extracted structured data from the page"
17
+ )
18
+ raw_text: Optional[str] = Field(None, description="Raw extracted text")
19
+ errors: List[str] = Field(
20
+ default_factory=list,
21
+ description="Any errors encountered during extraction for this page",
22
+ )
23
+
24
+
25
+ class ExtractionResult(BaseModel):
26
+ """Result of document extraction."""
27
+
28
+ input: InputDocument
29
+ status: ConversionStatus = ConversionStatus.PENDING
30
+ errors: List[ErrorItem] = []
31
+
32
+ # Pages field - always a list for consistency
33
+ pages: List[ExtractedPageData] = Field(
34
+ default_factory=list, description="Extracted data from each page"
35
+ )
36
+
37
+
38
+ # Type alias for template parameters that can be string, dict, or BaseModel
39
+ ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]
@@ -37,6 +37,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
37
37
  from docling.datamodel.vlm_model_specs import (
38
38
  GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
39
39
  GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
40
+ NU_EXTRACT_2B_TRANSFORMERS,
40
41
  SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
41
42
  SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
42
43
  VlmModelType,
@@ -113,6 +114,7 @@ class RapidOcrOptions(OcrOptions):
113
114
  cls_model_path: Optional[str] = None # same default as rapidocr
114
115
  rec_model_path: Optional[str] = None # same default as rapidocr
115
116
  rec_keys_path: Optional[str] = None # same default as rapidocr
117
+ rec_font_path: Optional[str] = None # same default as rapidocr
116
118
 
117
119
  model_config = ConfigDict(
118
120
  extra="forbid",
@@ -246,12 +248,9 @@ class OcrEngine(str, Enum):
246
248
  RAPIDOCR = "rapidocr"
247
249
 
248
250
 
249
- class PipelineOptions(BaseModel):
251
+ class PipelineOptions(BaseOptions):
250
252
  """Base pipeline options."""
251
253
 
252
- create_legacy_output: bool = (
253
- True # This default will be set to False on a future version of docling
254
- )
255
254
  document_timeout: Optional[float] = None
256
255
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
257
256
  enable_remote_services: bool = False
@@ -284,10 +283,10 @@ class LayoutOptions(BaseModel):
284
283
  keep_empty_clusters: bool = (
285
284
  False # Whether to keep clusters that contain no text cells
286
285
  )
286
+ model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
287
287
  skip_cell_assignment: bool = (
288
288
  False # Skip cell-to-cluster assignment for VLM-only processing
289
289
  )
290
- model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
291
290
 
292
291
 
293
292
  class AsrPipelineOptions(PipelineOptions):
@@ -295,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
295
294
  artifacts_path: Optional[Union[Path, str]] = None
296
295
 
297
296
 
297
+ class VlmExtractionPipelineOptions(PipelineOptions):
298
+ """Options for extraction pipeline."""
299
+
300
+ artifacts_path: Optional[Union[Path, str]] = None
301
+ vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
302
+
303
+
298
304
  class PdfPipelineOptions(PaginatedPipelineOptions):
299
305
  """Options for the PDF pipeline."""
300
306
 
@@ -247,6 +247,23 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
247
247
  temperature=0.0,
248
248
  )
249
249
 
250
+ # NuExtract
251
+ NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
252
+ repo_id="numind/NuExtract-2.0-2B",
253
+ prompt="", # This won't be used, template is passed separately
254
+ torch_dtype="bfloat16",
255
+ inference_framework=InferenceFramework.TRANSFORMERS,
256
+ transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
257
+ response_format=ResponseFormat.PLAINTEXT,
258
+ supported_devices=[
259
+ AcceleratorDevice.CPU,
260
+ AcceleratorDevice.CUDA,
261
+ AcceleratorDevice.MPS,
262
+ ],
263
+ scale=2.0,
264
+ temperature=0.0,
265
+ )
266
+
250
267
 
251
268
  class VlmModelType(str, Enum):
252
269
  SMOLDOCLING = "smoldocling"
@@ -28,6 +28,7 @@ from docling.backend.noop_backend import NoOpBackend
28
28
  from docling.backend.xml.jats_backend import JatsDocumentBackend
29
29
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
30
30
  from docling.datamodel.base_models import (
31
+ BaseFormatOption,
31
32
  ConversionStatus,
32
33
  DoclingComponentType,
33
34
  DocumentStream,
@@ -57,12 +58,8 @@ _log = logging.getLogger(__name__)
57
58
  _PIPELINE_CACHE_LOCK = threading.Lock()
58
59
 
59
60
 
60
- class FormatOption(BaseModel):
61
+ class FormatOption(BaseFormatOption):
61
62
  pipeline_cls: Type[BasePipeline]
62
- pipeline_options: Optional[PipelineOptions] = None
63
- backend: Type[AbstractDocumentBackend]
64
-
65
- model_config = ConfigDict(arbitrary_types_allowed=True)
66
63
 
67
64
  @model_validator(mode="after")
68
65
  def set_optional_field_default(self) -> "FormatOption":
@@ -191,7 +188,7 @@ class DocumentConverter:
191
188
  self.allowed_formats = (
192
189
  allowed_formats if allowed_formats is not None else list(InputFormat)
193
190
  )
194
- self.format_to_options = {
191
+ self.format_to_options: Dict[InputFormat, FormatOption] = {
195
192
  format: (
196
193
  _get_default_option(format=format)
197
194
  if (custom_option := (format_options or {}).get(format)) is None