docling 2.51.0__tar.gz → 2.52.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {docling-2.51.0 → docling-2.52.0}/PKG-INFO +7 -3
  2. {docling-2.51.0 → docling-2.52.0}/README.md +5 -1
  3. {docling-2.51.0 → docling-2.52.0}/docling/cli/main.py +29 -0
  4. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/pipeline_options.py +14 -9
  5. {docling-2.51.0 → docling-2.52.0}/docling/models/base_model.py +27 -2
  6. {docling-2.51.0 → docling-2.52.0}/docling/models/easyocr_model.py +19 -9
  7. {docling-2.51.0 → docling-2.52.0}/docling/models/picture_description_vlm_model.py +1 -1
  8. {docling-2.51.0 → docling-2.52.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -1
  9. {docling-2.51.0 → docling-2.52.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +1 -1
  10. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/asr_pipeline.py +1 -13
  11. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/base_extraction_pipeline.py +17 -3
  12. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/base_pipeline.py +75 -9
  13. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/extraction_vlm_pipeline.py +9 -16
  14. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/simple_pipeline.py +6 -6
  15. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/standard_pdf_pipeline.py +6 -55
  16. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +102 -62
  17. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/vlm_pipeline.py +3 -15
  18. {docling-2.51.0 → docling-2.52.0}/docling.egg-info/PKG-INFO +7 -3
  19. {docling-2.51.0 → docling-2.52.0}/docling.egg-info/requires.txt +1 -1
  20. {docling-2.51.0 → docling-2.52.0}/pyproject.toml +2 -2
  21. {docling-2.51.0 → docling-2.52.0}/LICENSE +0 -0
  22. {docling-2.51.0 → docling-2.52.0}/docling/__init__.py +0 -0
  23. {docling-2.51.0 → docling-2.52.0}/docling/backend/__init__.py +0 -0
  24. {docling-2.51.0 → docling-2.52.0}/docling/backend/abstract_backend.py +0 -0
  25. {docling-2.51.0 → docling-2.52.0}/docling/backend/asciidoc_backend.py +0 -0
  26. {docling-2.51.0 → docling-2.52.0}/docling/backend/csv_backend.py +0 -0
  27. {docling-2.51.0 → docling-2.52.0}/docling/backend/docling_parse_backend.py +0 -0
  28. {docling-2.51.0 → docling-2.52.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  29. {docling-2.51.0 → docling-2.52.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  30. {docling-2.51.0 → docling-2.52.0}/docling/backend/docx/__init__.py +0 -0
  31. {docling-2.51.0 → docling-2.52.0}/docling/backend/docx/latex/__init__.py +0 -0
  32. {docling-2.51.0 → docling-2.52.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  33. {docling-2.51.0 → docling-2.52.0}/docling/backend/docx/latex/omml.py +0 -0
  34. {docling-2.51.0 → docling-2.52.0}/docling/backend/html_backend.py +0 -0
  35. {docling-2.51.0 → docling-2.52.0}/docling/backend/json/__init__.py +0 -0
  36. {docling-2.51.0 → docling-2.52.0}/docling/backend/json/docling_json_backend.py +0 -0
  37. {docling-2.51.0 → docling-2.52.0}/docling/backend/md_backend.py +0 -0
  38. {docling-2.51.0 → docling-2.52.0}/docling/backend/mets_gbs_backend.py +0 -0
  39. {docling-2.51.0 → docling-2.52.0}/docling/backend/msexcel_backend.py +0 -0
  40. {docling-2.51.0 → docling-2.52.0}/docling/backend/mspowerpoint_backend.py +0 -0
  41. {docling-2.51.0 → docling-2.52.0}/docling/backend/msword_backend.py +0 -0
  42. {docling-2.51.0 → docling-2.52.0}/docling/backend/noop_backend.py +0 -0
  43. {docling-2.51.0 → docling-2.52.0}/docling/backend/pdf_backend.py +0 -0
  44. {docling-2.51.0 → docling-2.52.0}/docling/backend/pypdfium2_backend.py +0 -0
  45. {docling-2.51.0 → docling-2.52.0}/docling/backend/xml/__init__.py +0 -0
  46. {docling-2.51.0 → docling-2.52.0}/docling/backend/xml/jats_backend.py +0 -0
  47. {docling-2.51.0 → docling-2.52.0}/docling/backend/xml/uspto_backend.py +0 -0
  48. {docling-2.51.0 → docling-2.52.0}/docling/chunking/__init__.py +0 -0
  49. {docling-2.51.0 → docling-2.52.0}/docling/cli/__init__.py +0 -0
  50. {docling-2.51.0 → docling-2.52.0}/docling/cli/models.py +0 -0
  51. {docling-2.51.0 → docling-2.52.0}/docling/cli/tools.py +0 -0
  52. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/__init__.py +0 -0
  53. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/accelerator_options.py +0 -0
  54. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/asr_model_specs.py +0 -0
  55. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/base_models.py +0 -0
  56. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/document.py +0 -0
  57. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/extraction.py +0 -0
  58. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/layout_model_specs.py +0 -0
  59. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  60. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  61. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/settings.py +0 -0
  62. {docling-2.51.0 → docling-2.52.0}/docling/datamodel/vlm_model_specs.py +0 -0
  63. {docling-2.51.0 → docling-2.52.0}/docling/document_converter.py +0 -0
  64. {docling-2.51.0 → docling-2.52.0}/docling/document_extractor.py +0 -0
  65. {docling-2.51.0 → docling-2.52.0}/docling/exceptions.py +0 -0
  66. {docling-2.51.0 → docling-2.52.0}/docling/models/__init__.py +0 -0
  67. {docling-2.51.0 → docling-2.52.0}/docling/models/api_vlm_model.py +0 -0
  68. {docling-2.51.0 → docling-2.52.0}/docling/models/base_ocr_model.py +0 -0
  69. {docling-2.51.0 → docling-2.52.0}/docling/models/code_formula_model.py +0 -0
  70. {docling-2.51.0 → docling-2.52.0}/docling/models/document_picture_classifier.py +0 -0
  71. {docling-2.51.0 → docling-2.52.0}/docling/models/factories/__init__.py +0 -0
  72. {docling-2.51.0 → docling-2.52.0}/docling/models/factories/base_factory.py +0 -0
  73. {docling-2.51.0 → docling-2.52.0}/docling/models/factories/ocr_factory.py +0 -0
  74. {docling-2.51.0 → docling-2.52.0}/docling/models/factories/picture_description_factory.py +0 -0
  75. {docling-2.51.0 → docling-2.52.0}/docling/models/layout_model.py +0 -0
  76. {docling-2.51.0 → docling-2.52.0}/docling/models/ocr_mac_model.py +0 -0
  77. {docling-2.51.0 → docling-2.52.0}/docling/models/page_assemble_model.py +0 -0
  78. {docling-2.51.0 → docling-2.52.0}/docling/models/page_preprocessing_model.py +0 -0
  79. {docling-2.51.0 → docling-2.52.0}/docling/models/picture_description_api_model.py +0 -0
  80. {docling-2.51.0 → docling-2.52.0}/docling/models/picture_description_base_model.py +0 -0
  81. {docling-2.51.0 → docling-2.52.0}/docling/models/plugins/__init__.py +0 -0
  82. {docling-2.51.0 → docling-2.52.0}/docling/models/plugins/defaults.py +0 -0
  83. {docling-2.51.0 → docling-2.52.0}/docling/models/rapid_ocr_model.py +0 -0
  84. {docling-2.51.0 → docling-2.52.0}/docling/models/readingorder_model.py +0 -0
  85. {docling-2.51.0 → docling-2.52.0}/docling/models/table_structure_model.py +0 -0
  86. {docling-2.51.0 → docling-2.52.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  87. {docling-2.51.0 → docling-2.52.0}/docling/models/tesseract_ocr_model.py +0 -0
  88. {docling-2.51.0 → docling-2.52.0}/docling/models/utils/__init__.py +0 -0
  89. {docling-2.51.0 → docling-2.52.0}/docling/models/utils/hf_model_download.py +0 -0
  90. {docling-2.51.0 → docling-2.52.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  91. {docling-2.51.0 → docling-2.52.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  92. {docling-2.51.0 → docling-2.52.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  93. {docling-2.51.0 → docling-2.52.0}/docling/pipeline/__init__.py +0 -0
  94. {docling-2.51.0 → docling-2.52.0}/docling/py.typed +0 -0
  95. {docling-2.51.0 → docling-2.52.0}/docling/utils/__init__.py +0 -0
  96. {docling-2.51.0 → docling-2.52.0}/docling/utils/accelerator_utils.py +0 -0
  97. {docling-2.51.0 → docling-2.52.0}/docling/utils/api_image_request.py +0 -0
  98. {docling-2.51.0 → docling-2.52.0}/docling/utils/export.py +0 -0
  99. {docling-2.51.0 → docling-2.52.0}/docling/utils/glm_utils.py +0 -0
  100. {docling-2.51.0 → docling-2.52.0}/docling/utils/layout_postprocessor.py +0 -0
  101. {docling-2.51.0 → docling-2.52.0}/docling/utils/locks.py +0 -0
  102. {docling-2.51.0 → docling-2.52.0}/docling/utils/model_downloader.py +0 -0
  103. {docling-2.51.0 → docling-2.52.0}/docling/utils/ocr_utils.py +0 -0
  104. {docling-2.51.0 → docling-2.52.0}/docling/utils/orientation.py +0 -0
  105. {docling-2.51.0 → docling-2.52.0}/docling/utils/profiling.py +0 -0
  106. {docling-2.51.0 → docling-2.52.0}/docling/utils/utils.py +0 -0
  107. {docling-2.51.0 → docling-2.52.0}/docling/utils/visualization.py +0 -0
  108. {docling-2.51.0 → docling-2.52.0}/docling.egg-info/SOURCES.txt +0 -0
  109. {docling-2.51.0 → docling-2.52.0}/docling.egg-info/dependency_links.txt +0 -0
  110. {docling-2.51.0 → docling-2.52.0}/docling.egg-info/entry_points.txt +0 -0
  111. {docling-2.51.0 → docling-2.52.0}/docling.egg-info/top_level.txt +0 -0
  112. {docling-2.51.0 → docling-2.52.0}/setup.cfg +0 -0
  113. {docling-2.51.0 → docling-2.52.0}/tests/test_asr_pipeline.py +0 -0
  114. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_asciidoc.py +0 -0
  115. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_csv.py +0 -0
  116. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_docling_json.py +0 -0
  117. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_docling_parse.py +0 -0
  118. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_docling_parse_v2.py +0 -0
  119. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_docling_parse_v4.py +0 -0
  120. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_html.py +0 -0
  121. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_jats.py +0 -0
  122. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_markdown.py +0 -0
  123. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_mets_gbs.py +0 -0
  124. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_msexcel.py +0 -0
  125. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_msword.py +0 -0
  126. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_patent_uspto.py +0 -0
  127. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_pdfium.py +0 -0
  128. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_pptx.py +0 -0
  129. {docling-2.51.0 → docling-2.52.0}/tests/test_backend_webp.py +0 -0
  130. {docling-2.51.0 → docling-2.52.0}/tests/test_cli.py +0 -0
  131. {docling-2.51.0 → docling-2.52.0}/tests/test_code_formula.py +0 -0
  132. {docling-2.51.0 → docling-2.52.0}/tests/test_data_gen_flag.py +0 -0
  133. {docling-2.51.0 → docling-2.52.0}/tests/test_document_picture_classifier.py +0 -0
  134. {docling-2.51.0 → docling-2.52.0}/tests/test_e2e_conversion.py +0 -0
  135. {docling-2.51.0 → docling-2.52.0}/tests/test_e2e_ocr_conversion.py +0 -0
  136. {docling-2.51.0 → docling-2.52.0}/tests/test_extraction.py +0 -0
  137. {docling-2.51.0 → docling-2.52.0}/tests/test_input_doc.py +0 -0
  138. {docling-2.51.0 → docling-2.52.0}/tests/test_interfaces.py +0 -0
  139. {docling-2.51.0 → docling-2.52.0}/tests/test_invalid_input.py +0 -0
  140. {docling-2.51.0 → docling-2.52.0}/tests/test_legacy_format_transform.py +0 -0
  141. {docling-2.51.0 → docling-2.52.0}/tests/test_ocr_utils.py +0 -0
  142. {docling-2.51.0 → docling-2.52.0}/tests/test_options.py +0 -0
  143. {docling-2.51.0 → docling-2.52.0}/tests/test_settings_load.py +0 -0
  144. {docling-2.51.0 → docling-2.52.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.51.0
3
+ Version: 2.52.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
30
30
  Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -110,16 +110,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
111
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
112
112
  * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
113
114
  * 💻 Simple and convenient CLI
114
115
 
115
116
  ### What's new
116
117
  * 📤 Structured [information extraction][extraction] \[🧪 beta\]
118
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
119
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
117
120
 
118
121
  ### Coming soon
119
122
 
120
123
  * 📝 Metadata extraction, including title, authors, references & language
121
124
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
122
125
  * 📝 Complex chemistry understanding (Molecular structures)
126
+ * 📝 Parsing of Web Video Text Tracks (WebVTT) files
123
127
 
124
128
  ## Installation
125
129
 
@@ -145,7 +149,7 @@ result = converter.convert(source)
145
149
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
146
150
  ```
147
151
 
148
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
152
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
149
153
  the docs.
150
154
 
151
155
  ## CLI
@@ -38,16 +38,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
38
38
  * 🔍 Extensive OCR support for scanned PDFs and images
39
39
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
40
40
  * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
41
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
41
42
  * 💻 Simple and convenient CLI
42
43
 
43
44
  ### What's new
44
45
  * 📤 Structured [information extraction][extraction] \[🧪 beta\]
46
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
47
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
45
48
 
46
49
  ### Coming soon
47
50
 
48
51
  * 📝 Metadata extraction, including title, authors, references & language
49
52
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
50
53
  * 📝 Complex chemistry understanding (Molecular structures)
54
+ * 📝 Parsing of Web Video Text Tracks (WebVTT) files
51
55
 
52
56
  ## Installation
53
57
 
@@ -73,7 +77,7 @@ result = converter.convert(source)
73
77
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
74
78
  ```
75
79
 
76
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
80
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
77
81
  the docs.
78
82
 
79
83
  ## CLI
@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
48
48
  from docling.datamodel.document import ConversionResult
49
49
  from docling.datamodel.pipeline_options import (
50
50
  AsrPipelineOptions,
51
+ ConvertPipelineOptions,
51
52
  EasyOcrOptions,
52
53
  OcrOptions,
53
54
  PaginatedPipelineOptions,
@@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import (
71
72
  from docling.document_converter import (
72
73
  AudioFormatOption,
73
74
  DocumentConverter,
75
+ ExcelFormatOption,
74
76
  FormatOption,
77
+ HTMLFormatOption,
78
+ MarkdownFormatOption,
75
79
  PdfFormatOption,
80
+ PowerpointFormatOption,
81
+ WordFormatOption,
76
82
  )
77
83
  from docling.models.factories import get_ocr_factory
78
84
  from docling.pipeline.asr_pipeline import AsrPipeline
@@ -626,10 +632,33 @@ def convert( # noqa: C901
626
632
  backend=MetsGbsDocumentBackend,
627
633
  )
628
634
 
635
+ # SimplePipeline options
636
+ simple_format_option = ConvertPipelineOptions(
637
+ do_picture_description=enrich_picture_description,
638
+ do_picture_classification=enrich_picture_classes,
639
+ )
640
+ if artifacts_path is not None:
641
+ simple_format_option.artifacts_path = artifacts_path
642
+
629
643
  format_options = {
630
644
  InputFormat.PDF: pdf_format_option,
631
645
  InputFormat.IMAGE: pdf_format_option,
632
646
  InputFormat.METS_GBS: mets_gbs_format_option,
647
+ InputFormat.DOCX: WordFormatOption(
648
+ pipeline_options=simple_format_option
649
+ ),
650
+ InputFormat.PPTX: PowerpointFormatOption(
651
+ pipeline_options=simple_format_option
652
+ ),
653
+ InputFormat.XLSX: ExcelFormatOption(
654
+ pipeline_options=simple_format_option
655
+ ),
656
+ InputFormat.HTML: HTMLFormatOption(
657
+ pipeline_options=simple_format_option
658
+ ),
659
+ InputFormat.MD: MarkdownFormatOption(
660
+ pipeline_options=simple_format_option
661
+ ),
633
662
  }
634
663
 
635
664
  elif pipeline == ProcessingPipeline.VLM:
@@ -135,6 +135,8 @@ class EasyOcrOptions(OcrOptions):
135
135
  recog_network: Optional[str] = "standard"
136
136
  download_enabled: bool = True
137
137
 
138
+ suppress_mps_warnings: bool = True
139
+
138
140
  model_config = ConfigDict(
139
141
  extra="forbid",
140
142
  protected_namespaces=(),
@@ -257,11 +259,21 @@ class PipelineOptions(BaseOptions):
257
259
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
258
260
  enable_remote_services: bool = False
259
261
  allow_external_plugins: bool = False
262
+ artifacts_path: Optional[Union[Path, str]] = None
260
263
 
261
264
 
262
- class PaginatedPipelineOptions(PipelineOptions):
263
- artifacts_path: Optional[Union[Path, str]] = None
265
+ class ConvertPipelineOptions(PipelineOptions):
266
+ """Base convert pipeline options."""
267
+
268
+ do_picture_classification: bool = False # True: classify pictures in documents
269
+
270
+ do_picture_description: bool = False # True: describe pictures in documents
271
+ picture_description_options: PictureDescriptionBaseOptions = (
272
+ smolvlm_picture_description
273
+ )
274
+
264
275
 
276
+ class PaginatedPipelineOptions(ConvertPipelineOptions):
265
277
  images_scale: float = 1.0
266
278
  generate_page_images: bool = False
267
279
  generate_picture_images: bool = False
@@ -293,13 +305,11 @@ class LayoutOptions(BaseModel):
293
305
 
294
306
  class AsrPipelineOptions(PipelineOptions):
295
307
  asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
296
- artifacts_path: Optional[Union[Path, str]] = None
297
308
 
298
309
 
299
310
  class VlmExtractionPipelineOptions(PipelineOptions):
300
311
  """Options for extraction pipeline."""
301
312
 
302
- artifacts_path: Optional[Union[Path, str]] = None
303
313
  vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
304
314
 
305
315
 
@@ -310,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
310
320
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
311
321
  do_code_enrichment: bool = False # True: perform code OCR
312
322
  do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
313
- do_picture_classification: bool = False # True: classify pictures in documents
314
- do_picture_description: bool = False # True: run describe pictures in documents
315
323
  force_backend_text: bool = (
316
324
  False # (To be used with vlms, or other generative models)
317
325
  )
@@ -319,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
319
327
 
320
328
  table_structure_options: TableStructureOptions = TableStructureOptions()
321
329
  ocr_options: OcrOptions = EasyOcrOptions()
322
- picture_description_options: PictureDescriptionBaseOptions = (
323
- smolvlm_picture_description
324
- )
325
330
  layout_options: LayoutOptions = LayoutOptions()
326
331
 
327
332
  images_scale: float = 1.0
@@ -4,7 +4,13 @@ from collections.abc import Iterable
4
4
  from typing import Any, Generic, Optional, Protocol, Type, Union
5
5
 
6
6
  import numpy as np
7
- from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ DocItem,
10
+ DoclingDocument,
11
+ NodeItem,
12
+ PictureItem,
13
+ )
8
14
  from PIL.Image import Image
9
15
  from typing_extensions import TypeVar
10
16
 
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
164
170
  return None
165
171
 
166
172
  assert isinstance(element, DocItem)
167
- element_prov = element.prov[0]
168
173
 
174
+ # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
175
+ if len(element.prov) == 0 and isinstance(element, PictureItem):
176
+ embedded_im = element.get_image(conv_res.document)
177
+ if embedded_im is not None:
178
+ return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
179
+ else:
180
+ return None
181
+
182
+ # Crop the image from the page
183
+ element_prov = element.prov[0]
169
184
  bbox = element_prov.bbox
170
185
  width = bbox.r - bbox.l
171
186
  height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
183
198
  cropped_image = conv_res.pages[page_ix].get_image(
184
199
  scale=self.images_scale, cropbox=expanded_bbox
185
200
  )
201
+
202
+ # Allow for images being embedded without the page backend or page images
203
+ if cropped_image is None and isinstance(element, PictureItem):
204
+ embedded_im = element.get_image(conv_res.document)
205
+ if embedded_im is not None:
206
+ return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
207
+ else:
208
+ return None
209
+
210
+ # Return the proper cropped image
186
211
  return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
@@ -78,14 +78,17 @@ class EasyOcrModel(BaseOcrModel):
78
78
  download_enabled = False
79
79
  model_storage_directory = str(artifacts_path / self._model_repo_folder)
80
80
 
81
- self.reader = easyocr.Reader(
82
- lang_list=self.options.lang,
83
- gpu=use_gpu,
84
- model_storage_directory=model_storage_directory,
85
- recog_network=self.options.recog_network,
86
- download_enabled=download_enabled,
87
- verbose=False,
88
- )
81
+ with warnings.catch_warnings():
82
+ if self.options.suppress_mps_warnings:
83
+ warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
84
+ self.reader = easyocr.Reader(
85
+ lang_list=self.options.lang,
86
+ gpu=use_gpu,
87
+ model_storage_directory=model_storage_directory,
88
+ recog_network=self.options.recog_network,
89
+ download_enabled=download_enabled,
90
+ verbose=False,
91
+ )
89
92
 
90
93
  @staticmethod
91
94
  def download_models(
@@ -147,7 +150,14 @@ class EasyOcrModel(BaseOcrModel):
147
150
  scale=self.scale, cropbox=ocr_rect
148
151
  )
149
152
  im = numpy.array(high_res_image)
150
- result = self.reader.readtext(im)
153
+
154
+ with warnings.catch_warnings():
155
+ if self.options.suppress_mps_warnings:
156
+ warnings.filterwarnings(
157
+ "ignore", message=".*pin_memory.*MPS.*"
158
+ )
159
+
160
+ result = self.reader.readtext(im)
151
161
 
152
162
  del high_res_image
153
163
  del im
@@ -67,7 +67,7 @@ class PictureDescriptionVlmModel(
67
67
  self.model = AutoModelForImageTextToText.from_pretrained(
68
68
  artifacts_path,
69
69
  device_map=self.device,
70
- torch_dtype=torch.bfloat16,
70
+ dtype=torch.bfloat16,
71
71
  _attn_implementation=(
72
72
  "flash_attention_2"
73
73
  if self.device.startswith("cuda")
@@ -112,7 +112,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
112
112
  self.vlm_model = model_cls.from_pretrained(
113
113
  artifacts_path,
114
114
  device_map=self.device,
115
- torch_dtype=self.vlm_options.torch_dtype,
115
+ dtype=self.vlm_options.torch_dtype,
116
116
  _attn_implementation=(
117
117
  "flash_attention_2"
118
118
  if self.device.startswith("cuda")
@@ -144,7 +144,7 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
144
144
  self.vlm_model = AutoModelForImageTextToText.from_pretrained(
145
145
  artifacts_path,
146
146
  device_map=self.device,
147
- torch_dtype=self.vlm_options.torch_dtype,
147
+ dtype=self.vlm_options.torch_dtype,
148
148
  _attn_implementation=(
149
149
  "flash_attention_2"
150
150
  if self.device.startswith("cuda")
@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
208
208
 
209
209
  self.pipeline_options: AsrPipelineOptions = pipeline_options
210
210
 
211
- artifacts_path: Optional[Path] = None
212
- if pipeline_options.artifacts_path is not None:
213
- artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
214
- elif settings.artifacts_path is not None:
215
- artifacts_path = Path(settings.artifacts_path).expanduser()
216
-
217
- if artifacts_path is not None and not artifacts_path.is_dir():
218
- raise RuntimeError(
219
- f"The value of {artifacts_path=} is not valid. "
220
- "When defined, it must point to a folder containing all models required by the pipeline."
221
- )
222
-
223
211
  if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
224
212
  asr_options: InlineAsrNativeWhisperOptions = (
225
213
  self.pipeline_options.asr_options
226
214
  )
227
215
  self._model = _NativeWhisperModel(
228
216
  enabled=True, # must be always enabled for this pipeline to make sense.
229
- artifacts_path=artifacts_path,
217
+ artifacts_path=self.artifacts_path,
230
218
  accelerator_options=pipeline_options.accelerator_options,
231
219
  asr_options=asr_options,
232
220
  )
@@ -1,19 +1,33 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
+ from pathlib import Path
3
4
  from typing import Optional
4
5
 
5
6
  from docling.datamodel.base_models import ConversionStatus, ErrorItem
6
7
  from docling.datamodel.document import InputDocument
7
8
  from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
8
- from docling.datamodel.pipeline_options import BaseOptions
9
+ from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
10
+ from docling.datamodel.settings import settings
9
11
 
10
12
  _log = logging.getLogger(__name__)
11
13
 
12
14
 
13
15
  class BaseExtractionPipeline(ABC):
14
- def __init__(self, pipeline_options: BaseOptions):
16
+ def __init__(self, pipeline_options: PipelineOptions):
15
17
  self.pipeline_options = pipeline_options
16
18
 
19
+ self.artifacts_path: Optional[Path] = None
20
+ if pipeline_options.artifacts_path is not None:
21
+ self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
22
+ elif settings.artifacts_path is not None:
23
+ self.artifacts_path = Path(settings.artifacts_path).expanduser()
24
+
25
+ if self.artifacts_path is not None and not self.artifacts_path.is_dir():
26
+ raise RuntimeError(
27
+ f"The value of {self.artifacts_path=} is not valid. "
28
+ "When defined, it must point to a folder containing all models required by the pipeline."
29
+ )
30
+
17
31
  def execute(
18
32
  self,
19
33
  in_doc: InputDocument,
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
54
68
 
55
69
  @classmethod
56
70
  @abstractmethod
57
- def get_default_options(cls) -> BaseOptions:
71
+ def get_default_options(cls) -> PipelineOptions:
58
72
  pass
@@ -4,7 +4,8 @@ import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
6
  from collections.abc import Iterable
7
- from typing import Any, Callable, List
7
+ from pathlib import Path
8
+ from typing import Any, Callable, List, Optional
8
9
 
9
10
  from docling_core.types.doc import NodeItem
10
11
 
@@ -20,9 +21,19 @@ from docling.datamodel.base_models import (
20
21
  Page,
21
22
  )
22
23
  from docling.datamodel.document import ConversionResult, InputDocument
23
- from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
24
+ from docling.datamodel.pipeline_options import (
25
+ ConvertPipelineOptions,
26
+ PdfPipelineOptions,
27
+ PipelineOptions,
28
+ )
24
29
  from docling.datamodel.settings import settings
25
30
  from docling.models.base_model import GenericEnrichmentModel
31
+ from docling.models.document_picture_classifier import (
32
+ DocumentPictureClassifier,
33
+ DocumentPictureClassifierOptions,
34
+ )
35
+ from docling.models.factories import get_picture_description_factory
36
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
26
37
  from docling.utils.profiling import ProfilingScope, TimeRecorder
27
38
  from docling.utils.utils import chunkify
28
39
 
@@ -36,6 +47,18 @@ class BasePipeline(ABC):
36
47
  self.build_pipe: List[Callable] = []
37
48
  self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
38
49
 
50
+ self.artifacts_path: Optional[Path] = None
51
+ if pipeline_options.artifacts_path is not None:
52
+ self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
53
+ elif settings.artifacts_path is not None:
54
+ self.artifacts_path = Path(settings.artifacts_path).expanduser()
55
+
56
+ if self.artifacts_path is not None and not self.artifacts_path.is_dir():
57
+ raise RuntimeError(
58
+ f"The value of {self.artifacts_path=} is not valid. "
59
+ "When defined, it must point to a folder containing all models required by the pipeline."
60
+ )
61
+
39
62
  def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
40
63
  conv_res = ConversionResult(input=in_doc)
41
64
 
@@ -108,15 +131,58 @@ class BasePipeline(ABC):
108
131
  def is_backend_supported(cls, backend: AbstractDocumentBackend):
109
132
  pass
110
133
 
111
- # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
112
- # for model in self.build_pipe:
113
- # element_batch = model(element_batch)
114
- #
115
- # yield from element_batch
116
134
 
135
+ class ConvertPipeline(BasePipeline):
136
+ def __init__(self, pipeline_options: ConvertPipelineOptions):
137
+ super().__init__(pipeline_options)
138
+ self.pipeline_options: ConvertPipelineOptions
117
139
 
118
- class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
119
- def __init__(self, pipeline_options: PipelineOptions):
140
+ # ------ Common enrichment models working on all backends
141
+
142
+ # Picture description model
143
+ if (
144
+ picture_description_model := self._get_picture_description_model(
145
+ artifacts_path=self.artifacts_path
146
+ )
147
+ ) is None:
148
+ raise RuntimeError(
149
+ f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
150
+ )
151
+
152
+ self.enrichment_pipe = [
153
+ # Document Picture Classifier
154
+ DocumentPictureClassifier(
155
+ enabled=pipeline_options.do_picture_classification,
156
+ artifacts_path=self.artifacts_path,
157
+ options=DocumentPictureClassifierOptions(),
158
+ accelerator_options=pipeline_options.accelerator_options,
159
+ ),
160
+ # Document Picture description
161
+ picture_description_model,
162
+ ]
163
+
164
+ def _get_picture_description_model(
165
+ self, artifacts_path: Optional[Path] = None
166
+ ) -> Optional[PictureDescriptionBaseModel]:
167
+ factory = get_picture_description_factory(
168
+ allow_external_plugins=self.pipeline_options.allow_external_plugins
169
+ )
170
+ return factory.create_instance(
171
+ options=self.pipeline_options.picture_description_options,
172
+ enabled=self.pipeline_options.do_picture_description,
173
+ enable_remote_services=self.pipeline_options.enable_remote_services,
174
+ artifacts_path=artifacts_path,
175
+ accelerator_options=self.pipeline_options.accelerator_options,
176
+ )
177
+
178
+ @classmethod
179
+ @abstractmethod
180
+ def get_default_options(cls) -> ConvertPipelineOptions:
181
+ pass
182
+
183
+
184
+ class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name.
185
+ def __init__(self, pipeline_options: ConvertPipelineOptions):
120
186
  super().__init__(pipeline_options)
121
187
  self.keep_backend = False
122
188
 
@@ -1,7 +1,6 @@
1
1
  import inspect
2
2
  import json
3
3
  import logging
4
- from pathlib import Path
5
4
  from typing import Optional
6
5
 
7
6
  from PIL.Image import Image
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
16
15
  ExtractionResult,
17
16
  ExtractionTemplateType,
18
17
  )
19
- from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
18
+ from docling.datamodel.pipeline_options import (
19
+ PipelineOptions,
20
+ VlmExtractionPipelineOptions,
21
+ )
20
22
  from docling.datamodel.settings import settings
21
23
  from docling.models.vlm_models_inline.nuextract_transformers_model import (
22
24
  NuExtractTransformersModel,
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
35
37
  self.accelerator_options = pipeline_options.accelerator_options
36
38
  self.pipeline_options: VlmExtractionPipelineOptions
37
39
 
38
- artifacts_path: Optional[Path] = None
39
- if pipeline_options.artifacts_path is not None:
40
- artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
41
- elif settings.artifacts_path is not None:
42
- artifacts_path = Path(settings.artifacts_path).expanduser()
43
-
44
- if artifacts_path is not None and not artifacts_path.is_dir():
45
- raise RuntimeError(
46
- f"The value of {artifacts_path=} is not valid. "
47
- "When defined, it must point to a folder containing all models required by the pipeline."
48
- )
49
-
50
40
  # Create VLM model instance
51
41
  self.vlm_model = NuExtractTransformersModel(
52
42
  enabled=True,
53
- artifacts_path=artifacts_path, # Will download automatically
43
+ artifacts_path=self.artifacts_path, # Will download automatically
54
44
  accelerator_options=self.accelerator_options,
55
45
  vlm_options=pipeline_options.vlm_options,
56
46
  )
@@ -194,11 +184,14 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
194
184
  class ExtractionTemplateFactory(ModelFactory[template]): # type: ignore
195
185
  __use_examples__ = True # prefer Field(examples=...) when present
196
186
  __use_defaults__ = True # use field defaults instead of random values
187
+ __check_model__ = (
188
+ True # setting the value to avoid deprecation warnings
189
+ )
197
190
 
198
191
  return ExtractionTemplateFactory.build().model_dump_json(indent=2) # type: ignore
199
192
  else:
200
193
  raise ValueError(f"Unsupported template type: {type(template)}")
201
194
 
202
195
  @classmethod
203
- def get_default_options(cls) -> BaseOptions:
196
+ def get_default_options(cls) -> PipelineOptions:
204
197
  return VlmExtractionPipelineOptions()
@@ -6,21 +6,21 @@ from docling.backend.abstract_backend import (
6
6
  )
7
7
  from docling.datamodel.base_models import ConversionStatus
8
8
  from docling.datamodel.document import ConversionResult
9
- from docling.datamodel.pipeline_options import PipelineOptions
10
- from docling.pipeline.base_pipeline import BasePipeline
9
+ from docling.datamodel.pipeline_options import ConvertPipelineOptions
10
+ from docling.pipeline.base_pipeline import ConvertPipeline
11
11
  from docling.utils.profiling import ProfilingScope, TimeRecorder
12
12
 
13
13
  _log = logging.getLogger(__name__)
14
14
 
15
15
 
16
- class SimplePipeline(BasePipeline):
16
+ class SimplePipeline(ConvertPipeline):
17
17
  """SimpleModelPipeline.
18
18
 
19
19
  This class is used at the moment for formats / backends
20
20
  which produce straight DoclingDocument output.
21
21
  """
22
22
 
23
- def __init__(self, pipeline_options: PipelineOptions):
23
+ def __init__(self, pipeline_options: ConvertPipelineOptions):
24
24
  super().__init__(pipeline_options)
25
25
 
26
26
  def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
@@ -47,8 +47,8 @@ class SimplePipeline(BasePipeline):
47
47
  return ConversionStatus.SUCCESS
48
48
 
49
49
  @classmethod
50
- def get_default_options(cls) -> PipelineOptions:
51
- return PipelineOptions()
50
+ def get_default_options(cls) -> ConvertPipelineOptions:
51
+ return ConvertPipelineOptions()
52
52
 
53
53
  @classmethod
54
54
  def is_backend_supported(cls, backend: AbstractDocumentBackend):