docling 2.46.0__tar.gz → 2.47.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. {docling-2.46.0 → docling-2.47.0}/PKG-INFO +2 -1
  2. {docling-2.46.0 → docling-2.47.0}/docling/backend/html_backend.py +111 -13
  3. {docling-2.46.0 → docling-2.47.0}/docling/backend/msword_backend.py +126 -16
  4. {docling-2.46.0 → docling-2.47.0}/docling/cli/main.py +14 -0
  5. {docling-2.46.0 → docling-2.47.0}/docling/cli/models.py +56 -0
  6. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/base_models.py +1 -1
  7. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/pipeline_options.py +3 -0
  8. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/pipeline_options_vlm_model.py +5 -0
  9. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/vlm_model_specs.py +114 -1
  10. docling-2.47.0/docling/models/base_model.py +186 -0
  11. {docling-2.46.0 → docling-2.47.0}/docling/models/page_preprocessing_model.py +5 -1
  12. {docling-2.46.0 → docling-2.47.0}/docling/models/picture_description_vlm_model.py +4 -2
  13. docling-2.47.0/docling/models/vlm_models_inline/hf_transformers_model.py +314 -0
  14. docling-2.47.0/docling/models/vlm_models_inline/mlx_model.py +260 -0
  15. docling-2.47.0/docling/models/vlm_models_inline/vllm_model.py +235 -0
  16. {docling-2.46.0 → docling-2.47.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +1 -1
  17. {docling-2.46.0 → docling-2.47.0}/docling/pipeline/vlm_pipeline.py +14 -1
  18. docling-2.47.0/docling/py.typed +1 -0
  19. {docling-2.46.0 → docling-2.47.0}/docling/utils/layout_postprocessor.py +51 -43
  20. {docling-2.46.0 → docling-2.47.0}/docling.egg-info/PKG-INFO +2 -1
  21. {docling-2.46.0 → docling-2.47.0}/docling.egg-info/SOURCES.txt +1 -0
  22. {docling-2.46.0 → docling-2.47.0}/docling.egg-info/requires.txt +3 -0
  23. {docling-2.46.0 → docling-2.47.0}/pyproject.toml +3 -1
  24. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_html.py +1 -1
  25. docling-2.46.0/docling/models/base_model.py +0 -93
  26. docling-2.46.0/docling/models/vlm_models_inline/hf_transformers_model.py +0 -214
  27. docling-2.46.0/docling/models/vlm_models_inline/mlx_model.py +0 -149
  28. docling-2.46.0/docling/utils/__init__.py +0 -0
  29. {docling-2.46.0 → docling-2.47.0}/LICENSE +0 -0
  30. {docling-2.46.0 → docling-2.47.0}/README.md +0 -0
  31. {docling-2.46.0 → docling-2.47.0}/docling/__init__.py +0 -0
  32. {docling-2.46.0 → docling-2.47.0}/docling/backend/__init__.py +0 -0
  33. {docling-2.46.0 → docling-2.47.0}/docling/backend/abstract_backend.py +0 -0
  34. {docling-2.46.0 → docling-2.47.0}/docling/backend/asciidoc_backend.py +0 -0
  35. {docling-2.46.0 → docling-2.47.0}/docling/backend/csv_backend.py +0 -0
  36. {docling-2.46.0 → docling-2.47.0}/docling/backend/docling_parse_backend.py +0 -0
  37. {docling-2.46.0 → docling-2.47.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  38. {docling-2.46.0 → docling-2.47.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  39. {docling-2.46.0 → docling-2.47.0}/docling/backend/docx/__init__.py +0 -0
  40. {docling-2.46.0 → docling-2.47.0}/docling/backend/docx/latex/__init__.py +0 -0
  41. {docling-2.46.0 → docling-2.47.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  42. {docling-2.46.0 → docling-2.47.0}/docling/backend/docx/latex/omml.py +0 -0
  43. {docling-2.46.0 → docling-2.47.0}/docling/backend/json/__init__.py +0 -0
  44. {docling-2.46.0 → docling-2.47.0}/docling/backend/json/docling_json_backend.py +0 -0
  45. {docling-2.46.0 → docling-2.47.0}/docling/backend/md_backend.py +0 -0
  46. {docling-2.46.0 → docling-2.47.0}/docling/backend/mets_gbs_backend.py +0 -0
  47. {docling-2.46.0 → docling-2.47.0}/docling/backend/msexcel_backend.py +0 -0
  48. {docling-2.46.0 → docling-2.47.0}/docling/backend/mspowerpoint_backend.py +0 -0
  49. {docling-2.46.0 → docling-2.47.0}/docling/backend/noop_backend.py +0 -0
  50. {docling-2.46.0 → docling-2.47.0}/docling/backend/pdf_backend.py +0 -0
  51. {docling-2.46.0 → docling-2.47.0}/docling/backend/pypdfium2_backend.py +0 -0
  52. {docling-2.46.0 → docling-2.47.0}/docling/backend/xml/__init__.py +0 -0
  53. {docling-2.46.0 → docling-2.47.0}/docling/backend/xml/jats_backend.py +0 -0
  54. {docling-2.46.0 → docling-2.47.0}/docling/backend/xml/uspto_backend.py +0 -0
  55. {docling-2.46.0 → docling-2.47.0}/docling/chunking/__init__.py +0 -0
  56. {docling-2.46.0 → docling-2.47.0}/docling/cli/__init__.py +0 -0
  57. {docling-2.46.0 → docling-2.47.0}/docling/cli/tools.py +0 -0
  58. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/__init__.py +0 -0
  59. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/accelerator_options.py +0 -0
  60. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/asr_model_specs.py +0 -0
  61. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/document.py +0 -0
  62. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/layout_model_specs.py +0 -0
  63. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  64. {docling-2.46.0 → docling-2.47.0}/docling/datamodel/settings.py +0 -0
  65. {docling-2.46.0 → docling-2.47.0}/docling/document_converter.py +0 -0
  66. {docling-2.46.0 → docling-2.47.0}/docling/exceptions.py +0 -0
  67. {docling-2.46.0 → docling-2.47.0}/docling/models/__init__.py +0 -0
  68. {docling-2.46.0 → docling-2.47.0}/docling/models/api_vlm_model.py +0 -0
  69. {docling-2.46.0 → docling-2.47.0}/docling/models/base_ocr_model.py +0 -0
  70. {docling-2.46.0 → docling-2.47.0}/docling/models/code_formula_model.py +0 -0
  71. {docling-2.46.0 → docling-2.47.0}/docling/models/document_picture_classifier.py +0 -0
  72. {docling-2.46.0 → docling-2.47.0}/docling/models/easyocr_model.py +0 -0
  73. {docling-2.46.0 → docling-2.47.0}/docling/models/factories/__init__.py +0 -0
  74. {docling-2.46.0 → docling-2.47.0}/docling/models/factories/base_factory.py +0 -0
  75. {docling-2.46.0 → docling-2.47.0}/docling/models/factories/ocr_factory.py +0 -0
  76. {docling-2.46.0 → docling-2.47.0}/docling/models/factories/picture_description_factory.py +0 -0
  77. {docling-2.46.0 → docling-2.47.0}/docling/models/layout_model.py +0 -0
  78. {docling-2.46.0 → docling-2.47.0}/docling/models/ocr_mac_model.py +0 -0
  79. {docling-2.46.0 → docling-2.47.0}/docling/models/page_assemble_model.py +0 -0
  80. {docling-2.46.0 → docling-2.47.0}/docling/models/picture_description_api_model.py +0 -0
  81. {docling-2.46.0 → docling-2.47.0}/docling/models/picture_description_base_model.py +0 -0
  82. {docling-2.46.0 → docling-2.47.0}/docling/models/plugins/__init__.py +0 -0
  83. {docling-2.46.0 → docling-2.47.0}/docling/models/plugins/defaults.py +0 -0
  84. {docling-2.46.0 → docling-2.47.0}/docling/models/rapid_ocr_model.py +0 -0
  85. {docling-2.46.0 → docling-2.47.0}/docling/models/readingorder_model.py +0 -0
  86. {docling-2.46.0 → docling-2.47.0}/docling/models/table_structure_model.py +0 -0
  87. {docling-2.46.0 → docling-2.47.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  88. {docling-2.46.0 → docling-2.47.0}/docling/models/tesseract_ocr_model.py +0 -0
  89. {docling-2.46.0 → docling-2.47.0}/docling/models/utils/__init__.py +0 -0
  90. {docling-2.46.0 → docling-2.47.0}/docling/models/utils/hf_model_download.py +0 -0
  91. /docling-2.46.0/docling/py.typed → /docling-2.47.0/docling/models/vlm_models_inline/__init__.py +0 -0
  92. {docling-2.46.0/docling/models/vlm_models_inline → docling-2.47.0/docling/pipeline}/__init__.py +0 -0
  93. {docling-2.46.0 → docling-2.47.0}/docling/pipeline/asr_pipeline.py +0 -0
  94. {docling-2.46.0 → docling-2.47.0}/docling/pipeline/base_pipeline.py +0 -0
  95. {docling-2.46.0 → docling-2.47.0}/docling/pipeline/simple_pipeline.py +0 -0
  96. {docling-2.46.0 → docling-2.47.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  97. {docling-2.46.0/docling/pipeline → docling-2.47.0/docling/utils}/__init__.py +0 -0
  98. {docling-2.46.0 → docling-2.47.0}/docling/utils/accelerator_utils.py +0 -0
  99. {docling-2.46.0 → docling-2.47.0}/docling/utils/api_image_request.py +0 -0
  100. {docling-2.46.0 → docling-2.47.0}/docling/utils/export.py +0 -0
  101. {docling-2.46.0 → docling-2.47.0}/docling/utils/glm_utils.py +0 -0
  102. {docling-2.46.0 → docling-2.47.0}/docling/utils/locks.py +0 -0
  103. {docling-2.46.0 → docling-2.47.0}/docling/utils/model_downloader.py +0 -0
  104. {docling-2.46.0 → docling-2.47.0}/docling/utils/ocr_utils.py +0 -0
  105. {docling-2.46.0 → docling-2.47.0}/docling/utils/orientation.py +0 -0
  106. {docling-2.46.0 → docling-2.47.0}/docling/utils/profiling.py +0 -0
  107. {docling-2.46.0 → docling-2.47.0}/docling/utils/utils.py +0 -0
  108. {docling-2.46.0 → docling-2.47.0}/docling/utils/visualization.py +0 -0
  109. {docling-2.46.0 → docling-2.47.0}/docling.egg-info/dependency_links.txt +0 -0
  110. {docling-2.46.0 → docling-2.47.0}/docling.egg-info/entry_points.txt +0 -0
  111. {docling-2.46.0 → docling-2.47.0}/docling.egg-info/top_level.txt +0 -0
  112. {docling-2.46.0 → docling-2.47.0}/setup.cfg +0 -0
  113. {docling-2.46.0 → docling-2.47.0}/tests/test_asr_pipeline.py +0 -0
  114. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_asciidoc.py +0 -0
  115. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_csv.py +0 -0
  116. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_docling_json.py +0 -0
  117. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_docling_parse.py +0 -0
  118. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_docling_parse_v2.py +0 -0
  119. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_docling_parse_v4.py +0 -0
  120. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_jats.py +0 -0
  121. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_markdown.py +0 -0
  122. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_mets_gbs.py +0 -0
  123. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_msexcel.py +0 -0
  124. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_msword.py +0 -0
  125. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_patent_uspto.py +0 -0
  126. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_pdfium.py +0 -0
  127. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_pptx.py +0 -0
  128. {docling-2.46.0 → docling-2.47.0}/tests/test_backend_webp.py +0 -0
  129. {docling-2.46.0 → docling-2.47.0}/tests/test_cli.py +0 -0
  130. {docling-2.46.0 → docling-2.47.0}/tests/test_code_formula.py +0 -0
  131. {docling-2.46.0 → docling-2.47.0}/tests/test_data_gen_flag.py +0 -0
  132. {docling-2.46.0 → docling-2.47.0}/tests/test_document_picture_classifier.py +0 -0
  133. {docling-2.46.0 → docling-2.47.0}/tests/test_e2e_conversion.py +0 -0
  134. {docling-2.46.0 → docling-2.47.0}/tests/test_e2e_ocr_conversion.py +0 -0
  135. {docling-2.46.0 → docling-2.47.0}/tests/test_input_doc.py +0 -0
  136. {docling-2.46.0 → docling-2.47.0}/tests/test_interfaces.py +0 -0
  137. {docling-2.46.0 → docling-2.47.0}/tests/test_invalid_input.py +0 -0
  138. {docling-2.46.0 → docling-2.47.0}/tests/test_legacy_format_transform.py +0 -0
  139. {docling-2.46.0 → docling-2.47.0}/tests/test_ocr_utils.py +0 -0
  140. {docling-2.46.0 → docling-2.47.0}/tests/test_options.py +0 -0
  141. {docling-2.46.0 → docling-2.47.0}/tests/test_settings_load.py +0 -0
  142. {docling-2.46.0 → docling-2.47.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.46.0
3
+ Version: 2.47.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -59,6 +59,7 @@ Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
61
  Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
+ Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
62
63
  Provides-Extra: rapidocr
63
64
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
65
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -20,7 +20,7 @@ from docling_core.types.doc import (
20
20
  TableData,
21
21
  TextItem,
22
22
  )
23
- from docling_core.types.doc.document import ContentLayer
23
+ from docling_core.types.doc.document import ContentLayer, Formatting, Script
24
24
  from pydantic import AnyUrl, BaseModel, ValidationError
25
25
  from typing_extensions import override
26
26
 
@@ -54,6 +54,21 @@ _BLOCK_TAGS: Final = {
54
54
  "table",
55
55
  }
56
56
 
57
+ _FORMAT_TAG_MAP: Final = {
58
+ "b": {"bold": True},
59
+ "strong": {"bold": True},
60
+ "i": {"italic": True},
61
+ "em": {"italic": True},
62
+ # "mark",
63
+ # "small",
64
+ "s": {"strikethrough": True},
65
+ "del": {"strikethrough": True},
66
+ "u": {"underline": True},
67
+ "ins": {"underline": True},
68
+ "sub": {"script": Script.SUB},
69
+ "sup": {"script": Script.SUPER},
70
+ }
71
+
57
72
 
58
73
  class _Context(BaseModel):
59
74
  list_ordered_flag_by_ref: dict[str, bool] = {}
@@ -63,23 +78,34 @@ class _Context(BaseModel):
63
78
  class AnnotatedText(BaseModel):
64
79
  text: str
65
80
  hyperlink: Union[AnyUrl, Path, None] = None
81
+ formatting: Union[Formatting, None] = None
66
82
 
67
83
 
68
84
  class AnnotatedTextList(list):
69
85
  def to_single_text_element(self) -> AnnotatedText:
70
86
  current_h = None
71
87
  current_text = ""
88
+ current_f = None
72
89
  for at in self:
73
90
  t = at.text
74
91
  h = at.hyperlink
92
+ f = at.formatting
75
93
  current_text += t.strip() + " "
94
+ if f is not None and current_f is None:
95
+ current_f = f
96
+ elif f is not None and current_f is not None and f != current_f:
97
+ _log.warning(
98
+ f"Clashing formatting: '{f}' and '{current_f}'! Chose '{current_f}'"
99
+ )
76
100
  if h is not None and current_h is None:
77
101
  current_h = h
78
102
  elif h is not None and current_h is not None and h != current_h:
79
103
  _log.warning(
80
104
  f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
81
105
  )
82
- return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
106
+ return AnnotatedText(
107
+ text=current_text.strip(), hyperlink=current_h, formatting=current_f
108
+ )
83
109
 
84
110
  def simplify_text_elements(self) -> "AnnotatedTextList":
85
111
  simplified = AnnotatedTextList()
@@ -87,21 +113,27 @@ class AnnotatedTextList(list):
87
113
  return self
88
114
  text = self[0].text
89
115
  hyperlink = self[0].hyperlink
116
+ formatting = self[0].formatting
90
117
  last_elm = text
91
118
  for i in range(1, len(self)):
92
- if hyperlink == self[i].hyperlink:
119
+ if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
93
120
  sep = " "
94
121
  if not self[i].text.strip() or not last_elm.strip():
95
122
  sep = ""
96
123
  text += sep + self[i].text
97
124
  last_elm = self[i].text
98
125
  else:
99
- simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
126
+ simplified.append(
127
+ AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
128
+ )
100
129
  text = self[i].text
101
130
  last_elm = text
102
131
  hyperlink = self[i].hyperlink
132
+ formatting = self[i].formatting
103
133
  if text:
104
- simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
134
+ simplified.append(
135
+ AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
136
+ )
105
137
  return simplified
106
138
 
107
139
  def split_by_newline(self):
@@ -144,6 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
144
176
  self.parents[i] = None
145
177
  self.hyperlink = None
146
178
  self.original_url = original_url
179
+ self.format_tags: list[str] = []
147
180
 
148
181
  try:
149
182
  raw = (
@@ -254,6 +287,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
254
287
  label=DocItemLabel.TEXT,
255
288
  text=seg_clean,
256
289
  content_layer=self.content_layer,
290
+ formatting=annotated_text.formatting,
257
291
  hyperlink=annotated_text.hyperlink,
258
292
  )
259
293
 
@@ -263,6 +297,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
263
297
  if name == "img":
264
298
  flush_buffer()
265
299
  self._emit_image(node, doc)
300
+ elif name in _FORMAT_TAG_MAP:
301
+ with self.use_format([name]):
302
+ self._walk(node, doc)
266
303
  elif name == "a":
267
304
  with self.use_hyperlink(node):
268
305
  self._walk(node, doc)
@@ -292,6 +329,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
292
329
 
293
330
  flush_buffer()
294
331
 
332
+ @staticmethod
333
+ def _collect_parent_format_tags(item: PageElement) -> list[str]:
334
+ tags = []
335
+ for format_tag in _FORMAT_TAG_MAP:
336
+ this_parent = item.parent
337
+ while this_parent is not None:
338
+ if this_parent.name == format_tag:
339
+ tags.append(format_tag)
340
+ break
341
+ this_parent = this_parent.parent
342
+ return tags
343
+
344
+ @property
345
+ def _formatting(self):
346
+ kwargs = {}
347
+ for t in self.format_tags:
348
+ kwargs.update(_FORMAT_TAG_MAP[t])
349
+ if not kwargs:
350
+ return None
351
+ return Formatting(**kwargs)
352
+
295
353
  def _extract_text_and_hyperlink_recursively(
296
354
  self,
297
355
  item: PageElement,
@@ -302,15 +360,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
302
360
  result: AnnotatedTextList = AnnotatedTextList()
303
361
 
304
362
  # If find_parent_annotation, make sure that we keep track of
305
- # any a-tag that has been present in the DOM-parents already.
363
+ # any a- or formatting-tag that has been present in the
364
+ # DOM-parents already.
306
365
  if find_parent_annotation:
366
+ format_tags = self._collect_parent_format_tags(item)
307
367
  this_parent = item.parent
308
368
  while this_parent is not None:
309
369
  if this_parent.name == "a" and this_parent.get("href"):
310
- with self.use_hyperlink(this_parent):
311
- return self._extract_text_and_hyperlink_recursively(
312
- item, ignore_list
313
- )
370
+ with self.use_format(format_tags):
371
+ with self.use_hyperlink(this_parent):
372
+ return self._extract_text_and_hyperlink_recursively(
373
+ item, ignore_list
374
+ )
314
375
  this_parent = this_parent.parent
315
376
 
316
377
  if isinstance(item, PreformattedString):
@@ -320,18 +381,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
320
381
  text = item.strip()
321
382
  if text:
322
383
  return AnnotatedTextList(
323
- [AnnotatedText(text=text, hyperlink=self.hyperlink)]
384
+ [
385
+ AnnotatedText(
386
+ text=text,
387
+ hyperlink=self.hyperlink,
388
+ formatting=self._formatting,
389
+ )
390
+ ]
324
391
  )
325
392
  if keep_newlines and item.strip("\n\r") == "":
326
393
  return AnnotatedTextList(
327
- [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
394
+ [
395
+ AnnotatedText(
396
+ text="\n",
397
+ hyperlink=self.hyperlink,
398
+ formatting=self._formatting,
399
+ )
400
+ ]
328
401
  )
329
402
  return AnnotatedTextList()
330
403
 
331
404
  tag = cast(Tag, item)
332
405
  if not ignore_list or (tag.name not in ["ul", "ol"]):
333
406
  for child in tag:
334
- if isinstance(child, Tag) and child.name == "a":
407
+ if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
408
+ with self.use_format([child.name]):
409
+ result.extend(
410
+ self._extract_text_and_hyperlink_recursively(
411
+ child, ignore_list, keep_newlines=keep_newlines
412
+ )
413
+ )
414
+ elif isinstance(child, Tag) and child.name == "a":
335
415
  with self.use_hyperlink(child):
336
416
  result.extend(
337
417
  self._extract_text_and_hyperlink_recursively(
@@ -369,6 +449,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
369
449
  if this_href:
370
450
  self.hyperlink = old_hyperlink
371
451
 
452
+ @contextmanager
453
+ def use_format(self, tags: list[str]):
454
+ if not tags:
455
+ yield None
456
+ else:
457
+ self.format_tags.extend(tags)
458
+ try:
459
+ yield None
460
+ finally:
461
+ self.format_tags = self.format_tags[: -len(tags)]
462
+
372
463
  @contextmanager
373
464
  def use_inline_group(
374
465
  self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
@@ -420,6 +511,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
420
511
  self.parents[self.level + 1] = doc.add_title(
421
512
  text_clean,
422
513
  content_layer=self.content_layer,
514
+ formatting=annotated_text.formatting,
423
515
  hyperlink=annotated_text.hyperlink,
424
516
  )
425
517
  # the other levels need to be lowered by 1 if a title was set
@@ -449,6 +541,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
449
541
  orig=annotated_text.text,
450
542
  level=self.level,
451
543
  content_layer=self.content_layer,
544
+ formatting=annotated_text.formatting,
452
545
  hyperlink=annotated_text.hyperlink,
453
546
  )
454
547
  self.level += 1
@@ -529,6 +622,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
529
622
  label=DocItemLabel.TEXT,
530
623
  text=li_clean,
531
624
  content_layer=self.content_layer,
625
+ formatting=annotated_text.formatting,
532
626
  hyperlink=annotated_text.hyperlink,
533
627
  )
534
628
 
@@ -551,6 +645,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
551
645
  orig=li_text,
552
646
  parent=list_group,
553
647
  content_layer=self.content_layer,
648
+ formatting=annotated_text.formatting,
554
649
  hyperlink=annotated_text.hyperlink,
555
650
  )
556
651
 
@@ -603,6 +698,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
603
698
  label=DocItemLabel.TEXT,
604
699
  text=seg_clean,
605
700
  content_layer=self.content_layer,
701
+ formatting=annotated_text.formatting,
606
702
  hyperlink=annotated_text.hyperlink,
607
703
  )
608
704
 
@@ -637,6 +733,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
637
733
  parent=self.parents[self.level],
638
734
  text=text_clean,
639
735
  content_layer=self.content_layer,
736
+ formatting=annotated_text.formatting,
640
737
  hyperlink=annotated_text.hyperlink,
641
738
  )
642
739
 
@@ -696,6 +793,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
696
793
  text=text_clean,
697
794
  orig=caption_anno_text.text,
698
795
  content_layer=self.content_layer,
796
+ formatting=caption_anno_text.formatting,
699
797
  hyperlink=caption_anno_text.hyperlink,
700
798
  )
701
799
 
@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
67
67
 
68
68
  self.level = 0
69
69
  self.listIter = 0
70
+ # Track list counters per numId and ilvl
71
+ self.list_counters: dict[tuple[int, int], int] = {}
70
72
 
71
73
  self.history: dict[str, Any] = {
72
74
  "names": [None],
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
315
317
 
316
318
  return None, None # If the paragraph is not part of a list
317
319
 
320
+ def _get_list_counter(self, numid: int, ilvl: int) -> int:
321
+ """Get and increment the counter for a specific numId and ilvl combination."""
322
+ key = (numid, ilvl)
323
+ if key not in self.list_counters:
324
+ self.list_counters[key] = 0
325
+ self.list_counters[key] += 1
326
+ return self.list_counters[key]
327
+
328
+ def _reset_list_counters_for_new_sequence(self, numid: int):
329
+ """Reset counters when starting a new numbering sequence."""
330
+ # Reset all counters for this numid
331
+ keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
332
+ for key in keys_to_reset:
333
+ self.list_counters[key] = 0
334
+
335
+ def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
336
+ """Check if a list is numbered based on its numFmt value."""
337
+ try:
338
+ # Access the numbering part of the document
339
+ if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
340
+ return False
341
+
342
+ numbering_part = None
343
+ # Find the numbering part
344
+ for part in docx_obj.part.package.parts:
345
+ if "numbering" in part.partname:
346
+ numbering_part = part
347
+ break
348
+
349
+ if numbering_part is None:
350
+ return False
351
+
352
+ # Parse the numbering XML
353
+ numbering_root = numbering_part.element
354
+ namespaces = {
355
+ "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
356
+ }
357
+
358
+ # Find the numbering definition with the given numId
359
+ num_xpath = f".//w:num[@w:numId='{numId}']"
360
+ num_element = numbering_root.find(num_xpath, namespaces=namespaces)
361
+
362
+ if num_element is None:
363
+ return False
364
+
365
+ # Get the abstractNumId from the num element
366
+ abstract_num_id_elem = num_element.find(
367
+ ".//w:abstractNumId", namespaces=namespaces
368
+ )
369
+ if abstract_num_id_elem is None:
370
+ return False
371
+
372
+ abstract_num_id = abstract_num_id_elem.get(
373
+ "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
374
+ )
375
+ if abstract_num_id is None:
376
+ return False
377
+
378
+ # Find the abstract numbering definition
379
+ abstract_num_xpath = (
380
+ f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
381
+ )
382
+ abstract_num_element = numbering_root.find(
383
+ abstract_num_xpath, namespaces=namespaces
384
+ )
385
+
386
+ if abstract_num_element is None:
387
+ return False
388
+
389
+ # Find the level definition for the given ilvl
390
+ lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
391
+ lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
392
+
393
+ if lvl_element is None:
394
+ return False
395
+
396
+ # Get the numFmt element
397
+ num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
398
+ if num_fmt_element is None:
399
+ return False
400
+
401
+ num_fmt = num_fmt_element.get(
402
+ "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
403
+ )
404
+
405
+ # Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
406
+ # Bullet formats include: bullet
407
+ numbered_formats = {
408
+ "decimal",
409
+ "lowerRoman",
410
+ "upperRoman",
411
+ "lowerLetter",
412
+ "upperLetter",
413
+ "decimalZero",
414
+ }
415
+
416
+ return num_fmt in numbered_formats
417
+
418
+ except Exception as e:
419
+ _log.debug(f"Error determining if list is numbered: {e}")
420
+ return False
421
+
318
422
  def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
319
423
  parts = self._split_text_and_number(style_label)
320
424
 
@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
713
817
  # Common styles for bullet and numbered lists.
714
818
  # "List Bullet", "List Number", "List Paragraph"
715
819
  # Identify whether list is a numbered list or not
716
- # is_numbered = "List Bullet" not in paragraph.style.name
717
- is_numbered = False
718
820
  p_style_id, p_level = self._get_label_and_level(paragraph)
719
821
  numid, ilevel = self._get_numId_and_ilvl(paragraph)
720
822
 
@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
727
829
  and ilevel is not None
728
830
  and p_style_id not in ["Title", "Heading"]
729
831
  ):
832
+ # Check if this is actually a numbered list by examining the numFmt
833
+ is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
834
+
730
835
  self._add_list_item(
731
836
  doc=doc,
732
837
  numid=numid,
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
983
1088
  if self._prev_numid() is None: # Open new list
984
1089
  self.level_at_new_list = level
985
1090
 
1091
+ # Reset counters for the new numbering sequence
1092
+ self._reset_list_counters_for_new_sequence(numid)
1093
+
986
1094
  self.parents[level] = doc.add_list_group(
987
1095
  name="list", parent=self.parents[level - 1]
988
1096
  )
989
1097
 
990
1098
  # Set marker and enumerated arguments if this is an enumeration element.
991
- self.listIter += 1
992
1099
  if is_numbered:
993
- enum_marker = str(self.listIter) + "."
994
- is_numbered = True
1100
+ counter = self._get_list_counter(numid, ilevel)
1101
+ enum_marker = str(counter) + "."
1102
+ else:
1103
+ enum_marker = ""
995
1104
  self._add_formatted_list_item(
996
1105
  doc, elements, enum_marker, is_numbered, level
997
1106
  )
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1005
1114
  self.level_at_new_list + prev_indent + 1,
1006
1115
  self.level_at_new_list + ilevel + 1,
1007
1116
  ):
1008
- self.listIter = 0
1009
1117
  self.parents[i] = doc.add_list_group(
1010
1118
  name="list", parent=self.parents[i - 1]
1011
1119
  )
1012
1120
 
1013
1121
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
1014
- self.listIter += 1
1015
1122
  if is_numbered:
1016
- enum_marker = str(self.listIter) + "."
1017
- is_numbered = True
1123
+ counter = self._get_list_counter(numid, ilevel)
1124
+ enum_marker = str(counter) + "."
1125
+ else:
1126
+ enum_marker = ""
1018
1127
  self._add_formatted_list_item(
1019
1128
  doc,
1020
1129
  elements,
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1033
1142
  self.parents[k] = None
1034
1143
 
1035
1144
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
1036
- self.listIter += 1
1037
1145
  if is_numbered:
1038
- enum_marker = str(self.listIter) + "."
1039
- is_numbered = True
1146
+ counter = self._get_list_counter(numid, ilevel)
1147
+ enum_marker = str(counter) + "."
1148
+ else:
1149
+ enum_marker = ""
1040
1150
  self._add_formatted_list_item(
1041
1151
  doc,
1042
1152
  elements,
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1044
1154
  is_numbered,
1045
1155
  self.level_at_new_list + ilevel,
1046
1156
  )
1047
- self.listIter = 0
1048
1157
 
1049
1158
  elif self._prev_numid() == numid or prev_indent == ilevel:
1050
1159
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
1051
- self.listIter += 1
1052
1160
  if is_numbered:
1053
- enum_marker = str(self.listIter) + "."
1054
- is_numbered = True
1161
+ counter = self._get_list_counter(numid, ilevel)
1162
+ enum_marker = str(counter) + "."
1163
+ else:
1164
+ enum_marker = ""
1055
1165
  self._add_formatted_list_item(
1056
1166
  doc, elements, enum_marker, is_numbered, level - 1
1057
1167
  )
@@ -60,10 +60,12 @@ from docling.datamodel.pipeline_options import (
60
60
  )
61
61
  from docling.datamodel.settings import settings
62
62
  from docling.datamodel.vlm_model_specs import (
63
+ GOT2_TRANSFORMERS,
63
64
  GRANITE_VISION_OLLAMA,
64
65
  GRANITE_VISION_TRANSFORMERS,
65
66
  SMOLDOCLING_MLX,
66
67
  SMOLDOCLING_TRANSFORMERS,
68
+ SMOLDOCLING_VLLM,
67
69
  VlmModelType,
68
70
  )
69
71
  from docling.document_converter import (
@@ -477,6 +479,13 @@ def convert( # noqa: C901
477
479
  "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
478
480
  ),
479
481
  ] = None,
482
+ page_batch_size: Annotated[
483
+ int,
484
+ typer.Option(
485
+ "--page-batch-size",
486
+ help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
487
+ ),
488
+ ] = settings.perf.page_batch_size,
480
489
  ):
481
490
  log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
482
491
 
@@ -491,6 +500,7 @@ def convert( # noqa: C901
491
500
  settings.debug.visualize_layout = debug_visualize_layout
492
501
  settings.debug.visualize_tables = debug_visualize_tables
493
502
  settings.debug.visualize_ocr = debug_visualize_ocr
503
+ settings.perf.page_batch_size = page_batch_size
494
504
 
495
505
  if from_formats is None:
496
506
  from_formats = list(InputFormat)
@@ -631,6 +641,8 @@ def convert( # noqa: C901
631
641
  pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
632
642
  elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
633
643
  pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
644
+ elif vlm_model == VlmModelType.GOT_OCR_2:
645
+ pipeline_options.vlm_options = GOT2_TRANSFORMERS
634
646
  elif vlm_model == VlmModelType.SMOLDOCLING:
635
647
  pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
636
648
  if sys.platform == "darwin":
@@ -643,6 +655,8 @@ def convert( # noqa: C901
643
655
  "To run SmolDocling faster, please install mlx-vlm:\n"
644
656
  "pip install mlx-vlm"
645
657
  )
658
+ elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
659
+ pipeline_options.vlm_options = SMOLDOCLING_VLLM
646
660
 
647
661
  pdf_format_option = PdfFormatOption(
648
662
  pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
@@ -9,6 +9,7 @@ from rich.console import Console
9
9
  from rich.logging import RichHandler
10
10
 
11
11
  from docling.datamodel.settings import settings
12
+ from docling.models.utils.hf_model_download import download_hf_model
12
13
  from docling.utils.model_downloader import download_models
13
14
 
14
15
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -128,6 +129,61 @@ def download(
128
129
  )
129
130
 
130
131
 
132
+ @app.command("download-hf-repo")
133
+ def download_hf_repo(
134
+ models: Annotated[
135
+ list[str],
136
+ typer.Argument(
137
+ help="Specific models to download from HuggingFace identified by their repo id. For example: ds4sd/docling-models .",
138
+ ),
139
+ ],
140
+ output_dir: Annotated[
141
+ Path,
142
+ typer.Option(
143
+ ...,
144
+ "-o",
145
+ "--output-dir",
146
+ help="The directory where to download the models.",
147
+ ),
148
+ ] = (settings.cache_dir / "models"),
149
+ force: Annotated[
150
+ bool, typer.Option(..., help="If true, the download will be forced.")
151
+ ] = False,
152
+ quiet: Annotated[
153
+ bool,
154
+ typer.Option(
155
+ ...,
156
+ "-q",
157
+ "--quiet",
158
+ help="No extra output is generated, the CLI prints only the directory with the cached models.",
159
+ ),
160
+ ] = False,
161
+ ):
162
+ if not quiet:
163
+ logging.basicConfig(
164
+ level=logging.INFO,
165
+ format="[blue]%(message)s[/blue]",
166
+ datefmt="[%X]",
167
+ handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
168
+ )
169
+
170
+ for item in models:
171
+ typer.secho(f"\nDownloading {item} model from HuggingFace...")
172
+ download_hf_model(
173
+ repo_id=item,
174
+ # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76
175
+ # but creating options objects seems like overkill
176
+ local_dir=output_dir / item.replace("/", "--"),
177
+ force=force,
178
+ progress=(not quiet),
179
+ )
180
+
181
+ if quiet:
182
+ typer.echo(output_dir)
183
+ else:
184
+ typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
185
+
186
+
131
187
  click_app = typer.main.get_command(app)
132
188
 
133
189
  if __name__ == "__main__":
@@ -1,7 +1,7 @@
1
1
  import math
2
2
  from collections import defaultdict
3
3
  from enum import Enum
4
- from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
4
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
5
5
 
6
6
  import numpy as np
7
7
  from docling_core.types.doc import (
@@ -282,6 +282,9 @@ class LayoutOptions(BaseModel):
282
282
  keep_empty_clusters: bool = (
283
283
  False # Whether to keep clusters that contain no text cells
284
284
  )
285
+ skip_cell_assignment: bool = (
286
+ False # Skip cell-to-cluster assignment for VLM-only processing
287
+ )
285
288
  model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
286
289
 
287
290
 
@@ -26,11 +26,14 @@ class ResponseFormat(str, Enum):
26
26
  DOCTAGS = "doctags"
27
27
  MARKDOWN = "markdown"
28
28
  HTML = "html"
29
+ OTSL = "otsl"
30
+ PLAINTEXT = "plaintext"
29
31
 
30
32
 
31
33
  class InferenceFramework(str, Enum):
32
34
  MLX = "mlx"
33
35
  TRANSFORMERS = "transformers"
36
+ VLLM = "vllm"
34
37
 
35
38
 
36
39
  class TransformersModelType(str, Enum):
@@ -43,6 +46,7 @@ class TransformersModelType(str, Enum):
43
46
  class TransformersPromptStyle(str, Enum):
44
47
  CHAT = "chat"
45
48
  RAW = "raw"
49
+ NONE = "none"
46
50
 
47
51
 
48
52
  class InlineVlmOptions(BaseVlmOptions):
@@ -68,6 +72,7 @@ class InlineVlmOptions(BaseVlmOptions):
68
72
 
69
73
  stop_strings: List[str] = []
70
74
  extra_generation_config: Dict[str, Any] = {}
75
+ extra_processor_kwargs: Dict[str, Any] = {}
71
76
 
72
77
  use_kv_cache: bool = True
73
78
  max_new_tokens: int = 4096