docling 2.43.0__tar.gz → 2.44.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. {docling-2.43.0 → docling-2.44.0}/PKG-INFO +2 -2
  2. {docling-2.43.0 → docling-2.44.0}/docling/backend/html_backend.py +77 -12
  3. {docling-2.43.0 → docling-2.44.0}/docling/cli/main.py +6 -0
  4. {docling-2.43.0 → docling-2.44.0}/docling/document_converter.py +30 -0
  5. {docling-2.43.0 → docling-2.44.0}/docling/models/vlm_models_inline/mlx_model.py +2 -2
  6. {docling-2.43.0 → docling-2.44.0}/docling.egg-info/PKG-INFO +2 -2
  7. {docling-2.43.0 → docling-2.44.0}/docling.egg-info/requires.txt +1 -1
  8. {docling-2.43.0 → docling-2.44.0}/pyproject.toml +2 -2
  9. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_html.py +16 -0
  10. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_markdown.py +52 -1
  11. {docling-2.43.0 → docling-2.44.0}/LICENSE +0 -0
  12. {docling-2.43.0 → docling-2.44.0}/README.md +0 -0
  13. {docling-2.43.0 → docling-2.44.0}/docling/__init__.py +0 -0
  14. {docling-2.43.0 → docling-2.44.0}/docling/backend/__init__.py +0 -0
  15. {docling-2.43.0 → docling-2.44.0}/docling/backend/abstract_backend.py +0 -0
  16. {docling-2.43.0 → docling-2.44.0}/docling/backend/asciidoc_backend.py +0 -0
  17. {docling-2.43.0 → docling-2.44.0}/docling/backend/csv_backend.py +0 -0
  18. {docling-2.43.0 → docling-2.44.0}/docling/backend/docling_parse_backend.py +0 -0
  19. {docling-2.43.0 → docling-2.44.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  20. {docling-2.43.0 → docling-2.44.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  21. {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/__init__.py +0 -0
  22. {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/latex/__init__.py +0 -0
  23. {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  24. {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/latex/omml.py +0 -0
  25. {docling-2.43.0 → docling-2.44.0}/docling/backend/json/__init__.py +0 -0
  26. {docling-2.43.0 → docling-2.44.0}/docling/backend/json/docling_json_backend.py +0 -0
  27. {docling-2.43.0 → docling-2.44.0}/docling/backend/md_backend.py +0 -0
  28. {docling-2.43.0 → docling-2.44.0}/docling/backend/msexcel_backend.py +0 -0
  29. {docling-2.43.0 → docling-2.44.0}/docling/backend/mspowerpoint_backend.py +0 -0
  30. {docling-2.43.0 → docling-2.44.0}/docling/backend/msword_backend.py +0 -0
  31. {docling-2.43.0 → docling-2.44.0}/docling/backend/noop_backend.py +0 -0
  32. {docling-2.43.0 → docling-2.44.0}/docling/backend/pdf_backend.py +0 -0
  33. {docling-2.43.0 → docling-2.44.0}/docling/backend/pypdfium2_backend.py +0 -0
  34. {docling-2.43.0 → docling-2.44.0}/docling/backend/xml/__init__.py +0 -0
  35. {docling-2.43.0 → docling-2.44.0}/docling/backend/xml/jats_backend.py +0 -0
  36. {docling-2.43.0 → docling-2.44.0}/docling/backend/xml/uspto_backend.py +0 -0
  37. {docling-2.43.0 → docling-2.44.0}/docling/chunking/__init__.py +0 -0
  38. {docling-2.43.0 → docling-2.44.0}/docling/cli/__init__.py +0 -0
  39. {docling-2.43.0 → docling-2.44.0}/docling/cli/models.py +0 -0
  40. {docling-2.43.0 → docling-2.44.0}/docling/cli/tools.py +0 -0
  41. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/__init__.py +0 -0
  42. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/accelerator_options.py +0 -0
  43. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/asr_model_specs.py +0 -0
  44. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/base_models.py +0 -0
  45. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/document.py +0 -0
  46. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/layout_model_specs.py +0 -0
  47. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/pipeline_options.py +0 -0
  48. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  49. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  50. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/settings.py +0 -0
  51. {docling-2.43.0 → docling-2.44.0}/docling/datamodel/vlm_model_specs.py +0 -0
  52. {docling-2.43.0 → docling-2.44.0}/docling/exceptions.py +0 -0
  53. {docling-2.43.0 → docling-2.44.0}/docling/models/__init__.py +0 -0
  54. {docling-2.43.0 → docling-2.44.0}/docling/models/api_vlm_model.py +0 -0
  55. {docling-2.43.0 → docling-2.44.0}/docling/models/base_model.py +0 -0
  56. {docling-2.43.0 → docling-2.44.0}/docling/models/base_ocr_model.py +0 -0
  57. {docling-2.43.0 → docling-2.44.0}/docling/models/code_formula_model.py +0 -0
  58. {docling-2.43.0 → docling-2.44.0}/docling/models/document_picture_classifier.py +0 -0
  59. {docling-2.43.0 → docling-2.44.0}/docling/models/easyocr_model.py +0 -0
  60. {docling-2.43.0 → docling-2.44.0}/docling/models/factories/__init__.py +0 -0
  61. {docling-2.43.0 → docling-2.44.0}/docling/models/factories/base_factory.py +0 -0
  62. {docling-2.43.0 → docling-2.44.0}/docling/models/factories/ocr_factory.py +0 -0
  63. {docling-2.43.0 → docling-2.44.0}/docling/models/factories/picture_description_factory.py +0 -0
  64. {docling-2.43.0 → docling-2.44.0}/docling/models/layout_model.py +0 -0
  65. {docling-2.43.0 → docling-2.44.0}/docling/models/ocr_mac_model.py +0 -0
  66. {docling-2.43.0 → docling-2.44.0}/docling/models/page_assemble_model.py +0 -0
  67. {docling-2.43.0 → docling-2.44.0}/docling/models/page_preprocessing_model.py +0 -0
  68. {docling-2.43.0 → docling-2.44.0}/docling/models/picture_description_api_model.py +0 -0
  69. {docling-2.43.0 → docling-2.44.0}/docling/models/picture_description_base_model.py +0 -0
  70. {docling-2.43.0 → docling-2.44.0}/docling/models/picture_description_vlm_model.py +0 -0
  71. {docling-2.43.0 → docling-2.44.0}/docling/models/plugins/__init__.py +0 -0
  72. {docling-2.43.0 → docling-2.44.0}/docling/models/plugins/defaults.py +0 -0
  73. {docling-2.43.0 → docling-2.44.0}/docling/models/rapid_ocr_model.py +0 -0
  74. {docling-2.43.0 → docling-2.44.0}/docling/models/readingorder_model.py +0 -0
  75. {docling-2.43.0 → docling-2.44.0}/docling/models/table_structure_model.py +0 -0
  76. {docling-2.43.0 → docling-2.44.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  77. {docling-2.43.0 → docling-2.44.0}/docling/models/tesseract_ocr_model.py +0 -0
  78. {docling-2.43.0 → docling-2.44.0}/docling/models/utils/__init__.py +0 -0
  79. {docling-2.43.0 → docling-2.44.0}/docling/models/utils/hf_model_download.py +0 -0
  80. {docling-2.43.0 → docling-2.44.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  81. {docling-2.43.0 → docling-2.44.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  82. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/__init__.py +0 -0
  83. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/asr_pipeline.py +0 -0
  84. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/base_pipeline.py +0 -0
  85. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/simple_pipeline.py +0 -0
  86. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  87. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  88. {docling-2.43.0 → docling-2.44.0}/docling/pipeline/vlm_pipeline.py +0 -0
  89. {docling-2.43.0 → docling-2.44.0}/docling/py.typed +0 -0
  90. {docling-2.43.0 → docling-2.44.0}/docling/utils/__init__.py +0 -0
  91. {docling-2.43.0 → docling-2.44.0}/docling/utils/accelerator_utils.py +0 -0
  92. {docling-2.43.0 → docling-2.44.0}/docling/utils/api_image_request.py +0 -0
  93. {docling-2.43.0 → docling-2.44.0}/docling/utils/export.py +0 -0
  94. {docling-2.43.0 → docling-2.44.0}/docling/utils/glm_utils.py +0 -0
  95. {docling-2.43.0 → docling-2.44.0}/docling/utils/layout_postprocessor.py +0 -0
  96. {docling-2.43.0 → docling-2.44.0}/docling/utils/locks.py +0 -0
  97. {docling-2.43.0 → docling-2.44.0}/docling/utils/model_downloader.py +0 -0
  98. {docling-2.43.0 → docling-2.44.0}/docling/utils/ocr_utils.py +0 -0
  99. {docling-2.43.0 → docling-2.44.0}/docling/utils/orientation.py +0 -0
  100. {docling-2.43.0 → docling-2.44.0}/docling/utils/profiling.py +0 -0
  101. {docling-2.43.0 → docling-2.44.0}/docling/utils/utils.py +0 -0
  102. {docling-2.43.0 → docling-2.44.0}/docling/utils/visualization.py +0 -0
  103. {docling-2.43.0 → docling-2.44.0}/docling.egg-info/SOURCES.txt +0 -0
  104. {docling-2.43.0 → docling-2.44.0}/docling.egg-info/dependency_links.txt +0 -0
  105. {docling-2.43.0 → docling-2.44.0}/docling.egg-info/entry_points.txt +0 -0
  106. {docling-2.43.0 → docling-2.44.0}/docling.egg-info/top_level.txt +0 -0
  107. {docling-2.43.0 → docling-2.44.0}/setup.cfg +0 -0
  108. {docling-2.43.0 → docling-2.44.0}/tests/test_asr_pipeline.py +0 -0
  109. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_asciidoc.py +0 -0
  110. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_csv.py +0 -0
  111. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_json.py +0 -0
  112. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_parse.py +0 -0
  113. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_parse_v2.py +0 -0
  114. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_parse_v4.py +0 -0
  115. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_jats.py +0 -0
  116. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_msexcel.py +0 -0
  117. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_msword.py +0 -0
  118. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_patent_uspto.py +0 -0
  119. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_pdfium.py +0 -0
  120. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_pptx.py +0 -0
  121. {docling-2.43.0 → docling-2.44.0}/tests/test_backend_webp.py +0 -0
  122. {docling-2.43.0 → docling-2.44.0}/tests/test_cli.py +0 -0
  123. {docling-2.43.0 → docling-2.44.0}/tests/test_code_formula.py +0 -0
  124. {docling-2.43.0 → docling-2.44.0}/tests/test_data_gen_flag.py +0 -0
  125. {docling-2.43.0 → docling-2.44.0}/tests/test_document_picture_classifier.py +0 -0
  126. {docling-2.43.0 → docling-2.44.0}/tests/test_e2e_conversion.py +0 -0
  127. {docling-2.43.0 → docling-2.44.0}/tests/test_e2e_ocr_conversion.py +0 -0
  128. {docling-2.43.0 → docling-2.44.0}/tests/test_input_doc.py +0 -0
  129. {docling-2.43.0 → docling-2.44.0}/tests/test_interfaces.py +0 -0
  130. {docling-2.43.0 → docling-2.44.0}/tests/test_invalid_input.py +0 -0
  131. {docling-2.43.0 → docling-2.44.0}/tests/test_legacy_format_transform.py +0 -0
  132. {docling-2.43.0 → docling-2.44.0}/tests/test_ocr_utils.py +0 -0
  133. {docling-2.43.0 → docling-2.44.0}/tests/test_options.py +0 -0
  134. {docling-2.43.0 → docling-2.44.0}/tests/test_settings_load.py +0 -0
  135. {docling-2.43.0 → docling-2.44.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.43.0
3
+ Version: 2.44.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
58
58
  Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
- Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
62
  Provides-Extra: rapidocr
63
63
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
64
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -125,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
125
125
  # set the title as furniture, since it is part of the document metadata
126
126
  title = self.soup.title
127
127
  if title:
128
+ title_text = title.get_text(separator=" ", strip=True)
129
+ title_clean = HTMLDocumentBackend._clean_unicode(title_text)
128
130
  doc.add_title(
129
- text=title.get_text(separator=" ", strip=True),
131
+ text=title_clean,
132
+ orig=title_text,
130
133
  content_layer=ContentLayer.FURNITURE,
131
134
  )
132
135
  # remove scripts/styles
@@ -168,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
168
171
  return
169
172
  for part in text.split("\n"):
170
173
  seg = part.strip()
174
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
171
175
  if seg:
172
176
  doc.add_text(
173
- DocItemLabel.TEXT,
174
- seg,
177
+ label=DocItemLabel.TEXT,
178
+ text=seg_clean,
179
+ orig=seg,
175
180
  parent=self.parents[self.level],
176
181
  content_layer=self.content_layer,
177
182
  )
@@ -203,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
203
208
  self.content_layer = ContentLayer.BODY
204
209
  level = int(tag_name[1])
205
210
  text = tag.get_text(strip=True, separator=" ")
211
+ text_clean = HTMLDocumentBackend._clean_unicode(text)
206
212
  # the first level is for the title item
207
213
  if level == 1:
208
214
  for key in self.parents.keys():
209
215
  self.parents[key] = None
210
216
  self.level = 0
211
217
  self.parents[self.level + 1] = doc.add_title(
212
- text, content_layer=self.content_layer
218
+ text=text_clean, orig=text, content_layer=self.content_layer
213
219
  )
214
220
  # the other levels need to be lowered by 1 if a title was set
215
221
  else:
@@ -234,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
234
240
  self.level = level
235
241
  self.parents[self.level + 1] = doc.add_heading(
236
242
  parent=self.parents[self.level],
237
- text=text,
243
+ text=text_clean,
244
+ orig=text,
238
245
  level=self.level,
239
246
  content_layer=self.content_layer,
240
247
  )
@@ -296,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
296
303
  if text_part:
297
304
  parts.append(text_part)
298
305
  li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
306
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
299
307
 
300
308
  # 3) add the list item
301
309
  if li_text:
302
310
  self.parents[self.level + 1] = doc.add_list_item(
303
- text=li_text,
311
+ text=li_clean,
304
312
  enumerated=is_ordered,
305
313
  marker=marker,
314
+ orig=li_text,
306
315
  parent=list_group,
307
316
  content_layer=self.content_layer,
308
317
  )
@@ -344,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
344
353
  elif tag_name in {"p", "address", "summary"}:
345
354
  for part in tag.text.split("\n"):
346
355
  seg = part.strip()
356
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
347
357
  if seg:
348
358
  doc.add_text(
349
- parent=self.parents[self.level],
350
359
  label=DocItemLabel.TEXT,
351
- text=seg,
360
+ text=seg_clean,
361
+ orig=seg,
362
+ parent=self.parents[self.level],
352
363
  content_layer=self.content_layer,
353
364
  )
354
365
  for img_tag in tag("img"):
@@ -370,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
370
381
  elif tag_name in {"pre", "code"}:
371
382
  # handle monospace code snippets (pre).
372
383
  text = tag.get_text(strip=True)
384
+ text_clean = HTMLDocumentBackend._clean_unicode(text)
373
385
  if text:
374
386
  doc.add_code(
375
387
  parent=self.parents[self.level],
376
- text=text,
388
+ text=text_clean,
389
+ orig=text,
377
390
  content_layer=self.content_layer,
378
391
  )
379
392
 
@@ -402,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
402
415
 
403
416
  caption_item: Optional[TextItem] = None
404
417
  if caption:
418
+ caption_clean = HTMLDocumentBackend._clean_unicode(caption)
405
419
  caption_item = doc.add_text(
406
- DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
420
+ label=DocItemLabel.CAPTION,
421
+ text=caption_clean,
422
+ orig=caption,
423
+ content_layer=self.content_layer,
407
424
  )
408
425
 
409
426
  doc.add_picture(
@@ -442,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
442
459
 
443
460
  return "".join(parts)
444
461
 
462
+ @staticmethod
463
+ def _clean_unicode(text: str) -> str:
464
+ """Replace typical Unicode characters in HTML for text processing.
465
+
466
+ Several Unicode characters (e.g., non-printable or formatting) are typically
467
+ found in HTML but are worth replacing to sanitize text and ensure consistency
468
+ in text processing tasks.
469
+
470
+ Args:
471
+ text: The original text.
472
+
473
+ Returns:
474
+ The sanitized text without typical Unicode characters.
475
+ """
476
+ replacements = {
477
+ "\u00a0": " ", # non-breaking space
478
+ "\u200b": "", # zero-width space
479
+ "\u200c": "", # zero-width non-joiner
480
+ "\u200d": "", # zero-width joiner
481
+ "\u2010": "-", # hyphen
482
+ "\u2011": "-", # non-breaking hyphen
483
+ "\u2012": "-", # dash
484
+ "\u2013": "-", # dash
485
+ "\u2014": "-", # dash
486
+ "\u2015": "-", # horizontal bar
487
+ "\u2018": "'", # left single quotation mark
488
+ "\u2019": "'", # right single quotation mark
489
+ "\u201c": '"', # left double quotation mark
490
+ "\u201d": '"', # right double quotation mark
491
+ "\u2026": "...", # ellipsis
492
+ "\u00ad": "", # soft hyphen
493
+ "\ufeff": "", # zero width non-break space
494
+ "\u202f": " ", # narrow non-break space
495
+ "\u2060": "", # word joiner
496
+ }
497
+ for raw, clean in replacements.items():
498
+ text = text.replace(raw, clean)
499
+
500
+ return text
501
+
445
502
  @staticmethod
446
503
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
447
504
  """Extract colspan and rowspan values from a table cell tag.
@@ -454,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
454
511
  str(cell.get("colspan", "1")),
455
512
  str(cell.get("rowspan", "1")),
456
513
  )
514
+
515
+ def _extract_num(s: str) -> int:
516
+ if s and s[0].isnumeric():
517
+ match = re.search(r"\d+", s)
518
+ if match:
519
+ return int(match.group())
520
+ return 1
521
+
457
522
  int_spans: tuple[int, int] = (
458
- int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
459
- int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
523
+ _extract_num(raw_spans[0]),
524
+ _extract_num(raw_spans[1]),
460
525
  )
461
526
 
462
527
  return int_spans
@@ -262,6 +262,12 @@ def export_documents(
262
262
 
263
263
  else:
264
264
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
265
+ if _log.isEnabledFor(logging.INFO):
266
+ for err in conv_res.errors:
267
+ _log.info(
268
+ f" [Failure Detail] Component: {err.component_type}, "
269
+ f"Module: {err.module_name}, Message: {err.error_message}"
270
+ )
265
271
  failure_count += 1
266
272
 
267
273
  _log.info(
@@ -5,7 +5,9 @@ import threading
5
5
  import time
6
6
  from collections.abc import Iterable, Iterator
7
7
  from concurrent.futures import ThreadPoolExecutor
8
+ from datetime import datetime
8
9
  from functools import partial
10
+ from io import BytesIO
9
11
  from pathlib import Path
10
12
  from typing import Dict, List, Optional, Tuple, Type, Union
11
13
 
@@ -275,6 +277,34 @@ class DocumentConverter:
275
277
  "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
276
278
  )
277
279
 
280
+ @validate_call(config=ConfigDict(strict=True))
281
+ def convert_string(
282
+ self,
283
+ content: str,
284
+ format: InputFormat,
285
+ name: Optional[str],
286
+ ) -> ConversionResult:
287
+ name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
288
+
289
+ if format == InputFormat.MD:
290
+ if not name.endswith(".md"):
291
+ name += ".md"
292
+
293
+ buff = BytesIO(content.encode("utf-8"))
294
+ doc_stream = DocumentStream(name=name, stream=buff)
295
+
296
+ return self.convert(doc_stream)
297
+ elif format == InputFormat.HTML:
298
+ if not name.endswith(".html"):
299
+ name += ".html"
300
+
301
+ buff = BytesIO(content.encode("utf-8"))
302
+ doc_stream = DocumentStream(name=name, stream=buff)
303
+
304
+ return self.convert(doc_stream)
305
+ else:
306
+ raise ValueError(f"format {format} is not supported in `convert_string`")
307
+
278
308
  def _convert(
279
309
  self, conv_input: _DocumentConversionInput, raises_on_error: bool
280
310
  ) -> Iterator[ConversionResult]:
@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
35
35
 
36
36
  if self.enabled:
37
37
  try:
38
- from mlx_vlm import generate, load # type: ignore
38
+ from mlx_vlm import generate, load, stream_generate # type: ignore
39
39
  from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
40
- from mlx_vlm.utils import load_config, stream_generate # type: ignore
40
+ from mlx_vlm.utils import load_config # type: ignore
41
41
  except ImportError:
42
42
  raise ImportError(
43
43
  "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.43.0
3
+ Version: 2.44.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
58
58
  Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
- Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
62
  Provides-Extra: rapidocr
63
63
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
64
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -47,4 +47,4 @@ transformers<5.0.0,>=4.46.0
47
47
  accelerate<2.0.0,>=1.2.1
48
48
 
49
49
  [vlm:python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"]
50
- mlx-vlm<0.2,>=0.1.22
50
+ mlx-vlm<1.0.0,>=0.3.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.43.0" # DO NOT EDIT, updated automatically
3
+ version = "2.44.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -92,7 +92,7 @@ ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
92
92
  vlm = [
93
93
  'transformers (>=4.46.0,<5.0.0)',
94
94
  'accelerate (>=1.2.1,<2.0.0)',
95
- 'mlx-vlm (>=0.1.22,<0.2) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
95
+ 'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
96
96
  ]
97
97
  rapidocr = [
98
98
  'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
@@ -100,6 +100,22 @@ def test_ordered_lists():
100
100
  assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
101
101
 
102
102
 
103
+ def test_unicode_characters():
104
+ raw_html = "<html><body><h1>Hello World!</h1></body></html>".encode() # noqa: RUF001
105
+ in_doc = InputDocument(
106
+ path_or_stream=BytesIO(raw_html),
107
+ format=InputFormat.HTML,
108
+ backend=HTMLDocumentBackend,
109
+ filename="test",
110
+ )
111
+ backend = HTMLDocumentBackend(
112
+ in_doc=in_doc,
113
+ path_or_stream=BytesIO(raw_html),
114
+ )
115
+ doc: DoclingDocument = backend.convert()
116
+ assert doc.texts[0].text == "Hello World!"
117
+
118
+
103
119
  def get_html_paths():
104
120
  # Define the directory you want to search
105
121
  directory = Path("./tests/data/html/")
@@ -2,10 +2,19 @@ from pathlib import Path
2
2
 
3
3
  from docling.backend.md_backend import MarkdownDocumentBackend
4
4
  from docling.datamodel.base_models import InputFormat
5
- from docling.datamodel.document import DoclingDocument, InputDocument
5
+ from docling.datamodel.document import (
6
+ ConversionResult,
7
+ DoclingDocument,
8
+ InputDocument,
9
+ SectionHeaderItem,
10
+ )
11
+ from docling.document_converter import DocumentConverter
6
12
  from tests.verify_utils import CONFID_PREC, COORD_PREC
7
13
 
8
14
  from .test_data_gen_flag import GEN_TEST_DATA
15
+ from .verify_utils import verify_document, verify_export
16
+
17
+ GENERATE = GEN_TEST_DATA
9
18
 
10
19
 
11
20
  def test_convert_valid():
@@ -54,3 +63,45 @@ def test_convert_valid():
54
63
  if in_path.stem in yaml_filter:
55
64
  exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
56
65
  assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
66
+
67
+
68
+ def get_md_paths():
69
+ # Define the directory you want to search
70
+ directory = Path("./tests/groundtruth/docling_v2")
71
+
72
+ # List all MD files in the directory and its subdirectories
73
+ md_files = sorted(directory.rglob("*.md"))
74
+ return md_files
75
+
76
+
77
+ def get_converter():
78
+ converter = DocumentConverter(allowed_formats=[InputFormat.MD])
79
+
80
+ return converter
81
+
82
+
83
+ def test_e2e_md_conversions():
84
+ md_paths = get_md_paths()
85
+ converter = get_converter()
86
+
87
+ for md_path in md_paths:
88
+ # print(f"converting {md_path}")
89
+
90
+ with open(md_path) as fr:
91
+ true_md = fr.read()
92
+
93
+ conv_result: ConversionResult = converter.convert(md_path)
94
+
95
+ doc: DoclingDocument = conv_result.document
96
+
97
+ pred_md: str = doc.export_to_markdown()
98
+ assert true_md == pred_md
99
+
100
+ conv_result_: ConversionResult = converter.convert_string(
101
+ true_md, format=InputFormat.MD
102
+ )
103
+
104
+ doc_: DoclingDocument = conv_result_.document
105
+
106
+ pred_md_: str = doc_.export_to_markdown()
107
+ assert true_md == pred_md_
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes