docling 2.43.0__tar.gz → 2.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.43.0 → docling-2.44.0}/PKG-INFO +2 -2
- {docling-2.43.0 → docling-2.44.0}/docling/backend/html_backend.py +77 -12
- {docling-2.43.0 → docling-2.44.0}/docling/cli/main.py +6 -0
- {docling-2.43.0 → docling-2.44.0}/docling/document_converter.py +30 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/vlm_models_inline/mlx_model.py +2 -2
- {docling-2.43.0 → docling-2.44.0}/docling.egg-info/PKG-INFO +2 -2
- {docling-2.43.0 → docling-2.44.0}/docling.egg-info/requires.txt +1 -1
- {docling-2.43.0 → docling-2.44.0}/pyproject.toml +2 -2
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_html.py +16 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_markdown.py +52 -1
- {docling-2.43.0 → docling-2.44.0}/LICENSE +0 -0
- {docling-2.43.0 → docling-2.44.0}/README.md +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/md_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/chunking/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/cli/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/cli/models.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/cli/tools.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/document.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/settings.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/exceptions.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/base_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/layout_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/py.typed +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/__init__.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/export.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/locks.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/orientation.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/profiling.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/utils.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling/utils/visualization.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.43.0 → docling-2.44.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.43.0 → docling-2.44.0}/setup.cfg +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_csv.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_jats.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_msword.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_backend_webp.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_cli.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_code_formula.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_input_doc.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_interfaces.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_invalid_input.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_options.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_settings_load.py +0 -0
- {docling-2.43.0 → docling-2.44.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.43.0
|
3
|
+
Version: 2.44.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
58
58
|
Provides-Extra: vlm
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
|
-
Requires-Dist: mlx-vlm<0.
|
61
|
+
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
62
|
Provides-Extra: rapidocr
|
63
63
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
64
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -125,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
125
125
|
# set the title as furniture, since it is part of the document metadata
|
126
126
|
title = self.soup.title
|
127
127
|
if title:
|
128
|
+
title_text = title.get_text(separator=" ", strip=True)
|
129
|
+
title_clean = HTMLDocumentBackend._clean_unicode(title_text)
|
128
130
|
doc.add_title(
|
129
|
-
text=
|
131
|
+
text=title_clean,
|
132
|
+
orig=title_text,
|
130
133
|
content_layer=ContentLayer.FURNITURE,
|
131
134
|
)
|
132
135
|
# remove scripts/styles
|
@@ -168,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
168
171
|
return
|
169
172
|
for part in text.split("\n"):
|
170
173
|
seg = part.strip()
|
174
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
171
175
|
if seg:
|
172
176
|
doc.add_text(
|
173
|
-
DocItemLabel.TEXT,
|
174
|
-
|
177
|
+
label=DocItemLabel.TEXT,
|
178
|
+
text=seg_clean,
|
179
|
+
orig=seg,
|
175
180
|
parent=self.parents[self.level],
|
176
181
|
content_layer=self.content_layer,
|
177
182
|
)
|
@@ -203,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
203
208
|
self.content_layer = ContentLayer.BODY
|
204
209
|
level = int(tag_name[1])
|
205
210
|
text = tag.get_text(strip=True, separator=" ")
|
211
|
+
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
206
212
|
# the first level is for the title item
|
207
213
|
if level == 1:
|
208
214
|
for key in self.parents.keys():
|
209
215
|
self.parents[key] = None
|
210
216
|
self.level = 0
|
211
217
|
self.parents[self.level + 1] = doc.add_title(
|
212
|
-
text, content_layer=self.content_layer
|
218
|
+
text=text_clean, orig=text, content_layer=self.content_layer
|
213
219
|
)
|
214
220
|
# the other levels need to be lowered by 1 if a title was set
|
215
221
|
else:
|
@@ -234,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
234
240
|
self.level = level
|
235
241
|
self.parents[self.level + 1] = doc.add_heading(
|
236
242
|
parent=self.parents[self.level],
|
237
|
-
text=
|
243
|
+
text=text_clean,
|
244
|
+
orig=text,
|
238
245
|
level=self.level,
|
239
246
|
content_layer=self.content_layer,
|
240
247
|
)
|
@@ -296,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
296
303
|
if text_part:
|
297
304
|
parts.append(text_part)
|
298
305
|
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
306
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
299
307
|
|
300
308
|
# 3) add the list item
|
301
309
|
if li_text:
|
302
310
|
self.parents[self.level + 1] = doc.add_list_item(
|
303
|
-
text=
|
311
|
+
text=li_clean,
|
304
312
|
enumerated=is_ordered,
|
305
313
|
marker=marker,
|
314
|
+
orig=li_text,
|
306
315
|
parent=list_group,
|
307
316
|
content_layer=self.content_layer,
|
308
317
|
)
|
@@ -344,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
344
353
|
elif tag_name in {"p", "address", "summary"}:
|
345
354
|
for part in tag.text.split("\n"):
|
346
355
|
seg = part.strip()
|
356
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
347
357
|
if seg:
|
348
358
|
doc.add_text(
|
349
|
-
parent=self.parents[self.level],
|
350
359
|
label=DocItemLabel.TEXT,
|
351
|
-
text=
|
360
|
+
text=seg_clean,
|
361
|
+
orig=seg,
|
362
|
+
parent=self.parents[self.level],
|
352
363
|
content_layer=self.content_layer,
|
353
364
|
)
|
354
365
|
for img_tag in tag("img"):
|
@@ -370,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
370
381
|
elif tag_name in {"pre", "code"}:
|
371
382
|
# handle monospace code snippets (pre).
|
372
383
|
text = tag.get_text(strip=True)
|
384
|
+
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
373
385
|
if text:
|
374
386
|
doc.add_code(
|
375
387
|
parent=self.parents[self.level],
|
376
|
-
text=
|
388
|
+
text=text_clean,
|
389
|
+
orig=text,
|
377
390
|
content_layer=self.content_layer,
|
378
391
|
)
|
379
392
|
|
@@ -402,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
402
415
|
|
403
416
|
caption_item: Optional[TextItem] = None
|
404
417
|
if caption:
|
418
|
+
caption_clean = HTMLDocumentBackend._clean_unicode(caption)
|
405
419
|
caption_item = doc.add_text(
|
406
|
-
DocItemLabel.CAPTION,
|
420
|
+
label=DocItemLabel.CAPTION,
|
421
|
+
text=caption_clean,
|
422
|
+
orig=caption,
|
423
|
+
content_layer=self.content_layer,
|
407
424
|
)
|
408
425
|
|
409
426
|
doc.add_picture(
|
@@ -442,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
442
459
|
|
443
460
|
return "".join(parts)
|
444
461
|
|
462
|
+
@staticmethod
|
463
|
+
def _clean_unicode(text: str) -> str:
|
464
|
+
"""Replace typical Unicode characters in HTML for text processing.
|
465
|
+
|
466
|
+
Several Unicode characters (e.g., non-printable or formatting) are typically
|
467
|
+
found in HTML but are worth replacing to sanitize text and ensure consistency
|
468
|
+
in text processing tasks.
|
469
|
+
|
470
|
+
Args:
|
471
|
+
text: The original text.
|
472
|
+
|
473
|
+
Returns:
|
474
|
+
The sanitized text without typical Unicode characters.
|
475
|
+
"""
|
476
|
+
replacements = {
|
477
|
+
"\u00a0": " ", # non-breaking space
|
478
|
+
"\u200b": "", # zero-width space
|
479
|
+
"\u200c": "", # zero-width non-joiner
|
480
|
+
"\u200d": "", # zero-width joiner
|
481
|
+
"\u2010": "-", # hyphen
|
482
|
+
"\u2011": "-", # non-breaking hyphen
|
483
|
+
"\u2012": "-", # dash
|
484
|
+
"\u2013": "-", # dash
|
485
|
+
"\u2014": "-", # dash
|
486
|
+
"\u2015": "-", # horizontal bar
|
487
|
+
"\u2018": "'", # left single quotation mark
|
488
|
+
"\u2019": "'", # right single quotation mark
|
489
|
+
"\u201c": '"', # left double quotation mark
|
490
|
+
"\u201d": '"', # right double quotation mark
|
491
|
+
"\u2026": "...", # ellipsis
|
492
|
+
"\u00ad": "", # soft hyphen
|
493
|
+
"\ufeff": "", # zero width non-break space
|
494
|
+
"\u202f": " ", # narrow non-break space
|
495
|
+
"\u2060": "", # word joiner
|
496
|
+
}
|
497
|
+
for raw, clean in replacements.items():
|
498
|
+
text = text.replace(raw, clean)
|
499
|
+
|
500
|
+
return text
|
501
|
+
|
445
502
|
@staticmethod
|
446
503
|
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
447
504
|
"""Extract colspan and rowspan values from a table cell tag.
|
@@ -454,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
454
511
|
str(cell.get("colspan", "1")),
|
455
512
|
str(cell.get("rowspan", "1")),
|
456
513
|
)
|
514
|
+
|
515
|
+
def _extract_num(s: str) -> int:
|
516
|
+
if s and s[0].isnumeric():
|
517
|
+
match = re.search(r"\d+", s)
|
518
|
+
if match:
|
519
|
+
return int(match.group())
|
520
|
+
return 1
|
521
|
+
|
457
522
|
int_spans: tuple[int, int] = (
|
458
|
-
|
459
|
-
|
523
|
+
_extract_num(raw_spans[0]),
|
524
|
+
_extract_num(raw_spans[1]),
|
460
525
|
)
|
461
526
|
|
462
527
|
return int_spans
|
@@ -262,6 +262,12 @@ def export_documents(
|
|
262
262
|
|
263
263
|
else:
|
264
264
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
265
|
+
if _log.isEnabledFor(logging.INFO):
|
266
|
+
for err in conv_res.errors:
|
267
|
+
_log.info(
|
268
|
+
f" [Failure Detail] Component: {err.component_type}, "
|
269
|
+
f"Module: {err.module_name}, Message: {err.error_message}"
|
270
|
+
)
|
265
271
|
failure_count += 1
|
266
272
|
|
267
273
|
_log.info(
|
@@ -5,7 +5,9 @@ import threading
|
|
5
5
|
import time
|
6
6
|
from collections.abc import Iterable, Iterator
|
7
7
|
from concurrent.futures import ThreadPoolExecutor
|
8
|
+
from datetime import datetime
|
8
9
|
from functools import partial
|
10
|
+
from io import BytesIO
|
9
11
|
from pathlib import Path
|
10
12
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
11
13
|
|
@@ -275,6 +277,34 @@ class DocumentConverter:
|
|
275
277
|
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
276
278
|
)
|
277
279
|
|
280
|
+
@validate_call(config=ConfigDict(strict=True))
|
281
|
+
def convert_string(
|
282
|
+
self,
|
283
|
+
content: str,
|
284
|
+
format: InputFormat,
|
285
|
+
name: Optional[str],
|
286
|
+
) -> ConversionResult:
|
287
|
+
name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
288
|
+
|
289
|
+
if format == InputFormat.MD:
|
290
|
+
if not name.endswith(".md"):
|
291
|
+
name += ".md"
|
292
|
+
|
293
|
+
buff = BytesIO(content.encode("utf-8"))
|
294
|
+
doc_stream = DocumentStream(name=name, stream=buff)
|
295
|
+
|
296
|
+
return self.convert(doc_stream)
|
297
|
+
elif format == InputFormat.HTML:
|
298
|
+
if not name.endswith(".html"):
|
299
|
+
name += ".html"
|
300
|
+
|
301
|
+
buff = BytesIO(content.encode("utf-8"))
|
302
|
+
doc_stream = DocumentStream(name=name, stream=buff)
|
303
|
+
|
304
|
+
return self.convert(doc_stream)
|
305
|
+
else:
|
306
|
+
raise ValueError(f"format {format} is not supported in `convert_string`")
|
307
|
+
|
278
308
|
def _convert(
|
279
309
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
280
310
|
) -> Iterator[ConversionResult]:
|
@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
35
35
|
|
36
36
|
if self.enabled:
|
37
37
|
try:
|
38
|
-
from mlx_vlm import generate, load # type: ignore
|
38
|
+
from mlx_vlm import generate, load, stream_generate # type: ignore
|
39
39
|
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
40
|
-
from mlx_vlm.utils import load_config
|
40
|
+
from mlx_vlm.utils import load_config # type: ignore
|
41
41
|
except ImportError:
|
42
42
|
raise ImportError(
|
43
43
|
"mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.43.0
|
3
|
+
Version: 2.44.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
58
58
|
Provides-Extra: vlm
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
|
-
Requires-Dist: mlx-vlm<0.
|
61
|
+
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
62
|
Provides-Extra: rapidocr
|
63
63
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
64
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.43.0"
|
3
|
+
version = "2.44.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
license = "MIT"
|
6
6
|
keywords = [
|
@@ -92,7 +92,7 @@ ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
|
|
92
92
|
vlm = [
|
93
93
|
'transformers (>=4.46.0,<5.0.0)',
|
94
94
|
'accelerate (>=1.2.1,<2.0.0)',
|
95
|
-
'mlx-vlm (>=0.1.
|
95
|
+
'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
|
96
96
|
]
|
97
97
|
rapidocr = [
|
98
98
|
'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
|
@@ -100,6 +100,22 @@ def test_ordered_lists():
|
|
100
100
|
assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
|
101
101
|
|
102
102
|
|
103
|
+
def test_unicode_characters():
|
104
|
+
raw_html = "<html><body><h1>Hello World!</h1></body></html>".encode() # noqa: RUF001
|
105
|
+
in_doc = InputDocument(
|
106
|
+
path_or_stream=BytesIO(raw_html),
|
107
|
+
format=InputFormat.HTML,
|
108
|
+
backend=HTMLDocumentBackend,
|
109
|
+
filename="test",
|
110
|
+
)
|
111
|
+
backend = HTMLDocumentBackend(
|
112
|
+
in_doc=in_doc,
|
113
|
+
path_or_stream=BytesIO(raw_html),
|
114
|
+
)
|
115
|
+
doc: DoclingDocument = backend.convert()
|
116
|
+
assert doc.texts[0].text == "Hello World!"
|
117
|
+
|
118
|
+
|
103
119
|
def get_html_paths():
|
104
120
|
# Define the directory you want to search
|
105
121
|
directory = Path("./tests/data/html/")
|
@@ -2,10 +2,19 @@ from pathlib import Path
|
|
2
2
|
|
3
3
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
4
4
|
from docling.datamodel.base_models import InputFormat
|
5
|
-
from docling.datamodel.document import
|
5
|
+
from docling.datamodel.document import (
|
6
|
+
ConversionResult,
|
7
|
+
DoclingDocument,
|
8
|
+
InputDocument,
|
9
|
+
SectionHeaderItem,
|
10
|
+
)
|
11
|
+
from docling.document_converter import DocumentConverter
|
6
12
|
from tests.verify_utils import CONFID_PREC, COORD_PREC
|
7
13
|
|
8
14
|
from .test_data_gen_flag import GEN_TEST_DATA
|
15
|
+
from .verify_utils import verify_document, verify_export
|
16
|
+
|
17
|
+
GENERATE = GEN_TEST_DATA
|
9
18
|
|
10
19
|
|
11
20
|
def test_convert_valid():
|
@@ -54,3 +63,45 @@ def test_convert_valid():
|
|
54
63
|
if in_path.stem in yaml_filter:
|
55
64
|
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
56
65
|
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
|
66
|
+
|
67
|
+
|
68
|
+
def get_md_paths():
|
69
|
+
# Define the directory you want to search
|
70
|
+
directory = Path("./tests/groundtruth/docling_v2")
|
71
|
+
|
72
|
+
# List all MD files in the directory and its subdirectories
|
73
|
+
md_files = sorted(directory.rglob("*.md"))
|
74
|
+
return md_files
|
75
|
+
|
76
|
+
|
77
|
+
def get_converter():
|
78
|
+
converter = DocumentConverter(allowed_formats=[InputFormat.MD])
|
79
|
+
|
80
|
+
return converter
|
81
|
+
|
82
|
+
|
83
|
+
def test_e2e_md_conversions():
|
84
|
+
md_paths = get_md_paths()
|
85
|
+
converter = get_converter()
|
86
|
+
|
87
|
+
for md_path in md_paths:
|
88
|
+
# print(f"converting {md_path}")
|
89
|
+
|
90
|
+
with open(md_path) as fr:
|
91
|
+
true_md = fr.read()
|
92
|
+
|
93
|
+
conv_result: ConversionResult = converter.convert(md_path)
|
94
|
+
|
95
|
+
doc: DoclingDocument = conv_result.document
|
96
|
+
|
97
|
+
pred_md: str = doc.export_to_markdown()
|
98
|
+
assert true_md == pred_md
|
99
|
+
|
100
|
+
conv_result_: ConversionResult = converter.convert_string(
|
101
|
+
true_md, format=InputFormat.MD
|
102
|
+
)
|
103
|
+
|
104
|
+
doc_: DoclingDocument = conv_result_.document
|
105
|
+
|
106
|
+
pred_md_: str = doc_.export_to_markdown()
|
107
|
+
assert true_md == pred_md_
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|