docling 2.42.2__tar.gz → 2.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.42.2 → docling-2.44.0}/PKG-INFO +4 -4
- {docling-2.42.2 → docling-2.44.0}/docling/backend/html_backend.py +78 -18
- {docling-2.42.2 → docling-2.44.0}/docling/backend/md_backend.py +43 -11
- {docling-2.42.2 → docling-2.44.0}/docling/cli/main.py +6 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/pipeline_options.py +15 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/settings.py +7 -12
- {docling-2.42.2 → docling-2.44.0}/docling/document_converter.py +57 -17
- {docling-2.42.2 → docling-2.44.0}/docling/models/layout_model.py +84 -66
- {docling-2.42.2 → docling-2.44.0}/docling/models/vlm_models_inline/mlx_model.py +2 -2
- docling-2.44.0/docling/pipeline/threaded_standard_pdf_pipeline.py +605 -0
- {docling-2.42.2 → docling-2.44.0}/docling.egg-info/PKG-INFO +4 -4
- {docling-2.42.2 → docling-2.44.0}/docling.egg-info/SOURCES.txt +3 -1
- {docling-2.42.2 → docling-2.44.0}/docling.egg-info/requires.txt +3 -3
- {docling-2.42.2 → docling-2.44.0}/pyproject.toml +4 -6
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_html.py +16 -0
- docling-2.44.0/tests/test_backend_markdown.py +107 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_settings_load.py +1 -1
- docling-2.44.0/tests/test_threaded_pipeline.py +176 -0
- docling-2.42.2/tests/test_backend_markdown.py +0 -57
- {docling-2.42.2 → docling-2.44.0}/LICENSE +0 -0
- {docling-2.42.2 → docling-2.44.0}/README.md +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/chunking/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/cli/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/cli/models.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/cli/tools.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/document.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/exceptions.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/base_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/py.typed +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/__init__.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/export.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/locks.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/orientation.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/profiling.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/utils.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling/utils/visualization.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.42.2 → docling-2.44.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.42.2 → docling-2.44.0}/setup.cfg +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_csv.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_jats.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_msword.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_backend_webp.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_cli.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_code_formula.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_input_doc.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_interfaces.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_invalid_input.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.42.2 → docling-2.44.0}/tests/test_options.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.42.2
|
3
|
+
Version: 2.44.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -28,9 +28,9 @@ License-File: LICENSE
|
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
30
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
31
|
-
Requires-Dist: docling-ibm-models<4,>=3.
|
31
|
+
Requires-Dist: docling-ibm-models<4,>=3.9.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
|
-
Requires-Dist: pypdfium2
|
33
|
+
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
35
35
|
Requires-Dist: huggingface_hub<1,>=0.23
|
36
36
|
Requires-Dist: requests<3.0.0,>=2.32.2
|
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
58
58
|
Provides-Extra: vlm
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
|
-
Requires-Dist: mlx-vlm<0.
|
61
|
+
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
62
|
Provides-Extra: rapidocr
|
63
63
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
64
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -1,6 +1,5 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
-
import traceback
|
4
3
|
from io import BytesIO
|
5
4
|
from pathlib import Path
|
6
5
|
from typing import Final, Optional, Union, cast
|
@@ -126,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
126
125
|
# set the title as furniture, since it is part of the document metadata
|
127
126
|
title = self.soup.title
|
128
127
|
if title:
|
128
|
+
title_text = title.get_text(separator=" ", strip=True)
|
129
|
+
title_clean = HTMLDocumentBackend._clean_unicode(title_text)
|
129
130
|
doc.add_title(
|
130
|
-
text=
|
131
|
+
text=title_clean,
|
132
|
+
orig=title_text,
|
131
133
|
content_layer=ContentLayer.FURNITURE,
|
132
134
|
)
|
133
135
|
# remove scripts/styles
|
@@ -144,11 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
144
146
|
)
|
145
147
|
# reset context
|
146
148
|
self.ctx = _Context()
|
147
|
-
|
148
|
-
try:
|
149
|
-
self._walk(content, doc)
|
150
|
-
except Exception:
|
151
|
-
print(traceback.format_exc())
|
149
|
+
self._walk(content, doc)
|
152
150
|
|
153
151
|
return doc
|
154
152
|
|
@@ -173,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
173
171
|
return
|
174
172
|
for part in text.split("\n"):
|
175
173
|
seg = part.strip()
|
174
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
176
175
|
if seg:
|
177
176
|
doc.add_text(
|
178
|
-
DocItemLabel.TEXT,
|
179
|
-
|
177
|
+
label=DocItemLabel.TEXT,
|
178
|
+
text=seg_clean,
|
179
|
+
orig=seg,
|
180
180
|
parent=self.parents[self.level],
|
181
181
|
content_layer=self.content_layer,
|
182
182
|
)
|
@@ -208,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
208
208
|
self.content_layer = ContentLayer.BODY
|
209
209
|
level = int(tag_name[1])
|
210
210
|
text = tag.get_text(strip=True, separator=" ")
|
211
|
+
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
211
212
|
# the first level is for the title item
|
212
213
|
if level == 1:
|
213
214
|
for key in self.parents.keys():
|
214
215
|
self.parents[key] = None
|
215
216
|
self.level = 0
|
216
217
|
self.parents[self.level + 1] = doc.add_title(
|
217
|
-
text, content_layer=self.content_layer
|
218
|
+
text=text_clean, orig=text, content_layer=self.content_layer
|
218
219
|
)
|
219
220
|
# the other levels need to be lowered by 1 if a title was set
|
220
221
|
else:
|
@@ -239,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
239
240
|
self.level = level
|
240
241
|
self.parents[self.level + 1] = doc.add_heading(
|
241
242
|
parent=self.parents[self.level],
|
242
|
-
text=
|
243
|
+
text=text_clean,
|
244
|
+
orig=text,
|
243
245
|
level=self.level,
|
244
246
|
content_layer=self.content_layer,
|
245
247
|
)
|
@@ -301,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
301
303
|
if text_part:
|
302
304
|
parts.append(text_part)
|
303
305
|
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
306
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
304
307
|
|
305
308
|
# 3) add the list item
|
306
309
|
if li_text:
|
307
310
|
self.parents[self.level + 1] = doc.add_list_item(
|
308
|
-
text=
|
311
|
+
text=li_clean,
|
309
312
|
enumerated=is_ordered,
|
310
313
|
marker=marker,
|
314
|
+
orig=li_text,
|
311
315
|
parent=list_group,
|
312
316
|
content_layer=self.content_layer,
|
313
317
|
)
|
@@ -349,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
349
353
|
elif tag_name in {"p", "address", "summary"}:
|
350
354
|
for part in tag.text.split("\n"):
|
351
355
|
seg = part.strip()
|
356
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
352
357
|
if seg:
|
353
358
|
doc.add_text(
|
354
|
-
parent=self.parents[self.level],
|
355
359
|
label=DocItemLabel.TEXT,
|
356
|
-
text=
|
360
|
+
text=seg_clean,
|
361
|
+
orig=seg,
|
362
|
+
parent=self.parents[self.level],
|
357
363
|
content_layer=self.content_layer,
|
358
364
|
)
|
359
365
|
for img_tag in tag("img"):
|
@@ -375,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
375
381
|
elif tag_name in {"pre", "code"}:
|
376
382
|
# handle monospace code snippets (pre).
|
377
383
|
text = tag.get_text(strip=True)
|
384
|
+
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
378
385
|
if text:
|
379
386
|
doc.add_code(
|
380
387
|
parent=self.parents[self.level],
|
381
|
-
text=
|
388
|
+
text=text_clean,
|
389
|
+
orig=text,
|
382
390
|
content_layer=self.content_layer,
|
383
391
|
)
|
384
392
|
|
@@ -407,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
407
415
|
|
408
416
|
caption_item: Optional[TextItem] = None
|
409
417
|
if caption:
|
418
|
+
caption_clean = HTMLDocumentBackend._clean_unicode(caption)
|
410
419
|
caption_item = doc.add_text(
|
411
|
-
DocItemLabel.CAPTION,
|
420
|
+
label=DocItemLabel.CAPTION,
|
421
|
+
text=caption_clean,
|
422
|
+
orig=caption,
|
423
|
+
content_layer=self.content_layer,
|
412
424
|
)
|
413
425
|
|
414
426
|
doc.add_picture(
|
@@ -447,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
447
459
|
|
448
460
|
return "".join(parts)
|
449
461
|
|
462
|
+
@staticmethod
|
463
|
+
def _clean_unicode(text: str) -> str:
|
464
|
+
"""Replace typical Unicode characters in HTML for text processing.
|
465
|
+
|
466
|
+
Several Unicode characters (e.g., non-printable or formatting) are typically
|
467
|
+
found in HTML but are worth replacing to sanitize text and ensure consistency
|
468
|
+
in text processing tasks.
|
469
|
+
|
470
|
+
Args:
|
471
|
+
text: The original text.
|
472
|
+
|
473
|
+
Returns:
|
474
|
+
The sanitized text without typical Unicode characters.
|
475
|
+
"""
|
476
|
+
replacements = {
|
477
|
+
"\u00a0": " ", # non-breaking space
|
478
|
+
"\u200b": "", # zero-width space
|
479
|
+
"\u200c": "", # zero-width non-joiner
|
480
|
+
"\u200d": "", # zero-width joiner
|
481
|
+
"\u2010": "-", # hyphen
|
482
|
+
"\u2011": "-", # non-breaking hyphen
|
483
|
+
"\u2012": "-", # dash
|
484
|
+
"\u2013": "-", # dash
|
485
|
+
"\u2014": "-", # dash
|
486
|
+
"\u2015": "-", # horizontal bar
|
487
|
+
"\u2018": "'", # left single quotation mark
|
488
|
+
"\u2019": "'", # right single quotation mark
|
489
|
+
"\u201c": '"', # left double quotation mark
|
490
|
+
"\u201d": '"', # right double quotation mark
|
491
|
+
"\u2026": "...", # ellipsis
|
492
|
+
"\u00ad": "", # soft hyphen
|
493
|
+
"\ufeff": "", # zero width non-break space
|
494
|
+
"\u202f": " ", # narrow non-break space
|
495
|
+
"\u2060": "", # word joiner
|
496
|
+
}
|
497
|
+
for raw, clean in replacements.items():
|
498
|
+
text = text.replace(raw, clean)
|
499
|
+
|
500
|
+
return text
|
501
|
+
|
450
502
|
@staticmethod
|
451
503
|
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
452
504
|
"""Extract colspan and rowspan values from a table cell tag.
|
@@ -459,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
459
511
|
str(cell.get("colspan", "1")),
|
460
512
|
str(cell.get("rowspan", "1")),
|
461
513
|
)
|
514
|
+
|
515
|
+
def _extract_num(s: str) -> int:
|
516
|
+
if s and s[0].isnumeric():
|
517
|
+
match = re.search(r"\d+", s)
|
518
|
+
if match:
|
519
|
+
return int(match.group())
|
520
|
+
return 1
|
521
|
+
|
462
522
|
int_spans: tuple[int, int] = (
|
463
|
-
|
464
|
-
|
523
|
+
_extract_num(raw_spans[0]),
|
524
|
+
_extract_num(raw_spans[1]),
|
465
525
|
)
|
466
526
|
|
467
527
|
return int_spans
|
@@ -5,7 +5,7 @@ from copy import deepcopy
|
|
5
5
|
from enum import Enum
|
6
6
|
from io import BytesIO
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import
|
8
|
+
from typing import Literal, Optional, Union, cast
|
9
9
|
|
10
10
|
import marko
|
11
11
|
import marko.element
|
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
|
|
14
14
|
DocItemLabel,
|
15
15
|
DoclingDocument,
|
16
16
|
DocumentOrigin,
|
17
|
+
ListItem,
|
17
18
|
NodeItem,
|
18
19
|
TableCell,
|
19
20
|
TableData,
|
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
89
90
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
90
91
|
super().__init__(in_doc, path_or_stream)
|
91
92
|
|
92
|
-
_log.debug("
|
93
|
+
_log.debug("Starting MarkdownDocumentBackend...")
|
93
94
|
|
94
95
|
# Markdown file:
|
95
96
|
self.path_or_stream = path_or_stream
|
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
131
132
|
for md_table_row in self.md_table_buffer:
|
132
133
|
_log.debug(md_table_row)
|
133
134
|
_log.debug("=== TABLE END ===")
|
134
|
-
tcells:
|
135
|
+
tcells: list[TableCell] = []
|
135
136
|
result_table = []
|
136
137
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
137
138
|
data = []
|
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
232
233
|
element: marko.element.Element,
|
233
234
|
depth: int,
|
234
235
|
doc: DoclingDocument,
|
235
|
-
visited:
|
236
|
+
visited: set[marko.element.Element],
|
236
237
|
creation_stack: list[
|
237
238
|
_CreationPayload
|
238
239
|
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
239
240
|
list_ordered_flag_by_ref: dict[str, bool],
|
241
|
+
list_last_item_by_ref: dict[str, ListItem],
|
240
242
|
parent_item: Optional[NodeItem] = None,
|
241
243
|
formatting: Optional[Formatting] = None,
|
242
244
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
279
281
|
|
280
282
|
elif (
|
281
283
|
isinstance(element, marko.block.ListItem)
|
282
|
-
and len(element.children)
|
284
|
+
and len(element.children) > 0
|
283
285
|
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
284
286
|
and len(child.children) > 0
|
285
287
|
):
|
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
291
293
|
if parent_item
|
292
294
|
else False
|
293
295
|
)
|
294
|
-
|
296
|
+
non_list_children: list[marko.element.Element] = [
|
297
|
+
item
|
298
|
+
for item in child.children
|
299
|
+
if not isinstance(item, marko.block.ListItem)
|
300
|
+
]
|
301
|
+
if len(non_list_children) > 1: # inline group will be created further down
|
302
|
+
parent_ref: Optional[str] = (
|
303
|
+
parent_item.self_ref if parent_item else None
|
304
|
+
)
|
295
305
|
parent_item = self._create_list_item(
|
296
306
|
doc=doc,
|
297
307
|
parent_item=parent_item,
|
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
300
310
|
formatting=formatting,
|
301
311
|
hyperlink=hyperlink,
|
302
312
|
)
|
313
|
+
if parent_ref:
|
314
|
+
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
|
303
315
|
else:
|
304
316
|
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
305
317
|
|
@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
334
346
|
element.dest
|
335
347
|
)
|
336
348
|
|
337
|
-
elif isinstance(element, marko.inline.RawText):
|
338
|
-
_log.debug(f" -
|
339
|
-
snippet_text =
|
349
|
+
elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
|
350
|
+
_log.debug(f" - RawText/Literal: {element.children}")
|
351
|
+
snippet_text = (
|
352
|
+
element.children.strip() if isinstance(element.children, str) else ""
|
353
|
+
)
|
340
354
|
# Detect start of the table:
|
341
355
|
if "|" in snippet_text or self.in_table:
|
342
356
|
# most likely part of the markdown table
|
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
359
373
|
if parent_item
|
360
374
|
else False
|
361
375
|
)
|
376
|
+
parent_ref = parent_item.self_ref if parent_item else None
|
362
377
|
parent_item = self._create_list_item(
|
363
378
|
doc=doc,
|
364
379
|
parent_item=parent_item,
|
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
367
382
|
formatting=formatting,
|
368
383
|
hyperlink=hyperlink,
|
369
384
|
)
|
385
|
+
if parent_ref:
|
386
|
+
list_last_item_by_ref[parent_ref] = cast(
|
387
|
+
ListItem, parent_item
|
388
|
+
)
|
389
|
+
|
370
390
|
elif isinstance(to_create, _HeadingCreationPayload):
|
371
391
|
# not keeping as parent_item as logic for correctly tracking
|
372
392
|
# that not implemented yet (section components not captured
|
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
458
478
|
element, processed_block_types
|
459
479
|
):
|
460
480
|
for child in element.children:
|
481
|
+
if (
|
482
|
+
isinstance(element, marko.block.ListItem)
|
483
|
+
and isinstance(child, marko.block.List)
|
484
|
+
and parent_item
|
485
|
+
and list_last_item_by_ref.get(parent_item.self_ref, None)
|
486
|
+
):
|
487
|
+
_log.debug(
|
488
|
+
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
|
489
|
+
)
|
490
|
+
parent_item = list_last_item_by_ref[parent_item.self_ref]
|
491
|
+
|
461
492
|
self._iterate_elements(
|
462
493
|
element=child,
|
463
494
|
depth=depth + 1,
|
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
465
496
|
visited=visited,
|
466
497
|
creation_stack=creation_stack,
|
467
498
|
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
499
|
+
list_last_item_by_ref=list_last_item_by_ref,
|
468
500
|
parent_item=parent_item,
|
469
501
|
formatting=formatting,
|
470
502
|
hyperlink=hyperlink,
|
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
483
515
|
return False
|
484
516
|
|
485
517
|
@classmethod
|
486
|
-
def supported_formats(cls) ->
|
518
|
+
def supported_formats(cls) -> set[InputFormat]:
|
487
519
|
return {InputFormat.MD}
|
488
520
|
|
489
521
|
def convert(self) -> DoclingDocument:
|
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
510
542
|
visited=set(),
|
511
543
|
creation_stack=[],
|
512
544
|
list_ordered_flag_by_ref={},
|
545
|
+
list_last_item_by_ref={},
|
513
546
|
)
|
514
547
|
self._close_table(doc=doc) # handle any last hanging table
|
515
548
|
|
@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
534
567
|
]:
|
535
568
|
html_str = _restore_original_html(txt=html_str, regex=regex)
|
536
569
|
self._html_blocks = 0
|
537
|
-
|
538
570
|
# delegate to HTML backend
|
539
571
|
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
540
572
|
in_doc = InputDocument(
|
@@ -262,6 +262,12 @@ def export_documents(
|
|
262
262
|
|
263
263
|
else:
|
264
264
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
265
|
+
if _log.isEnabledFor(logging.INFO):
|
266
|
+
for err in conv_res.errors:
|
267
|
+
_log.info(
|
268
|
+
f" [Failure Detail] Component: {err.component_type}, "
|
269
|
+
f"Module: {err.module_name}, Message: {err.error_message}"
|
270
|
+
)
|
265
271
|
failure_count += 1
|
266
272
|
|
267
273
|
_log.info(
|
@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
|
|
332
332
|
STANDARD = "standard"
|
333
333
|
VLM = "vlm"
|
334
334
|
ASR = "asr"
|
335
|
+
|
336
|
+
|
337
|
+
class ThreadedPdfPipelineOptions(PdfPipelineOptions):
|
338
|
+
"""Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
|
339
|
+
|
340
|
+
# Batch sizes for different stages
|
341
|
+
ocr_batch_size: int = 4
|
342
|
+
layout_batch_size: int = 4
|
343
|
+
table_batch_size: int = 4
|
344
|
+
|
345
|
+
# Timing control
|
346
|
+
batch_timeout_seconds: float = 2.0
|
347
|
+
|
348
|
+
# Backpressure and queue control
|
349
|
+
queue_max_size: int = 100
|
@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):
|
|
26
26
|
|
27
27
|
|
28
28
|
class BatchConcurrencySettings(BaseModel):
|
29
|
-
doc_batch_size: int =
|
30
|
-
doc_batch_concurrency: int =
|
31
|
-
page_batch_size: int = 4
|
32
|
-
page_batch_concurrency: int =
|
33
|
-
elements_batch_size: int =
|
34
|
-
|
35
|
-
|
36
|
-
# doc_batch_concurrency: int = 1
|
37
|
-
# page_batch_size: int = 1
|
38
|
-
# page_batch_concurrency: int = 1
|
39
|
-
|
40
|
-
# model_concurrency: int = 2
|
29
|
+
doc_batch_size: int = 1 # Number of documents processed in one batch. Should be >= doc_batch_concurrency
|
30
|
+
doc_batch_concurrency: int = 1 # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
|
31
|
+
page_batch_size: int = 4 # Number of pages processed in one batch.
|
32
|
+
page_batch_concurrency: int = 1 # Currently unused.
|
33
|
+
elements_batch_size: int = (
|
34
|
+
16 # Number of elements processed in one batch, in enrichment models.
|
35
|
+
)
|
41
36
|
|
42
37
|
# To force models into single core: export OMP_NUM_THREADS=1
|
43
38
|
|
@@ -4,7 +4,10 @@ import sys
|
|
4
4
|
import threading
|
5
5
|
import time
|
6
6
|
from collections.abc import Iterable, Iterator
|
7
|
+
from concurrent.futures import ThreadPoolExecutor
|
8
|
+
from datetime import datetime
|
7
9
|
from functools import partial
|
10
|
+
from io import BytesIO
|
8
11
|
from pathlib import Path
|
9
12
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
10
13
|
|
@@ -274,6 +277,34 @@ class DocumentConverter:
|
|
274
277
|
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
275
278
|
)
|
276
279
|
|
280
|
+
@validate_call(config=ConfigDict(strict=True))
|
281
|
+
def convert_string(
|
282
|
+
self,
|
283
|
+
content: str,
|
284
|
+
format: InputFormat,
|
285
|
+
name: Optional[str],
|
286
|
+
) -> ConversionResult:
|
287
|
+
name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
288
|
+
|
289
|
+
if format == InputFormat.MD:
|
290
|
+
if not name.endswith(".md"):
|
291
|
+
name += ".md"
|
292
|
+
|
293
|
+
buff = BytesIO(content.encode("utf-8"))
|
294
|
+
doc_stream = DocumentStream(name=name, stream=buff)
|
295
|
+
|
296
|
+
return self.convert(doc_stream)
|
297
|
+
elif format == InputFormat.HTML:
|
298
|
+
if not name.endswith(".html"):
|
299
|
+
name += ".html"
|
300
|
+
|
301
|
+
buff = BytesIO(content.encode("utf-8"))
|
302
|
+
doc_stream = DocumentStream(name=name, stream=buff)
|
303
|
+
|
304
|
+
return self.convert(doc_stream)
|
305
|
+
else:
|
306
|
+
raise ValueError(f"format {format} is not supported in `convert_string`")
|
307
|
+
|
277
308
|
def _convert(
|
278
309
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
279
310
|
) -> Iterator[ConversionResult]:
|
@@ -284,24 +315,33 @@ class DocumentConverter:
|
|
284
315
|
settings.perf.doc_batch_size, # pass format_options
|
285
316
|
):
|
286
317
|
_log.info("Going to convert document batch...")
|
318
|
+
process_func = partial(
|
319
|
+
self._process_document, raises_on_error=raises_on_error
|
320
|
+
)
|
287
321
|
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
# ) as pool:
|
292
|
-
# yield from pool.map(self.process_document, input_batch)
|
293
|
-
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
294
|
-
|
295
|
-
for item in map(
|
296
|
-
partial(self._process_document, raises_on_error=raises_on_error),
|
297
|
-
input_batch,
|
322
|
+
if (
|
323
|
+
settings.perf.doc_batch_concurrency > 1
|
324
|
+
and settings.perf.doc_batch_size > 1
|
298
325
|
):
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
326
|
+
with ThreadPoolExecutor(
|
327
|
+
max_workers=settings.perf.doc_batch_concurrency
|
328
|
+
) as pool:
|
329
|
+
for item in pool.map(
|
330
|
+
process_func,
|
331
|
+
input_batch,
|
332
|
+
):
|
333
|
+
yield item
|
334
|
+
else:
|
335
|
+
for item in map(
|
336
|
+
process_func,
|
337
|
+
input_batch,
|
338
|
+
):
|
339
|
+
elapsed = time.monotonic() - start_time
|
340
|
+
start_time = time.monotonic()
|
341
|
+
_log.info(
|
342
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
343
|
+
)
|
344
|
+
yield item
|
305
345
|
|
306
346
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
307
347
|
"""Retrieve or initialize a pipeline, reusing instances based on class and options."""
|
@@ -330,7 +370,7 @@ class DocumentConverter:
|
|
330
370
|
f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
331
371
|
)
|
332
372
|
|
333
|
-
|
373
|
+
return self.initialized_pipelines[cache_key]
|
334
374
|
|
335
375
|
def _process_document(
|
336
376
|
self, in_doc: InputDocument, raises_on_error: bool
|