docling 2.43.0__py3-none-any.whl → 2.44.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +77 -12
- docling/cli/main.py +6 -0
- docling/document_converter.py +30 -0
- docling/models/vlm_models_inline/mlx_model.py +2 -2
- {docling-2.43.0.dist-info → docling-2.44.0.dist-info}/METADATA +2 -2
- {docling-2.43.0.dist-info → docling-2.44.0.dist-info}/RECORD +10 -10
- {docling-2.43.0.dist-info → docling-2.44.0.dist-info}/WHEEL +0 -0
- {docling-2.43.0.dist-info → docling-2.44.0.dist-info}/entry_points.txt +0 -0
- {docling-2.43.0.dist-info → docling-2.44.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.43.0.dist-info → docling-2.44.0.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -125,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
125
125
|
# set the title as furniture, since it is part of the document metadata
|
126
126
|
title = self.soup.title
|
127
127
|
if title:
|
128
|
+
title_text = title.get_text(separator=" ", strip=True)
|
129
|
+
title_clean = HTMLDocumentBackend._clean_unicode(title_text)
|
128
130
|
doc.add_title(
|
129
|
-
text=
|
131
|
+
text=title_clean,
|
132
|
+
orig=title_text,
|
130
133
|
content_layer=ContentLayer.FURNITURE,
|
131
134
|
)
|
132
135
|
# remove scripts/styles
|
@@ -168,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
168
171
|
return
|
169
172
|
for part in text.split("\n"):
|
170
173
|
seg = part.strip()
|
174
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
171
175
|
if seg:
|
172
176
|
doc.add_text(
|
173
|
-
DocItemLabel.TEXT,
|
174
|
-
|
177
|
+
label=DocItemLabel.TEXT,
|
178
|
+
text=seg_clean,
|
179
|
+
orig=seg,
|
175
180
|
parent=self.parents[self.level],
|
176
181
|
content_layer=self.content_layer,
|
177
182
|
)
|
@@ -203,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
203
208
|
self.content_layer = ContentLayer.BODY
|
204
209
|
level = int(tag_name[1])
|
205
210
|
text = tag.get_text(strip=True, separator=" ")
|
211
|
+
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
206
212
|
# the first level is for the title item
|
207
213
|
if level == 1:
|
208
214
|
for key in self.parents.keys():
|
209
215
|
self.parents[key] = None
|
210
216
|
self.level = 0
|
211
217
|
self.parents[self.level + 1] = doc.add_title(
|
212
|
-
text, content_layer=self.content_layer
|
218
|
+
text=text_clean, orig=text, content_layer=self.content_layer
|
213
219
|
)
|
214
220
|
# the other levels need to be lowered by 1 if a title was set
|
215
221
|
else:
|
@@ -234,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
234
240
|
self.level = level
|
235
241
|
self.parents[self.level + 1] = doc.add_heading(
|
236
242
|
parent=self.parents[self.level],
|
237
|
-
text=
|
243
|
+
text=text_clean,
|
244
|
+
orig=text,
|
238
245
|
level=self.level,
|
239
246
|
content_layer=self.content_layer,
|
240
247
|
)
|
@@ -296,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
296
303
|
if text_part:
|
297
304
|
parts.append(text_part)
|
298
305
|
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
306
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
299
307
|
|
300
308
|
# 3) add the list item
|
301
309
|
if li_text:
|
302
310
|
self.parents[self.level + 1] = doc.add_list_item(
|
303
|
-
text=
|
311
|
+
text=li_clean,
|
304
312
|
enumerated=is_ordered,
|
305
313
|
marker=marker,
|
314
|
+
orig=li_text,
|
306
315
|
parent=list_group,
|
307
316
|
content_layer=self.content_layer,
|
308
317
|
)
|
@@ -344,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
344
353
|
elif tag_name in {"p", "address", "summary"}:
|
345
354
|
for part in tag.text.split("\n"):
|
346
355
|
seg = part.strip()
|
356
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
347
357
|
if seg:
|
348
358
|
doc.add_text(
|
349
|
-
parent=self.parents[self.level],
|
350
359
|
label=DocItemLabel.TEXT,
|
351
|
-
text=
|
360
|
+
text=seg_clean,
|
361
|
+
orig=seg,
|
362
|
+
parent=self.parents[self.level],
|
352
363
|
content_layer=self.content_layer,
|
353
364
|
)
|
354
365
|
for img_tag in tag("img"):
|
@@ -370,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
370
381
|
elif tag_name in {"pre", "code"}:
|
371
382
|
# handle monospace code snippets (pre).
|
372
383
|
text = tag.get_text(strip=True)
|
384
|
+
text_clean = HTMLDocumentBackend._clean_unicode(text)
|
373
385
|
if text:
|
374
386
|
doc.add_code(
|
375
387
|
parent=self.parents[self.level],
|
376
|
-
text=
|
388
|
+
text=text_clean,
|
389
|
+
orig=text,
|
377
390
|
content_layer=self.content_layer,
|
378
391
|
)
|
379
392
|
|
@@ -402,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
402
415
|
|
403
416
|
caption_item: Optional[TextItem] = None
|
404
417
|
if caption:
|
418
|
+
caption_clean = HTMLDocumentBackend._clean_unicode(caption)
|
405
419
|
caption_item = doc.add_text(
|
406
|
-
DocItemLabel.CAPTION,
|
420
|
+
label=DocItemLabel.CAPTION,
|
421
|
+
text=caption_clean,
|
422
|
+
orig=caption,
|
423
|
+
content_layer=self.content_layer,
|
407
424
|
)
|
408
425
|
|
409
426
|
doc.add_picture(
|
@@ -442,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
442
459
|
|
443
460
|
return "".join(parts)
|
444
461
|
|
462
|
+
@staticmethod
|
463
|
+
def _clean_unicode(text: str) -> str:
|
464
|
+
"""Replace typical Unicode characters in HTML for text processing.
|
465
|
+
|
466
|
+
Several Unicode characters (e.g., non-printable or formatting) are typically
|
467
|
+
found in HTML but are worth replacing to sanitize text and ensure consistency
|
468
|
+
in text processing tasks.
|
469
|
+
|
470
|
+
Args:
|
471
|
+
text: The original text.
|
472
|
+
|
473
|
+
Returns:
|
474
|
+
The sanitized text without typical Unicode characters.
|
475
|
+
"""
|
476
|
+
replacements = {
|
477
|
+
"\u00a0": " ", # non-breaking space
|
478
|
+
"\u200b": "", # zero-width space
|
479
|
+
"\u200c": "", # zero-width non-joiner
|
480
|
+
"\u200d": "", # zero-width joiner
|
481
|
+
"\u2010": "-", # hyphen
|
482
|
+
"\u2011": "-", # non-breaking hyphen
|
483
|
+
"\u2012": "-", # dash
|
484
|
+
"\u2013": "-", # dash
|
485
|
+
"\u2014": "-", # dash
|
486
|
+
"\u2015": "-", # horizontal bar
|
487
|
+
"\u2018": "'", # left single quotation mark
|
488
|
+
"\u2019": "'", # right single quotation mark
|
489
|
+
"\u201c": '"', # left double quotation mark
|
490
|
+
"\u201d": '"', # right double quotation mark
|
491
|
+
"\u2026": "...", # ellipsis
|
492
|
+
"\u00ad": "", # soft hyphen
|
493
|
+
"\ufeff": "", # zero width non-break space
|
494
|
+
"\u202f": " ", # narrow non-break space
|
495
|
+
"\u2060": "", # word joiner
|
496
|
+
}
|
497
|
+
for raw, clean in replacements.items():
|
498
|
+
text = text.replace(raw, clean)
|
499
|
+
|
500
|
+
return text
|
501
|
+
|
445
502
|
@staticmethod
|
446
503
|
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
447
504
|
"""Extract colspan and rowspan values from a table cell tag.
|
@@ -454,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
454
511
|
str(cell.get("colspan", "1")),
|
455
512
|
str(cell.get("rowspan", "1")),
|
456
513
|
)
|
514
|
+
|
515
|
+
def _extract_num(s: str) -> int:
|
516
|
+
if s and s[0].isnumeric():
|
517
|
+
match = re.search(r"\d+", s)
|
518
|
+
if match:
|
519
|
+
return int(match.group())
|
520
|
+
return 1
|
521
|
+
|
457
522
|
int_spans: tuple[int, int] = (
|
458
|
-
|
459
|
-
|
523
|
+
_extract_num(raw_spans[0]),
|
524
|
+
_extract_num(raw_spans[1]),
|
460
525
|
)
|
461
526
|
|
462
527
|
return int_spans
|
docling/cli/main.py
CHANGED
@@ -262,6 +262,12 @@ def export_documents(
|
|
262
262
|
|
263
263
|
else:
|
264
264
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
265
|
+
if _log.isEnabledFor(logging.INFO):
|
266
|
+
for err in conv_res.errors:
|
267
|
+
_log.info(
|
268
|
+
f" [Failure Detail] Component: {err.component_type}, "
|
269
|
+
f"Module: {err.module_name}, Message: {err.error_message}"
|
270
|
+
)
|
265
271
|
failure_count += 1
|
266
272
|
|
267
273
|
_log.info(
|
docling/document_converter.py
CHANGED
@@ -5,7 +5,9 @@ import threading
|
|
5
5
|
import time
|
6
6
|
from collections.abc import Iterable, Iterator
|
7
7
|
from concurrent.futures import ThreadPoolExecutor
|
8
|
+
from datetime import datetime
|
8
9
|
from functools import partial
|
10
|
+
from io import BytesIO
|
9
11
|
from pathlib import Path
|
10
12
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
11
13
|
|
@@ -275,6 +277,34 @@ class DocumentConverter:
|
|
275
277
|
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
276
278
|
)
|
277
279
|
|
280
|
+
@validate_call(config=ConfigDict(strict=True))
def convert_string(
    self,
    content: str,
    format: InputFormat,
    name: Optional[str] = None,
) -> ConversionResult:
    """Convert Markdown or HTML text held in memory.

    The text is encoded as UTF-8, wrapped in a ``DocumentStream`` and handed
    to ``convert``.

    Args:
        content: The document text.
        format: Format of ``content``. Only ``InputFormat.MD`` and
            ``InputFormat.HTML`` are accepted.
        name: Name given to the stream. When omitted or empty, a timestamp
            of the form ``YYYY-MM-DD_HH-MM-SS`` is used. The extension
            matching ``format`` is appended if the name does not already
            end with it.

    Returns:
        The result of ``convert`` for the constructed stream.

    Raises:
        ValueError: If ``format`` is neither Markdown nor HTML.
    """
    if format == InputFormat.MD:
        suffix = ".md"
    elif format == InputFormat.HTML:
        suffix = ".html"
    else:
        raise ValueError(f"format {format} is not supported in `convert_string`")

    name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if not name.endswith(suffix):
        name += suffix

    buff = BytesIO(content.encode("utf-8"))
    doc_stream = DocumentStream(name=name, stream=buff)

    return self.convert(doc_stream)
|
307
|
+
|
278
308
|
def _convert(
|
279
309
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
280
310
|
) -> Iterator[ConversionResult]:
|
docling/models/vlm_models_inline/mlx_model.py
CHANGED
@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
35
35
|
|
36
36
|
if self.enabled:
|
37
37
|
try:
|
38
|
-
from mlx_vlm import generate, load # type: ignore
|
38
|
+
from mlx_vlm import generate, load, stream_generate # type: ignore
|
39
39
|
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
40
|
-
from mlx_vlm.utils import load_config
|
40
|
+
from mlx_vlm.utils import load_config # type: ignore
|
41
41
|
except ImportError:
|
42
42
|
raise ImportError(
|
43
43
|
"mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
|
{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.43.0
|
3
|
+
Version: 2.44.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
58
58
|
Provides-Extra: vlm
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
|
-
Requires-Dist: mlx-vlm<0.
|
61
|
+
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
62
|
Provides-Extra: rapidocr
|
63
63
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
64
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/document_converter.py,sha256=
|
2
|
+
docling/document_converter.py,sha256=l4b9m9NcbnwzXNNvf777nszyXznQJiaTXyIl_WehkyQ,15724
|
3
3
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
4
4
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
5
5
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
|
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
11
|
docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
|
12
|
-
docling/backend/html_backend.py,sha256=
|
12
|
+
docling/backend/html_backend.py,sha256=0_l-I9gBAs0HKU3yKLQ3OqyYgB3V48hInv42GudnSjA,22856
|
13
13
|
docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
|
14
14
|
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
15
15
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
@@ -28,7 +28,7 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
|
|
28
28
|
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
29
29
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
30
30
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
docling/cli/main.py,sha256=
|
31
|
+
docling/cli/main.py,sha256=rXWR2QJFLeHLPWkMsLXvsVblX-KOXwbM8r0ku80KU5Q,29925
|
32
32
|
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
33
33
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
34
34
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -71,7 +71,7 @@ docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
71
71
|
docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
|
72
72
|
docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
73
73
|
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6DQMLtCQQW3-YUPDMbgeD2tjfM8vLM,8415
|
74
|
-
docling/models/vlm_models_inline/mlx_model.py,sha256=
|
74
|
+
docling/models/vlm_models_inline/mlx_model.py,sha256=tqbJ8tmf2VBDuMLYIv9s1Ysn3G831k2uE_PdOv0kCaE,5948
|
75
75
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
76
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
77
77
|
docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
|
@@ -92,9 +92,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
92
92
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
93
93
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
94
94
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
95
|
-
docling-2.
|
96
|
-
docling-2.
|
97
|
-
docling-2.
|
98
|
-
docling-2.
|
99
|
-
docling-2.
|
100
|
-
docling-2.
|
95
|
+
docling-2.44.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
96
|
+
docling-2.44.0.dist-info/METADATA,sha256=SjD3EXlvgfyXIo8YoeldcAFX0r_nbJszp7VPoMLPFBk,10459
|
97
|
+
docling-2.44.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
98
|
+
docling-2.44.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
99
|
+
docling-2.44.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
100
|
+
docling-2.44.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|