docling 2.43.0__py3-none-any.whl → 2.44.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -125,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
125
125
  # set the title as furniture, since it is part of the document metadata
126
126
  title = self.soup.title
127
127
  if title:
128
+ title_text = title.get_text(separator=" ", strip=True)
129
+ title_clean = HTMLDocumentBackend._clean_unicode(title_text)
128
130
  doc.add_title(
129
- text=title.get_text(separator=" ", strip=True),
131
+ text=title_clean,
132
+ orig=title_text,
130
133
  content_layer=ContentLayer.FURNITURE,
131
134
  )
132
135
  # remove scripts/styles
@@ -168,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
168
171
  return
169
172
  for part in text.split("\n"):
170
173
  seg = part.strip()
174
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
171
175
  if seg:
172
176
  doc.add_text(
173
- DocItemLabel.TEXT,
174
- seg,
177
+ label=DocItemLabel.TEXT,
178
+ text=seg_clean,
179
+ orig=seg,
175
180
  parent=self.parents[self.level],
176
181
  content_layer=self.content_layer,
177
182
  )
@@ -203,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
203
208
  self.content_layer = ContentLayer.BODY
204
209
  level = int(tag_name[1])
205
210
  text = tag.get_text(strip=True, separator=" ")
211
+ text_clean = HTMLDocumentBackend._clean_unicode(text)
206
212
  # the first level is for the title item
207
213
  if level == 1:
208
214
  for key in self.parents.keys():
209
215
  self.parents[key] = None
210
216
  self.level = 0
211
217
  self.parents[self.level + 1] = doc.add_title(
212
- text, content_layer=self.content_layer
218
+ text=text_clean, orig=text, content_layer=self.content_layer
213
219
  )
214
220
  # the other levels need to be lowered by 1 if a title was set
215
221
  else:
@@ -234,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
234
240
  self.level = level
235
241
  self.parents[self.level + 1] = doc.add_heading(
236
242
  parent=self.parents[self.level],
237
- text=text,
243
+ text=text_clean,
244
+ orig=text,
238
245
  level=self.level,
239
246
  content_layer=self.content_layer,
240
247
  )
@@ -296,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
296
303
  if text_part:
297
304
  parts.append(text_part)
298
305
  li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
306
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
299
307
 
300
308
  # 3) add the list item
301
309
  if li_text:
302
310
  self.parents[self.level + 1] = doc.add_list_item(
303
- text=li_text,
311
+ text=li_clean,
304
312
  enumerated=is_ordered,
305
313
  marker=marker,
314
+ orig=li_text,
306
315
  parent=list_group,
307
316
  content_layer=self.content_layer,
308
317
  )
@@ -344,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
344
353
  elif tag_name in {"p", "address", "summary"}:
345
354
  for part in tag.text.split("\n"):
346
355
  seg = part.strip()
356
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
347
357
  if seg:
348
358
  doc.add_text(
349
- parent=self.parents[self.level],
350
359
  label=DocItemLabel.TEXT,
351
- text=seg,
360
+ text=seg_clean,
361
+ orig=seg,
362
+ parent=self.parents[self.level],
352
363
  content_layer=self.content_layer,
353
364
  )
354
365
  for img_tag in tag("img"):
@@ -370,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
370
381
  elif tag_name in {"pre", "code"}:
371
382
  # handle monospace code snippets (pre).
372
383
  text = tag.get_text(strip=True)
384
+ text_clean = HTMLDocumentBackend._clean_unicode(text)
373
385
  if text:
374
386
  doc.add_code(
375
387
  parent=self.parents[self.level],
376
- text=text,
388
+ text=text_clean,
389
+ orig=text,
377
390
  content_layer=self.content_layer,
378
391
  )
379
392
 
@@ -402,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
402
415
 
403
416
  caption_item: Optional[TextItem] = None
404
417
  if caption:
418
+ caption_clean = HTMLDocumentBackend._clean_unicode(caption)
405
419
  caption_item = doc.add_text(
406
- DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
420
+ label=DocItemLabel.CAPTION,
421
+ text=caption_clean,
422
+ orig=caption,
423
+ content_layer=self.content_layer,
407
424
  )
408
425
 
409
426
  doc.add_picture(
@@ -442,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
442
459
 
443
460
  return "".join(parts)
444
461
 
462
+ @staticmethod
463
+ def _clean_unicode(text: str) -> str:
464
+ """Replace typical Unicode characters in HTML for text processing.
465
+
466
+ Several Unicode characters (e.g., non-printable or formatting) are typically
467
+ found in HTML but are worth replacing to sanitize text and ensure consistency
468
+ in text processing tasks.
469
+
470
+ Args:
471
+ text: The original text.
472
+
473
+ Returns:
474
+ The sanitized text without typical Unicode characters.
475
+ """
476
+ replacements = {
477
+ "\u00a0": " ", # non-breaking space
478
+ "\u200b": "", # zero-width space
479
+ "\u200c": "", # zero-width non-joiner
480
+ "\u200d": "", # zero-width joiner
481
+ "\u2010": "-", # hyphen
482
+ "\u2011": "-", # non-breaking hyphen
483
+ "\u2012": "-", # dash
484
+ "\u2013": "-", # dash
485
+ "\u2014": "-", # dash
486
+ "\u2015": "-", # horizontal bar
487
+ "\u2018": "'", # left single quotation mark
488
+ "\u2019": "'", # right single quotation mark
489
+ "\u201c": '"', # left double quotation mark
490
+ "\u201d": '"', # right double quotation mark
491
+ "\u2026": "...", # ellipsis
492
+ "\u00ad": "", # soft hyphen
493
+ "\ufeff": "", # zero width non-break space
494
+ "\u202f": " ", # narrow non-break space
495
+ "\u2060": "", # word joiner
496
+ }
497
+ for raw, clean in replacements.items():
498
+ text = text.replace(raw, clean)
499
+
500
+ return text
501
+
445
502
  @staticmethod
446
503
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
447
504
  """Extract colspan and rowspan values from a table cell tag.
@@ -454,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
454
511
  str(cell.get("colspan", "1")),
455
512
  str(cell.get("rowspan", "1")),
456
513
  )
514
+
515
+ def _extract_num(s: str) -> int:
516
+ if s and s[0].isnumeric():
517
+ match = re.search(r"\d+", s)
518
+ if match:
519
+ return int(match.group())
520
+ return 1
521
+
457
522
  int_spans: tuple[int, int] = (
458
- int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
459
- int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
523
+ _extract_num(raw_spans[0]),
524
+ _extract_num(raw_spans[1]),
460
525
  )
461
526
 
462
527
  return int_spans
docling/cli/main.py CHANGED
@@ -262,6 +262,12 @@ def export_documents(
262
262
 
263
263
  else:
264
264
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
265
+ if _log.isEnabledFor(logging.INFO):
266
+ for err in conv_res.errors:
267
+ _log.info(
268
+ f" [Failure Detail] Component: {err.component_type}, "
269
+ f"Module: {err.module_name}, Message: {err.error_message}"
270
+ )
265
271
  failure_count += 1
266
272
 
267
273
  _log.info(
@@ -5,7 +5,9 @@ import threading
5
5
  import time
6
6
  from collections.abc import Iterable, Iterator
7
7
  from concurrent.futures import ThreadPoolExecutor
8
+ from datetime import datetime
8
9
  from functools import partial
10
+ from io import BytesIO
9
11
  from pathlib import Path
10
12
  from typing import Dict, List, Optional, Tuple, Type, Union
11
13
 
@@ -275,6 +277,34 @@ class DocumentConverter:
275
277
  "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
276
278
  )
277
279
 
280
+ @validate_call(config=ConfigDict(strict=True))
281
+ def convert_string(
282
+ self,
283
+ content: str,
284
+ format: InputFormat,
285
+ name: Optional[str],
286
+ ) -> ConversionResult:
287
+ name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
288
+
289
+ if format == InputFormat.MD:
290
+ if not name.endswith(".md"):
291
+ name += ".md"
292
+
293
+ buff = BytesIO(content.encode("utf-8"))
294
+ doc_stream = DocumentStream(name=name, stream=buff)
295
+
296
+ return self.convert(doc_stream)
297
+ elif format == InputFormat.HTML:
298
+ if not name.endswith(".html"):
299
+ name += ".html"
300
+
301
+ buff = BytesIO(content.encode("utf-8"))
302
+ doc_stream = DocumentStream(name=name, stream=buff)
303
+
304
+ return self.convert(doc_stream)
305
+ else:
306
+ raise ValueError(f"format {format} is not supported in `convert_string`")
307
+
278
308
  def _convert(
279
309
  self, conv_input: _DocumentConversionInput, raises_on_error: bool
280
310
  ) -> Iterator[ConversionResult]:
@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
35
35
 
36
36
  if self.enabled:
37
37
  try:
38
- from mlx_vlm import generate, load # type: ignore
38
+ from mlx_vlm import generate, load, stream_generate # type: ignore
39
39
  from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
40
- from mlx_vlm.utils import load_config, stream_generate # type: ignore
40
+ from mlx_vlm.utils import load_config # type: ignore
41
41
  except ImportError:
42
42
  raise ImportError(
43
43
  "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.43.0
3
+ Version: 2.44.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
58
58
  Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
- Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
62
  Provides-Extra: rapidocr
63
63
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
64
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -1,5 +1,5 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/document_converter.py,sha256=pYlozCp6X1iGO75m3KSudMfrSCrXihTlRpKARFN67BI,14757
2
+ docling/document_converter.py,sha256=l4b9m9NcbnwzXNNvf777nszyXznQJiaTXyIl_WehkyQ,15724
3
3
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
4
4
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
5
5
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
9
9
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
12
- docling/backend/html_backend.py,sha256=Nuzyp6kyjd0g_MsBEPiWdFWU5w9UM60yWSluwU5C0M4,20310
12
+ docling/backend/html_backend.py,sha256=0_l-I9gBAs0HKU3yKLQ3OqyYgB3V48hInv42GudnSjA,22856
13
13
  docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
14
14
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
15
15
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
@@ -28,7 +28,7 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
28
28
  docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
29
29
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
30
30
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- docling/cli/main.py,sha256=D2gEoArnQ2yQ9BesH9CkxZbYQyhZRGgjjNWYqmRRUtU,29617
31
+ docling/cli/main.py,sha256=rXWR2QJFLeHLPWkMsLXvsVblX-KOXwbM8r0ku80KU5Q,29925
32
32
  docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
33
33
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
34
34
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,7 +71,7 @@ docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
71
71
  docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
72
72
  docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
73
  docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6DQMLtCQQW3-YUPDMbgeD2tjfM8vLM,8415
74
- docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
74
+ docling/models/vlm_models_inline/mlx_model.py,sha256=tqbJ8tmf2VBDuMLYIv9s1Ysn3G831k2uE_PdOv0kCaE,5948
75
75
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
76
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
77
77
  docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
@@ -92,9 +92,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
92
92
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
93
93
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
94
94
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
95
- docling-2.43.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
96
- docling-2.43.0.dist-info/METADATA,sha256=HS5J6rDKaZ_G_d4p10XgAwrNe-FjmHV-u5EmoTP4hro,10458
97
- docling-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
98
- docling-2.43.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
99
- docling-2.43.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
100
- docling-2.43.0.dist-info/RECORD,,
95
+ docling-2.44.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
96
+ docling-2.44.0.dist-info/METADATA,sha256=SjD3EXlvgfyXIo8YoeldcAFX0r_nbJszp7VPoMLPFBk,10459
97
+ docling-2.44.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
98
+ docling-2.44.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
99
+ docling-2.44.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
100
+ docling-2.44.0.dist-info/RECORD,,