docling 2.42.2__py3-none-any.whl → 2.44.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  import re
3
- import traceback
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Final, Optional, Union, cast
@@ -126,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
126
125
  # set the title as furniture, since it is part of the document metadata
127
126
  title = self.soup.title
128
127
  if title:
128
+ title_text = title.get_text(separator=" ", strip=True)
129
+ title_clean = HTMLDocumentBackend._clean_unicode(title_text)
129
130
  doc.add_title(
130
- text=title.get_text(separator=" ", strip=True),
131
+ text=title_clean,
132
+ orig=title_text,
131
133
  content_layer=ContentLayer.FURNITURE,
132
134
  )
133
135
  # remove scripts/styles
@@ -144,11 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
144
146
  )
145
147
  # reset context
146
148
  self.ctx = _Context()
147
-
148
- try:
149
- self._walk(content, doc)
150
- except Exception:
151
- print(traceback.format_exc())
149
+ self._walk(content, doc)
152
150
 
153
151
  return doc
154
152
 
@@ -173,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
173
171
  return
174
172
  for part in text.split("\n"):
175
173
  seg = part.strip()
174
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
176
175
  if seg:
177
176
  doc.add_text(
178
- DocItemLabel.TEXT,
179
- seg,
177
+ label=DocItemLabel.TEXT,
178
+ text=seg_clean,
179
+ orig=seg,
180
180
  parent=self.parents[self.level],
181
181
  content_layer=self.content_layer,
182
182
  )
@@ -208,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
208
  self.content_layer = ContentLayer.BODY
209
209
  level = int(tag_name[1])
210
210
  text = tag.get_text(strip=True, separator=" ")
211
+ text_clean = HTMLDocumentBackend._clean_unicode(text)
211
212
  # the first level is for the title item
212
213
  if level == 1:
213
214
  for key in self.parents.keys():
214
215
  self.parents[key] = None
215
216
  self.level = 0
216
217
  self.parents[self.level + 1] = doc.add_title(
217
- text, content_layer=self.content_layer
218
+ text=text_clean, orig=text, content_layer=self.content_layer
218
219
  )
219
220
  # the other levels need to be lowered by 1 if a title was set
220
221
  else:
@@ -239,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
239
240
  self.level = level
240
241
  self.parents[self.level + 1] = doc.add_heading(
241
242
  parent=self.parents[self.level],
242
- text=text,
243
+ text=text_clean,
244
+ orig=text,
243
245
  level=self.level,
244
246
  content_layer=self.content_layer,
245
247
  )
@@ -301,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
301
303
  if text_part:
302
304
  parts.append(text_part)
303
305
  li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
306
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
304
307
 
305
308
  # 3) add the list item
306
309
  if li_text:
307
310
  self.parents[self.level + 1] = doc.add_list_item(
308
- text=li_text,
311
+ text=li_clean,
309
312
  enumerated=is_ordered,
310
313
  marker=marker,
314
+ orig=li_text,
311
315
  parent=list_group,
312
316
  content_layer=self.content_layer,
313
317
  )
@@ -349,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
349
353
  elif tag_name in {"p", "address", "summary"}:
350
354
  for part in tag.text.split("\n"):
351
355
  seg = part.strip()
356
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
352
357
  if seg:
353
358
  doc.add_text(
354
- parent=self.parents[self.level],
355
359
  label=DocItemLabel.TEXT,
356
- text=seg,
360
+ text=seg_clean,
361
+ orig=seg,
362
+ parent=self.parents[self.level],
357
363
  content_layer=self.content_layer,
358
364
  )
359
365
  for img_tag in tag("img"):
@@ -375,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
375
381
  elif tag_name in {"pre", "code"}:
376
382
  # handle monospace code snippets (pre).
377
383
  text = tag.get_text(strip=True)
384
+ text_clean = HTMLDocumentBackend._clean_unicode(text)
378
385
  if text:
379
386
  doc.add_code(
380
387
  parent=self.parents[self.level],
381
- text=text,
388
+ text=text_clean,
389
+ orig=text,
382
390
  content_layer=self.content_layer,
383
391
  )
384
392
 
@@ -407,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
407
415
 
408
416
  caption_item: Optional[TextItem] = None
409
417
  if caption:
418
+ caption_clean = HTMLDocumentBackend._clean_unicode(caption)
410
419
  caption_item = doc.add_text(
411
- DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
420
+ label=DocItemLabel.CAPTION,
421
+ text=caption_clean,
422
+ orig=caption,
423
+ content_layer=self.content_layer,
412
424
  )
413
425
 
414
426
  doc.add_picture(
@@ -447,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
447
459
 
448
460
  return "".join(parts)
449
461
 
462
@staticmethod
def _clean_unicode(text: str) -> str:
    """Replace typical Unicode characters in HTML for text processing.

    Several Unicode characters (e.g., non-printable or formatting) are typically
    found in HTML but are worth replacing to sanitize text and ensure consistency
    in text processing tasks.

    Args:
        text: The original text.

    Returns:
        The sanitized text, with the characters listed below replaced by their
        plain counterparts (or removed).
    """
    # Every key is a single code point and no replacement value contains any
    # of the keys, so a single ``str.translate`` pass yields exactly the same
    # result as applying the substitutions one after another.
    translation = str.maketrans(
        {
            "\u00a0": " ",  # non-breaking space
            "\u200b": "",  # zero-width space
            "\u200c": "",  # zero-width non-joiner
            "\u200d": "",  # zero-width joiner
            "\u2010": "-",  # hyphen
            "\u2011": "-",  # non-breaking hyphen
            "\u2012": "-",  # figure dash
            "\u2013": "-",  # en dash
            "\u2014": "-",  # em dash
            "\u2015": "-",  # horizontal bar
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark
            "\u201c": '"',  # left double quotation mark
            "\u201d": '"',  # right double quotation mark
            "\u2026": "...",  # horizontal ellipsis
            "\u00ad": "",  # soft hyphen
            "\ufeff": "",  # zero-width no-break space (byte order mark)
            "\u202f": " ",  # narrow no-break space
            "\u2060": "",  # word joiner
        }
    )
    return text.translate(translation)
501
+
450
502
  @staticmethod
451
503
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
452
504
  """Extract colspan and rowspan values from a table cell tag.
@@ -459,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
459
511
  str(cell.get("colspan", "1")),
460
512
  str(cell.get("rowspan", "1")),
461
513
  )
514
+
515
def _extract_num(s: str) -> int:
    """Return the leading run of decimal digits in ``s`` as an int.

    Falls back to 1 when ``s`` is empty or does not start with a decimal
    digit.
    """
    # Anchor at the start of the string: only a value that *begins* with
    # digits counts. An unanchored search could pick up digits from later in
    # the string when the first character is numeric but not a decimal digit
    # (e.g. a superscript), which is never the intended span.
    match = re.match(r"\d+", s)
    return int(match.group()) if match else 1
521
+
462
522
  int_spans: tuple[int, int] = (
463
- int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
464
- int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
523
+ _extract_num(raw_spans[0]),
524
+ _extract_num(raw_spans[1]),
465
525
  )
466
526
 
467
527
  return int_spans
@@ -5,7 +5,7 @@ from copy import deepcopy
5
5
  from enum import Enum
6
6
  from io import BytesIO
7
7
  from pathlib import Path
8
- from typing import List, Literal, Optional, Set, Union
8
+ from typing import Literal, Optional, Union, cast
9
9
 
10
10
  import marko
11
11
  import marko.element
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
14
14
  DocItemLabel,
15
15
  DoclingDocument,
16
16
  DocumentOrigin,
17
+ ListItem,
17
18
  NodeItem,
18
19
  TableCell,
19
20
  TableData,
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
89
90
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
90
91
  super().__init__(in_doc, path_or_stream)
91
92
 
92
- _log.debug("MD INIT!!!")
93
+ _log.debug("Starting MarkdownDocumentBackend...")
93
94
 
94
95
  # Markdown file:
95
96
  self.path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
131
132
  for md_table_row in self.md_table_buffer:
132
133
  _log.debug(md_table_row)
133
134
  _log.debug("=== TABLE END ===")
134
- tcells: List[TableCell] = []
135
+ tcells: list[TableCell] = []
135
136
  result_table = []
136
137
  for n, md_table_row in enumerate(self.md_table_buffer):
137
138
  data = []
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
232
233
  element: marko.element.Element,
233
234
  depth: int,
234
235
  doc: DoclingDocument,
235
- visited: Set[marko.element.Element],
236
+ visited: set[marko.element.Element],
236
237
  creation_stack: list[
237
238
  _CreationPayload
238
239
  ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
239
240
  list_ordered_flag_by_ref: dict[str, bool],
241
+ list_last_item_by_ref: dict[str, ListItem],
240
242
  parent_item: Optional[NodeItem] = None,
241
243
  formatting: Optional[Formatting] = None,
242
244
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
279
281
 
280
282
  elif (
281
283
  isinstance(element, marko.block.ListItem)
282
- and len(element.children) == 1
284
+ and len(element.children) > 0
283
285
  and isinstance((child := element.children[0]), marko.block.Paragraph)
284
286
  and len(child.children) > 0
285
287
  ):
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
291
293
  if parent_item
292
294
  else False
293
295
  )
294
- if len(child.children) > 1: # inline group will be created further down
296
+ non_list_children: list[marko.element.Element] = [
297
+ item
298
+ for item in child.children
299
+ if not isinstance(item, marko.block.ListItem)
300
+ ]
301
+ if len(non_list_children) > 1: # inline group will be created further down
302
+ parent_ref: Optional[str] = (
303
+ parent_item.self_ref if parent_item else None
304
+ )
295
305
  parent_item = self._create_list_item(
296
306
  doc=doc,
297
307
  parent_item=parent_item,
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
300
310
  formatting=formatting,
301
311
  hyperlink=hyperlink,
302
312
  )
313
+ if parent_ref:
314
+ list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
303
315
  else:
304
316
  creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
305
317
 
@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
334
346
  element.dest
335
347
  )
336
348
 
337
- elif isinstance(element, marko.inline.RawText):
338
- _log.debug(f" - Paragraph (raw text): {element.children}")
339
- snippet_text = element.children.strip()
349
+ elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
350
+ _log.debug(f" - RawText/Literal: {element.children}")
351
+ snippet_text = (
352
+ element.children.strip() if isinstance(element.children, str) else ""
353
+ )
340
354
  # Detect start of the table:
341
355
  if "|" in snippet_text or self.in_table:
342
356
  # most likely part of the markdown table
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
359
373
  if parent_item
360
374
  else False
361
375
  )
376
+ parent_ref = parent_item.self_ref if parent_item else None
362
377
  parent_item = self._create_list_item(
363
378
  doc=doc,
364
379
  parent_item=parent_item,
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
367
382
  formatting=formatting,
368
383
  hyperlink=hyperlink,
369
384
  )
385
+ if parent_ref:
386
+ list_last_item_by_ref[parent_ref] = cast(
387
+ ListItem, parent_item
388
+ )
389
+
370
390
  elif isinstance(to_create, _HeadingCreationPayload):
371
391
  # not keeping as parent_item as logic for correctly tracking
372
392
  # that not implemented yet (section components not captured
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
458
478
  element, processed_block_types
459
479
  ):
460
480
  for child in element.children:
481
+ if (
482
+ isinstance(element, marko.block.ListItem)
483
+ and isinstance(child, marko.block.List)
484
+ and parent_item
485
+ and list_last_item_by_ref.get(parent_item.self_ref, None)
486
+ ):
487
+ _log.debug(
488
+ f"walking into new List hanging from item of parent list {parent_item.self_ref}"
489
+ )
490
+ parent_item = list_last_item_by_ref[parent_item.self_ref]
491
+
461
492
  self._iterate_elements(
462
493
  element=child,
463
494
  depth=depth + 1,
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
465
496
  visited=visited,
466
497
  creation_stack=creation_stack,
467
498
  list_ordered_flag_by_ref=list_ordered_flag_by_ref,
499
+ list_last_item_by_ref=list_last_item_by_ref,
468
500
  parent_item=parent_item,
469
501
  formatting=formatting,
470
502
  hyperlink=hyperlink,
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
483
515
  return False
484
516
 
485
517
  @classmethod
486
- def supported_formats(cls) -> Set[InputFormat]:
518
+ def supported_formats(cls) -> set[InputFormat]:
487
519
  return {InputFormat.MD}
488
520
 
489
521
  def convert(self) -> DoclingDocument:
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
510
542
  visited=set(),
511
543
  creation_stack=[],
512
544
  list_ordered_flag_by_ref={},
545
+ list_last_item_by_ref={},
513
546
  )
514
547
  self._close_table(doc=doc) # handle any last hanging table
515
548
 
@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
534
567
  ]:
535
568
  html_str = _restore_original_html(txt=html_str, regex=regex)
536
569
  self._html_blocks = 0
537
-
538
570
  # delegate to HTML backend
539
571
  stream = BytesIO(bytes(html_str, encoding="utf-8"))
540
572
  in_doc = InputDocument(
docling/cli/main.py CHANGED
@@ -262,6 +262,12 @@ def export_documents(
262
262
 
263
263
  else:
264
264
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
265
+ if _log.isEnabledFor(logging.INFO):
266
+ for err in conv_res.errors:
267
+ _log.info(
268
+ f" [Failure Detail] Component: {err.component_type}, "
269
+ f"Module: {err.module_name}, Message: {err.error_message}"
270
+ )
265
271
  failure_count += 1
266
272
 
267
273
  _log.info(
@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
332
332
  STANDARD = "standard"
333
333
  VLM = "vlm"
334
334
  ASR = "asr"
335
+
336
+
337
class ThreadedPdfPipelineOptions(PdfPipelineOptions):
    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""

    # --- Per-stage batch sizes -------------------------------------------
    # One knob per stage (OCR, layout, table); each defaults to 4.
    # NOTE(review): presumably counted in pages per batch - confirm against
    # the threaded pipeline implementation.
    ocr_batch_size: int = 4
    layout_batch_size: int = 4
    table_batch_size: int = 4

    # --- Timing -----------------------------------------------------------
    # Batch timeout, expressed in seconds.
    # NOTE(review): presumably the longest a stage waits to fill a batch
    # before processing a partial one - confirm against the consumer.
    batch_timeout_seconds: float = 2.0

    # --- Backpressure -----------------------------------------------------
    # Upper bound on queue size.
    # NOTE(review): presumably applies to each inter-stage queue - confirm.
    queue_max_size: int = 100
@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):
26
26
 
27
27
 
28
28
  class BatchConcurrencySettings(BaseModel):
29
- doc_batch_size: int = 2
30
- doc_batch_concurrency: int = 2
31
- page_batch_size: int = 4
32
- page_batch_concurrency: int = 2
33
- elements_batch_size: int = 16
34
-
35
- # doc_batch_size: int = 1
36
- # doc_batch_concurrency: int = 1
37
- # page_batch_size: int = 1
38
- # page_batch_concurrency: int = 1
39
-
40
- # model_concurrency: int = 2
29
+ doc_batch_size: int = 1 # Number of documents processed in one batch. Should be >= doc_batch_concurrency
30
+ doc_batch_concurrency: int = 1 # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
31
+ page_batch_size: int = 4 # Number of pages processed in one batch.
32
+ page_batch_concurrency: int = 1 # Currently unused.
33
+ elements_batch_size: int = (
34
+ 16 # Number of elements processed in one batch, in enrichment models.
35
+ )
41
36
 
42
37
  # To force models into single core: export OMP_NUM_THREADS=1
43
38
 
@@ -4,7 +4,10 @@ import sys
4
4
  import threading
5
5
  import time
6
6
  from collections.abc import Iterable, Iterator
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from datetime import datetime
7
9
  from functools import partial
10
+ from io import BytesIO
8
11
  from pathlib import Path
9
12
  from typing import Dict, List, Optional, Tuple, Type, Union
10
13
 
@@ -274,6 +277,34 @@ class DocumentConverter:
274
277
  "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
275
278
  )
276
279
 
280
@validate_call(config=ConfigDict(strict=True))
def convert_string(
    self,
    content: str,
    format: InputFormat,
    name: Optional[str] = None,
) -> ConversionResult:
    """Convert an in-memory Markdown or HTML string.

    The string is UTF-8 encoded, wrapped in a ``DocumentStream`` and passed
    to ``convert``.

    Args:
        content: The document source text.
        format: The format of ``content``; only ``InputFormat.MD`` and
            ``InputFormat.HTML`` are accepted.
        name: Name given to the stream. When empty or ``None``, a timestamp
            of the form ``YYYY-MM-DD_HH-MM-SS`` is used. The extension
            matching ``format`` is appended if ``name`` does not already
            end with it.

    Returns:
        The result of ``convert`` for the generated stream.

    Raises:
        ValueError: If ``format`` is neither Markdown nor HTML.
    """
    # Single lookup instead of one copy-pasted branch per supported format.
    suffix_by_format = {
        InputFormat.MD: ".md",
        InputFormat.HTML: ".html",
    }
    suffix = suffix_by_format.get(format)
    if suffix is None:
        raise ValueError(f"format {format} is not supported in `convert_string`")

    name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if not name.endswith(suffix):
        name += suffix

    doc_stream = DocumentStream(name=name, stream=BytesIO(content.encode("utf-8")))
    return self.convert(doc_stream)
307
+
277
308
  def _convert(
278
309
  self, conv_input: _DocumentConversionInput, raises_on_error: bool
279
310
  ) -> Iterator[ConversionResult]:
@@ -284,24 +315,33 @@ class DocumentConverter:
284
315
  settings.perf.doc_batch_size, # pass format_options
285
316
  ):
286
317
  _log.info("Going to convert document batch...")
318
+ process_func = partial(
319
+ self._process_document, raises_on_error=raises_on_error
320
+ )
287
321
 
288
- # parallel processing only within input_batch
289
- # with ThreadPoolExecutor(
290
- # max_workers=settings.perf.doc_batch_concurrency
291
- # ) as pool:
292
- # yield from pool.map(self.process_document, input_batch)
293
- # Note: PDF backends are not thread-safe, thread pool usage was disabled.
294
-
295
- for item in map(
296
- partial(self._process_document, raises_on_error=raises_on_error),
297
- input_batch,
322
+ if (
323
+ settings.perf.doc_batch_concurrency > 1
324
+ and settings.perf.doc_batch_size > 1
298
325
  ):
299
- elapsed = time.monotonic() - start_time
300
- start_time = time.monotonic()
301
- _log.info(
302
- f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
303
- )
304
- yield item
326
+ with ThreadPoolExecutor(
327
+ max_workers=settings.perf.doc_batch_concurrency
328
+ ) as pool:
329
+ for item in pool.map(
330
+ process_func,
331
+ input_batch,
332
+ ):
333
+ yield item
334
+ else:
335
+ for item in map(
336
+ process_func,
337
+ input_batch,
338
+ ):
339
+ elapsed = time.monotonic() - start_time
340
+ start_time = time.monotonic()
341
+ _log.info(
342
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
343
+ )
344
+ yield item
305
345
 
306
346
  def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
307
347
  """Retrieve or initialize a pipeline, reusing instances based on class and options."""
@@ -330,7 +370,7 @@ class DocumentConverter:
330
370
  f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
331
371
  )
332
372
 
333
- return self.initialized_pipelines[cache_key]
373
+ return self.initialized_pipelines[cache_key]
334
374
 
335
375
  def _process_document(
336
376
  self, in_doc: InputDocument, raises_on_error: bool