docling 2.42.1__py3-none-any.whl → 2.43.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

docling/backend/html_backend.py
@@ -1,11 +1,10 @@
  import logging
  import re
- import traceback
  from io import BytesIO
  from pathlib import Path
  from typing import Final, Optional, Union, cast

- from bs4 import BeautifulSoup, NavigableString, Tag
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
  from bs4.element import PreformattedString
  from docling_core.types.doc import (
      DocItem,
@@ -144,11 +143,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
              )
              # reset context
              self.ctx = _Context()
-
-             try:
-                 self._walk(content, doc)
-             except Exception:
-                 print(traceback.format_exc())
+             self._walk(content, doc)

          return doc

@@ -297,7 +292,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
              ):
                  parts.append(child)
              elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
-                 text_part = child.get_text()
+                 text_part = HTMLDocumentBackend.get_text(child)
                  if text_part:
                      parts.append(text_part)
          li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
@@ -417,6 +412,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                  content_layer=self.content_layer,
              )

+     @staticmethod
+     def get_text(item: PageElement) -> str:
+         """Concatenate all child strings of a PageElement.
+
+         This method is equivalent to `PageElement.get_text()` but also considers
+         certain tags. When called on <p> or <li> tags, it returns the text with a
+         trailing space, otherwise the text is concatenated without separators.
+         """
+
+         def _extract_text_recursively(item: PageElement) -> list[str]:
+             """Recursively extract text from all child nodes."""
+             result: list[str] = []
+
+             if isinstance(item, NavigableString):
+                 result = [item]
+             elif isinstance(item, Tag):
+                 tag = cast(Tag, item)
+                 parts: list[str] = []
+                 for child in tag:
+                     parts.extend(_extract_text_recursively(child))
+                 result.append(
+                     "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
+                 )
+
+             return result
+
+         parts: list[str] = _extract_text_recursively(item)
+
+         return "".join(parts)
+
      @staticmethod
      def _get_cell_spans(cell: Tag) -> tuple[int, int]:
          """Extract colspan and rowspan values from a table cell tag.
@@ -510,9 +535,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                  formula.replace_with(NavigableString(math_formula))

              # TODO: extract content correctly from table-cells with lists
-             text = html_cell.text
-
-             # label = html_cell.name
+             text = HTMLDocumentBackend.get_text(html_cell).strip()
              col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
              if row_header:
                  row_span -= 1

docling/backend/md_backend.py
@@ -5,7 +5,7 @@ from copy import deepcopy
  from enum import Enum
  from io import BytesIO
  from pathlib import Path
- from typing import List, Literal, Optional, Set, Union
+ from typing import Literal, Optional, Union, cast

  import marko
  import marko.element
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
      DocItemLabel,
      DoclingDocument,
      DocumentOrigin,
+     ListItem,
      NodeItem,
      TableCell,
      TableData,
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
      def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
          super().__init__(in_doc, path_or_stream)

-         _log.debug("MD INIT!!!")
+         _log.debug("Starting MarkdownDocumentBackend...")

          # Markdown file:
          self.path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              for md_table_row in self.md_table_buffer:
                  _log.debug(md_table_row)
              _log.debug("=== TABLE END ===")
-             tcells: List[TableCell] = []
+             tcells: list[TableCell] = []
              result_table = []
              for n, md_table_row in enumerate(self.md_table_buffer):
                  data = []
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
          element: marko.element.Element,
          depth: int,
          doc: DoclingDocument,
-         visited: Set[marko.element.Element],
+         visited: set[marko.element.Element],
          creation_stack: list[
              _CreationPayload
          ],  # stack for lazy item creation triggered deep in marko's AST (on RawText)
          list_ordered_flag_by_ref: dict[str, bool],
+         list_last_item_by_ref: dict[str, ListItem],
          parent_item: Optional[NodeItem] = None,
          formatting: Optional[Formatting] = None,
          hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

          elif (
              isinstance(element, marko.block.ListItem)
-             and len(element.children) == 1
+             and len(element.children) > 0
              and isinstance((child := element.children[0]), marko.block.Paragraph)
              and len(child.children) > 0
          ):
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                  if parent_item
                  else False
              )
-             if len(child.children) > 1:  # inline group will be created further down
+             non_list_children: list[marko.element.Element] = [
+                 item
+                 for item in child.children
+                 if not isinstance(item, marko.block.ListItem)
+             ]
+             if len(non_list_children) > 1:  # inline group will be created further down
+                 parent_ref: Optional[str] = (
+                     parent_item.self_ref if parent_item else None
+                 )
              parent_item = self._create_list_item(
                  doc=doc,
                  parent_item=parent_item,
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                      formatting=formatting,
                      hyperlink=hyperlink,
                  )
+                 if parent_ref:
+                     list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
              else:
                  creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))

@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                  element.dest
              )

-         elif isinstance(element, marko.inline.RawText):
-             _log.debug(f" - Paragraph (raw text): {element.children}")
-             snippet_text = element.children.strip()
+         elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
+             _log.debug(f" - RawText/Literal: {element.children}")
+             snippet_text = (
+                 element.children.strip() if isinstance(element.children, str) else ""
+             )
              # Detect start of the table:
              if "|" in snippet_text or self.in_table:
                  # most likely part of the markdown table
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                              if parent_item
                              else False
                          )
+                         parent_ref = parent_item.self_ref if parent_item else None
                          parent_item = self._create_list_item(
                              doc=doc,
                              parent_item=parent_item,
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                              formatting=formatting,
                              hyperlink=hyperlink,
                          )
+                         if parent_ref:
+                             list_last_item_by_ref[parent_ref] = cast(
+                                 ListItem, parent_item
+                             )
+
                      elif isinstance(to_create, _HeadingCreationPayload):
                          # not keeping as parent_item as logic for correctly tracking
                          # that not implemented yet (section components not captured
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              element, processed_block_types
          ):
              for child in element.children:
+                 if (
+                     isinstance(element, marko.block.ListItem)
+                     and isinstance(child, marko.block.List)
+                     and parent_item
+                     and list_last_item_by_ref.get(parent_item.self_ref, None)
+                 ):
+                     _log.debug(
+                         f"walking into new List hanging from item of parent list {parent_item.self_ref}"
+                     )
+                     parent_item = list_last_item_by_ref[parent_item.self_ref]
+
                  self._iterate_elements(
                      element=child,
                      depth=depth + 1,
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                      visited=visited,
                      creation_stack=creation_stack,
                      list_ordered_flag_by_ref=list_ordered_flag_by_ref,
+                     list_last_item_by_ref=list_last_item_by_ref,
                      parent_item=parent_item,
                      formatting=formatting,
                      hyperlink=hyperlink,
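Together with the `list_last_item_by_ref` bookkeeping above, a nested sublist is now attached to the last created list item rather than to the enclosing list. A quick way to exercise the new path (a sketch; the file name is invented):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

# an invented sample file with a nested Markdown list
Path("nested.md").write_text("- parent\n    - child one\n    - child two\n")

result = DocumentConverter().convert("nested.md")
print(result.document.export_to_markdown())
```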
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
          return False

      @classmethod
-     def supported_formats(cls) -> Set[InputFormat]:
+     def supported_formats(cls) -> set[InputFormat]:
          return {InputFormat.MD}

      def convert(self) -> DoclingDocument:
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              visited=set(),
              creation_stack=[],
              list_ordered_flag_by_ref={},
+             list_last_item_by_ref={},
          )
          self._close_table(doc=doc)  # handle any last hanging table

@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              ]:
                  html_str = _restore_original_html(txt=html_str, regex=regex)
              self._html_blocks = 0
-
              # delegate to HTML backend
              stream = BytesIO(bytes(html_str, encoding="utf-8"))
              in_doc = InputDocument(

docling/backend/msword_backend.py
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  )
                  _log.debug(f" spanned before row {spanned_idx}")

+                 # Detect equations in cell text
+                 text, equations = self._handle_equations_in_text(
+                     element=cell._element, text=cell.text
+                 )
+                 if len(equations) == 0:
+                     text = cell.text
+                 else:
+                     text = text.replace("<eq>", "$").replace("</eq>", "$")
+
                  table_cell = TableCell(
-                     text=cell.text,
+                     text=text,
                      row_span=spanned_idx - row_idx,
                      col_span=cell.grid_span,
                      start_row_offset_idx=row.grid_cols_before + row_idx,

docling/backend/pdf_backend.py
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
          if self.input_format is InputFormat.IMAGE:
              buf = BytesIO()
              img = Image.open(self.path_or_stream)
-             img.save(buf, "PDF")
+
+             # Handle multi-page TIFF images
+             if hasattr(img, "n_frames") and img.n_frames > 1:
+                 # Extract all frames from multi-page image
+                 frames = []
+                 try:
+                     for i in range(img.n_frames):
+                         img.seek(i)
+                         frame = img.copy().convert("RGB")
+                         frames.append(frame)
+                 except EOFError:
+                     pass
+
+                 # Save as multi-page PDF
+                 if frames:
+                     frames[0].save(
+                         buf, "PDF", save_all=True, append_images=frames[1:]
+                     )
+                 else:
+                     # Fallback to single page if frame extraction fails
+                     img.convert("RGB").save(buf, "PDF")
+             else:
+                 # Single page image - convert to RGB and save
+                 img.convert("RGB").save(buf, "PDF")
+
              buf.seek(0)
              self.path_or_stream = buf
          else:
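The Pillow idiom used here, `save_all=True` plus `append_images`, is what turns a list of frames into a multi-page PDF; a self-contained sketch with synthetic frames:

```python
from io import BytesIO

from PIL import Image

# two synthetic frames standing in for the pages of a multi-page TIFF
frames = [Image.new("RGB", (100, 100), color) for color in ("white", "gray")]

buf = BytesIO()
frames[0].save(buf, "PDF", save_all=True, append_images=frames[1:])
print(buf.getbuffer().nbytes)  # the buffer now holds a two-page PDF
```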

docling/datamodel/pipeline_options.py
@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
      STANDARD = "standard"
      VLM = "vlm"
      ASR = "asr"
+
+
+ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+     """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+
+     # Batch sizes for different stages
+     ocr_batch_size: int = 4
+     layout_batch_size: int = 4
+     table_batch_size: int = 4
+
+     # Timing control
+     batch_timeout_seconds: float = 2.0
+
+     # Backpressure and queue control
+     queue_max_size: int = 100

docling/datamodel/settings.py
@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):


  class BatchConcurrencySettings(BaseModel):
-     doc_batch_size: int = 2
-     doc_batch_concurrency: int = 2
-     page_batch_size: int = 4
-     page_batch_concurrency: int = 2
-     elements_batch_size: int = 16
-
-     # doc_batch_size: int = 1
-     # doc_batch_concurrency: int = 1
-     # page_batch_size: int = 1
-     # page_batch_concurrency: int = 1
-
-     # model_concurrency: int = 2
+     doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+     doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+     page_batch_size: int = 4  # Number of pages processed in one batch.
+     page_batch_concurrency: int = 1  # Currently unused.
+     elements_batch_size: int = (
+         16  # Number of elements processed in one batch, in enrichment models.
+     )

  # To force models into single core: export OMP_NUM_THREADS=1


docling/document_converter.py
@@ -4,6 +4,7 @@ import sys
  import threading
  import time
  from collections.abc import Iterable, Iterator
+ from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from pathlib import Path
  from typing import Dict, List, Optional, Tuple, Type, Union
@@ -284,24 +285,33 @@ class DocumentConverter:
              settings.perf.doc_batch_size,  # pass format_options
          ):
              _log.info("Going to convert document batch...")
+             process_func = partial(
+                 self._process_document, raises_on_error=raises_on_error
+             )

-             # parallel processing only within input_batch
-             # with ThreadPoolExecutor(
-             #     max_workers=settings.perf.doc_batch_concurrency
-             # ) as pool:
-             #     yield from pool.map(self.process_document, input_batch)
-             # Note: PDF backends are not thread-safe, thread pool usage was disabled.
-
-             for item in map(
-                 partial(self._process_document, raises_on_error=raises_on_error),
-                 input_batch,
+             if (
+                 settings.perf.doc_batch_concurrency > 1
+                 and settings.perf.doc_batch_size > 1
              ):
-                 elapsed = time.monotonic() - start_time
-                 start_time = time.monotonic()
-                 _log.info(
-                     f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                 )
-                 yield item
+                 with ThreadPoolExecutor(
+                     max_workers=settings.perf.doc_batch_concurrency
+                 ) as pool:
+                     for item in pool.map(
+                         process_func,
+                         input_batch,
+                     ):
+                         yield item
+             else:
+                 for item in map(
+                     process_func,
+                     input_batch,
+                 ):
+                     elapsed = time.monotonic() - start_time
+                     start_time = time.monotonic()
+                     _log.info(
+                         f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                     )
+                     yield item

      def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
          """Retrieve or initialize a pipeline, reusing instances based on class and options."""
@@ -330,7 +340,7 @@ class DocumentConverter:
                      f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                  )

-                 return self.initialized_pipelines[cache_key]
+             return self.initialized_pipelines[cache_key]

      def _process_document(
          self, in_doc: InputDocument, raises_on_error: bool

docling/models/layout_model.py
@@ -3,7 +3,7 @@ import logging
  import warnings
  from collections.abc import Iterable
  from pathlib import Path
- from typing import Optional
+ from typing import List, Optional, Union

  import numpy as np
  from docling_core.types.doc import DocItemLabel
@@ -148,72 +148,90 @@ class LayoutModel(BasePageModel):
      def __call__(
          self, conv_res: ConversionResult, page_batch: Iterable[Page]
      ) -> Iterable[Page]:
-         for page in page_batch:
+         # Convert to list to allow multiple iterations
+         pages = list(page_batch)
+
+         # Separate valid and invalid pages
+         valid_pages = []
+         valid_page_images: List[Union[Image.Image, np.ndarray]] = []
+
+         for page in pages:
              assert page._backend is not None
              if not page._backend.is_valid():
-                 yield page
-             else:
-                 with TimeRecorder(conv_res, "layout"):
-                     assert page.size is not None
-                     page_image = page.get_image(scale=1.0)
-                     assert page_image is not None
-
-                     clusters = []
-                     for ix, pred_item in enumerate(
-                         self.layout_predictor.predict(page_image)
-                     ):
-                         label = DocItemLabel(
-                             pred_item["label"]
-                             .lower()
-                             .replace(" ", "_")
-                             .replace("-", "_")
-                         )  # Temporary, until docling-ibm-model uses docling-core types
-                         cluster = Cluster(
-                             id=ix,
-                             label=label,
-                             confidence=pred_item["confidence"],
-                             bbox=BoundingBox.model_validate(pred_item),
-                             cells=[],
-                         )
-                         clusters.append(cluster)
-
-                     if settings.debug.visualize_raw_layout:
-                         self.draw_clusters_and_cells_side_by_side(
-                             conv_res, page, clusters, mode_prefix="raw"
-                         )
-
-                     # Apply postprocessing
-
-                     processed_clusters, processed_cells = LayoutPostprocessor(
-                         page, clusters, self.options
-                     ).postprocess()
-                     # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
-
-                     with warnings.catch_warnings():
-                         warnings.filterwarnings(
-                             "ignore",
-                             "Mean of empty slice|invalid value encountered in scalar divide",
-                             RuntimeWarning,
-                             "numpy",
-                         )
-
-                         conv_res.confidence.pages[page.page_no].layout_score = float(
-                             np.mean([c.confidence for c in processed_clusters])
-                         )
-
-                         conv_res.confidence.pages[page.page_no].ocr_score = float(
-                             np.mean(
-                                 [c.confidence for c in processed_cells if c.from_ocr]
-                             )
-                         )
-
-                     page.predictions.layout = LayoutPrediction(
-                         clusters=processed_clusters
-                     )
-
-                     if settings.debug.visualize_layout:
-                         self.draw_clusters_and_cells_side_by_side(
-                             conv_res, page, processed_clusters, mode_prefix="postprocessed"
-                         )
+                 continue

+             assert page.size is not None
+             page_image = page.get_image(scale=1.0)
+             assert page_image is not None
+
+             valid_pages.append(page)
+             valid_page_images.append(page_image)
+
+         # Process all valid pages with batch prediction
+         batch_predictions = []
+         if valid_page_images:
+             with TimeRecorder(conv_res, "layout"):
+                 batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
+                     valid_page_images
+                 )
+
+         # Process each page with its predictions
+         valid_page_idx = 0
+         for page in pages:
+             assert page._backend is not None
+             if not page._backend.is_valid():
                  yield page
+                 continue
+
+             page_predictions = batch_predictions[valid_page_idx]
+             valid_page_idx += 1
+
+             clusters = []
+             for ix, pred_item in enumerate(page_predictions):
+                 label = DocItemLabel(
+                     pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                 )  # Temporary, until docling-ibm-model uses docling-core types
+                 cluster = Cluster(
+                     id=ix,
+                     label=label,
+                     confidence=pred_item["confidence"],
+                     bbox=BoundingBox.model_validate(pred_item),
+                     cells=[],
+                 )
+                 clusters.append(cluster)
+
+             if settings.debug.visualize_raw_layout:
+                 self.draw_clusters_and_cells_side_by_side(
+                     conv_res, page, clusters, mode_prefix="raw"
+                 )
+
+             # Apply postprocessing
+             processed_clusters, processed_cells = LayoutPostprocessor(
+                 page, clusters, self.options
+             ).postprocess()
+             # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore",
+                     "Mean of empty slice|invalid value encountered in scalar divide",
+                     RuntimeWarning,
+                     "numpy",
+                 )
+
+                 conv_res.confidence.pages[page.page_no].layout_score = float(
+                     np.mean([c.confidence for c in processed_clusters])
+                 )
+
+                 conv_res.confidence.pages[page.page_no].ocr_score = float(
+                     np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                 )
+
+             page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
+
+             if settings.debug.visualize_layout:
+                 self.draw_clusters_and_cells_side_by_side(
+                     conv_res, page, processed_clusters, mode_prefix="postprocessed"
+                 )
+
+             yield page
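The shape of this refactor (batch-predict over the valid items, then re-walk the original sequence so ordering and invalid entries are preserved) distils into a few generic lines; a sketch with stand-in types, independent of docling:

```python
from typing import Callable, Iterable, Iterator, List, Optional, Tuple, TypeVar

T = TypeVar("T")
P = TypeVar("P")


def batch_preserving_order(
    items: Iterable[T],
    is_valid: Callable[[T], bool],
    predict_batch: Callable[[List[T]], List[P]],
) -> Iterator[Tuple[T, Optional[P]]]:
    """Run one batched prediction over valid items, yield every item in order."""
    materialized = list(items)  # allow two passes over the input
    valid = [x for x in materialized if is_valid(x)]
    preds = iter(predict_batch(valid) if valid else [])
    for x in materialized:
        yield (x, next(preds)) if is_valid(x) else (x, None)
```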

docling/pipeline/base_pipeline.py
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
          return conv_res

      def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
-         status = ConversionStatus.SUCCESS
+         status = conv_res.status
+         if status in [
+             ConversionStatus.PENDING,
+             ConversionStatus.STARTED,
+         ]:  # preserves ConversionStatus.PARTIAL_SUCCESS
+             status = ConversionStatus.SUCCESS
+
          for page in conv_res.pages:
              if page._backend is None or not page._backend.is_valid():
                  conv_res.errors.append(

docling/pipeline/threaded_standard_pdf_pipeline.py (new file)
@@ -0,0 +1,605 @@
+ # threaded_standard_pdf_pipeline.py
+ """Thread-safe, production-ready PDF pipeline
+ ================================================
+ A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+ * **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+   threads so that concurrent invocations never share mutable state.
+ * **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+   relying on :pyfunc:`id`, which may clash after garbage collection.
+ * **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+   propagates downstream so stages terminate deterministically without sentinels.
+ * **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+   and only read by worker threads; no runtime mutability is exposed.
+ * **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+ """
+
+ from __future__ import annotations
+
+ import itertools
+ import logging
+ import threading
+ import time
+ from collections import defaultdict, deque
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Iterable, List, Optional, Sequence, Tuple
+
+ from docling.backend.abstract_backend import AbstractDocumentBackend
+ from docling.backend.pdf_backend import PdfDocumentBackend
+ from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
+ from docling.datamodel.document import ConversionResult
+ from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+ from docling.datamodel.settings import settings
+ from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+ from docling.models.document_picture_classifier import (
+     DocumentPictureClassifier,
+     DocumentPictureClassifierOptions,
+ )
+ from docling.models.factories import get_ocr_factory, get_picture_description_factory
+ from docling.models.layout_model import LayoutModel
+ from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+ from docling.models.page_preprocessing_model import (
+     PagePreprocessingModel,
+     PagePreprocessingOptions,
+ )
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+ from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
+ from docling.models.table_structure_model import TableStructureModel
+ from docling.pipeline.base_pipeline import BasePipeline
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
+ from docling.utils.utils import chunkify
+
+ _log = logging.getLogger(__name__)
+
+ # ──────────────────────────────────────────────────────────────────────────────
+ # Helper data structures
+ # ──────────────────────────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class ThreadedItem:
+     """Envelope that travels between pipeline stages."""
+
+     payload: Optional[Page]
+     run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+     page_no: int
+     conv_res: ConversionResult
+     error: Optional[Exception] = None
+     is_failed: bool = False
+
+
+ @dataclass
+ class ProcessingResult:
+     """Aggregated outcome of a pipeline run."""
+
+     pages: List[Page] = field(default_factory=list)
+     failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+     total_expected: int = 0
+
+     @property
+     def success_count(self) -> int:
+         return len(self.pages)
+
+     @property
+     def failure_count(self) -> int:
+         return len(self.failed_pages)
+
+     @property
+     def is_partial_success(self) -> bool:
+         return 0 < self.success_count < self.total_expected
+
+     @property
+     def is_complete_failure(self) -> bool:
+         return self.success_count == 0 and self.failure_count > 0
+
+
+ class ThreadedQueue:
+     """Bounded queue with blocking put/get_batch and explicit *close()* semantics."""
+
+     __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+     def __init__(self, max_size: int) -> None:
+         self._max: int = max_size
+         self._items: deque[ThreadedItem] = deque()
+         self._lock = threading.Lock()
+         self._not_full = threading.Condition(self._lock)
+         self._not_empty = threading.Condition(self._lock)
+         self._closed = False
+
+     # ---------------------------------------------------------------- put()
+     def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
+         """Block until queue accepts *item* or is closed. Returns *False* if closed."""
+         with self._not_full:
+             if self._closed:
+                 return False
+             start = time.monotonic()
+             while len(self._items) >= self._max and not self._closed:
+                 if timeout is not None:
+                     remaining = timeout - (time.monotonic() - start)
+                     if remaining <= 0:
+                         return False
+                     self._not_full.wait(remaining)
+                 else:
+                     self._not_full.wait()
+             if self._closed:
+                 return False
+             self._items.append(item)
+             self._not_empty.notify()
+             return True
+
+     # ------------------------------------------------------------ get_batch()
+     def get_batch(
+         self, size: int, timeout: Optional[float] | None = None
+     ) -> List[ThreadedItem]:
+         """Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
+         with self._not_empty:
+             start = time.monotonic()
+             while not self._items and not self._closed:
+                 if timeout is not None:
+                     remaining = timeout - (time.monotonic() - start)
+                     if remaining <= 0:
+                         return []
+                     self._not_empty.wait(remaining)
+                 else:
+                     self._not_empty.wait()
+             batch: List[ThreadedItem] = []
+             while self._items and len(batch) < size:
+                 batch.append(self._items.popleft())
+             if batch:
+                 self._not_full.notify_all()
+             return batch
+
+     # ---------------------------------------------------------------- close()
+     def close(self) -> None:
+         with self._lock:
+             self._closed = True
+             self._not_empty.notify_all()
+             self._not_full.notify_all()
+
+     # -------------------------------------------------------------- property
+     @property
+     def closed(self) -> bool:
+         return self._closed
+
+
+ class ThreadedPipelineStage:
+     """A single pipeline stage backed by one worker thread."""
+
+     def __init__(
+         self,
+         *,
+         name: str,
+         model: Any,
+         batch_size: int,
+         batch_timeout: float,
+         queue_max_size: int,
+     ) -> None:
+         self.name = name
+         self.model = model
+         self.batch_size = batch_size
+         self.batch_timeout = batch_timeout
+         self.input_queue = ThreadedQueue(queue_max_size)
+         self._outputs: list[ThreadedQueue] = []
+         self._thread: Optional[threading.Thread] = None
+         self._running = False
+
+     # ---------------------------------------------------------------- wiring
+     def add_output_queue(self, q: ThreadedQueue) -> None:
+         self._outputs.append(q)
+
+     # -------------------------------------------------------------- lifecycle
+     def start(self) -> None:
+         if self._running:
+             return
+         self._running = True
+         self._thread = threading.Thread(
+             target=self._run, name=f"Stage-{self.name}", daemon=False
+         )
+         self._thread.start()
+
+     def stop(self) -> None:
+         if not self._running:
+             return
+         self._running = False
+         self.input_queue.close()
+         if self._thread is not None:
+             self._thread.join(timeout=30.0)
+             if self._thread.is_alive():
+                 _log.warning("Stage %s did not terminate cleanly within 30s", self.name)
+
+     # ------------------------------------------------------------------ _run
+     def _run(self) -> None:
+         try:
+             while self._running:
+                 batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                 if not batch and self.input_queue.closed:
+                     break
+                 processed = self._process_batch(batch)
+                 self._emit(processed)
+         except Exception:  # pragma: no cover - top-level guard
+             _log.exception("Fatal error in stage %s", self.name)
+         finally:
+             for q in self._outputs:
+                 q.close()
+
+     # ----------------------------------------------------- _process_batch()
+     def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+         """Run *model* on *batch* grouped by run_id to maximise batching."""
+         groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+         for itm in batch:
+             groups[itm.run_id].append(itm)
+
+         result: list[ThreadedItem] = []
+         for rid, items in groups.items():
+             good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+             if not good:
+                 result.extend(items)
+                 continue
+             try:
+                 # Filter out None payloads and ensure type safety
+                 pages_with_payloads = [
+                     (i, i.payload) for i in good if i.payload is not None
+                 ]
+                 if len(pages_with_payloads) != len(good):
+                     # Some items have None payloads, mark all as failed
+                     for it in items:
+                         it.is_failed = True
+                         it.error = RuntimeError("Page payload is None")
+                     result.extend(items)
+                     continue
+
+                 pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                 processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                 if len(processed_pages) != len(pages):  # strict mismatch guard
+                     raise RuntimeError(
+                         f"Model {self.name} returned wrong number of pages"
+                     )
+                 for idx, page in enumerate(processed_pages):
+                     result.append(
+                         ThreadedItem(
+                             payload=page,
+                             run_id=rid,
+                             page_no=good[idx].page_no,
+                             conv_res=good[idx].conv_res,
+                         )
+                     )
+             except Exception as exc:
+                 _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
+                 for it in items:
+                     it.is_failed = True
+                     it.error = exc
+                 result.extend(items)
+         return result
+
+     # -------------------------------------------------------------- _emit()
+     def _emit(self, items: Iterable[ThreadedItem]) -> None:
+         for item in items:
+             for q in self._outputs:
+                 if not q.put(item):
+                     _log.error("Output queue closed while emitting from %s", self.name)
+
+
+ @dataclass
+ class RunContext:
+     """Wiring for a single *execute* call."""
+
+     stages: list[ThreadedPipelineStage]
+     first_stage: ThreadedPipelineStage
+     output_queue: ThreadedQueue
+
+
+ # ──────────────────────────────────────────────────────────────────────────────
+ # Main pipeline
+ # ──────────────────────────────────────────────────────────────────────────────
+
+
+ class ThreadedStandardPdfPipeline(BasePipeline):
+     """High-performance PDF pipeline with multi-threaded stages."""
+
+     def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+         super().__init__(pipeline_options)
+         self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+         self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+         # initialise heavy models once
+         self._init_models()
+
+     # ────────────────────────────────────────────────────────────────────────
+     # Heavy-model initialisation & helpers
+     # ────────────────────────────────────────────────────────────────────────
+
+     def _init_models(self) -> None:
+         art_path = self._resolve_artifacts_path()
+         self.keep_images = (
+             self.pipeline_options.generate_page_images
+             or self.pipeline_options.generate_picture_images
+             or self.pipeline_options.generate_table_images
+         )
+         self.preprocessing_model = PagePreprocessingModel(
+             options=PagePreprocessingOptions(
+                 images_scale=self.pipeline_options.images_scale
+             )
+         )
+         self.ocr_model = self._make_ocr_model(art_path)
+         self.layout_model = LayoutModel(
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+             options=self.pipeline_options.layout_options,
+         )
+         self.table_model = TableStructureModel(
+             enabled=self.pipeline_options.do_table_structure,
+             artifacts_path=art_path,
+             options=self.pipeline_options.table_structure_options,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+         self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+         # --- optional enrichment ------------------------------------------------
+         self.enrichment_pipe = []
+         code_formula = CodeFormulaModel(
+             enabled=self.pipeline_options.do_code_enrichment
+             or self.pipeline_options.do_formula_enrichment,
+             artifacts_path=art_path,
+             options=CodeFormulaModelOptions(
+                 do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                 do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
+             ),
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         if code_formula.enabled:
+             self.enrichment_pipe.append(code_formula)
+
+         picture_classifier = DocumentPictureClassifier(
+             enabled=self.pipeline_options.do_picture_classification,
+             artifacts_path=art_path,
+             options=DocumentPictureClassifierOptions(),
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         if picture_classifier.enabled:
+             self.enrichment_pipe.append(picture_classifier)
+
+         picture_descr = self._make_picture_description_model(art_path)
+         if picture_descr and picture_descr.enabled:
+             self.enrichment_pipe.append(picture_descr)
+
+         self.keep_backend = any(
+             (
+                 self.pipeline_options.do_formula_enrichment,
+                 self.pipeline_options.do_code_enrichment,
+                 self.pipeline_options.do_picture_classification,
+                 self.pipeline_options.do_picture_description,
+             )
+         )
+
+     # ---------------------------------------------------------------- helpers
+     def _resolve_artifacts_path(self) -> Optional[Path]:
+         if self.pipeline_options.artifacts_path:
+             p = Path(self.pipeline_options.artifacts_path).expanduser()
+         elif settings.artifacts_path:
+             p = Path(settings.artifacts_path).expanduser()
+         else:
+             return None
+         if not p.is_dir():
+             raise RuntimeError(
+                 f"{p} does not exist or is not a directory containing the required models"
+             )
+         return p
+
+     def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
+         factory = get_ocr_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.ocr_options,
+             enabled=self.pipeline_options.do_ocr,
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     def _make_picture_description_model(
+         self, art_path: Optional[Path]
+     ) -> Optional[PictureDescriptionBaseModel]:
+         factory = get_picture_description_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.picture_description_options,
+             enabled=self.pipeline_options.do_picture_description,
+             enable_remote_services=self.pipeline_options.enable_remote_services,
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     # ────────────────────────────────────────────────────────────────────────
+     # Build - thread pipeline
+     # ────────────────────────────────────────────────────────────────────────
+
+     def _create_run_ctx(self) -> RunContext:
+         opts = self.pipeline_options
+         preprocess = ThreadedPipelineStage(
+             name="preprocess",
+             model=self.preprocessing_model,
+             batch_size=1,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         ocr = ThreadedPipelineStage(
+             name="ocr",
+             model=self.ocr_model,
+             batch_size=opts.ocr_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         layout = ThreadedPipelineStage(
+             name="layout",
+             model=self.layout_model,
+             batch_size=opts.layout_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         table = ThreadedPipelineStage(
+             name="table",
+             model=self.table_model,
+             batch_size=opts.table_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         assemble = ThreadedPipelineStage(
+             name="assemble",
+             model=self.assemble_model,
+             batch_size=1,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+
+         # wire stages
+         output_q = ThreadedQueue(opts.queue_max_size)
+         preprocess.add_output_queue(ocr.input_queue)
+         ocr.add_output_queue(layout.input_queue)
+         layout.add_output_queue(table.input_queue)
+         table.add_output_queue(assemble.input_queue)
+         assemble.add_output_queue(output_q)
+
+         stages = [preprocess, ocr, layout, table, assemble]
+         return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)
+
+     # --------------------------------------------------------------------- build
+     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+         """Stream-build the document while interleaving producer and consumer work."""
+         run_id = next(self._run_seq)
+         assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+         backend = conv_res.input._backend
+
+         # preload & initialise pages -------------------------------------------------------------
+         start_page, end_page = conv_res.input.limits.page_range
+         pages: list[Page] = []
+         for i in range(conv_res.input.page_count):
+             if start_page - 1 <= i <= end_page - 1:
+                 page = Page(page_no=i)
+                 page._backend = backend.load_page(i)
+                 if page._backend and page._backend.is_valid():
+                     page.size = page._backend.get_size()
+                     conv_res.pages.append(page)
+                     pages.append(page)
+
+         if not pages:
+             conv_res.status = ConversionStatus.FAILURE
+             return conv_res
+
+         total_pages: int = len(pages)
+         ctx: RunContext = self._create_run_ctx()
+         for st in ctx.stages:
+             st.start()
+
+         proc = ProcessingResult(total_expected=total_pages)
+         fed_idx: int = 0  # number of pages successfully queued
+         batch_size: int = 32  # drain chunk
+         try:
+             while proc.success_count + proc.failure_count < total_pages:
+                 # 1) feed - try to enqueue until the first queue is full
+                 while fed_idx < total_pages:
+                     ok = ctx.first_stage.input_queue.put(
+                         ThreadedItem(
+                             payload=pages[fed_idx],
+                             run_id=run_id,
+                             page_no=pages[fed_idx].page_no,
+                             conv_res=conv_res,
+                         ),
+                         timeout=0.0,  # non-blocking try-put
+                     )
+                     if ok:
+                         fed_idx += 1
+                         if fed_idx == total_pages:
+                             ctx.first_stage.input_queue.close()
+                     else:  # queue full - switch to draining
+                         break
+
+                 # 2) drain - pull whatever is ready from the output side
+                 out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                 for itm in out_batch:
+                     if itm.run_id != run_id:
+                         continue
+                     if itm.is_failed or itm.error:
+                         proc.failed_pages.append(
+                             (itm.page_no, itm.error or RuntimeError("unknown error"))
+                         )
+                     else:
+                         assert itm.payload is not None
+                         proc.pages.append(itm.payload)
+
+                 # 3) failure safety - downstream closed early -> mark missing pages failed
+                 if not out_batch and ctx.output_queue.closed:
+                     missing = total_pages - (proc.success_count + proc.failure_count)
+                     if missing > 0:
+                         proc.failed_pages.extend(
+                             [(-1, RuntimeError("pipeline terminated early"))] * missing
+                         )
+                     break
+         finally:
+             for st in ctx.stages:
+                 st.stop()
+             ctx.output_queue.close()
+
+         self._integrate_results(conv_res, proc)
+         return conv_res
+
+     # ---------------------------------------------------- integrate_results()
+     def _integrate_results(
+         self, conv_res: ConversionResult, proc: ProcessingResult
+     ) -> None:
+         page_map = {p.page_no: p for p in proc.pages}
+         conv_res.pages = [
+             page_map.get(p.page_no, p)
+             for p in conv_res.pages
+             if p.page_no in page_map
+             or not any(fp == p.page_no for fp, _ in proc.failed_pages)
+         ]
+         if proc.is_complete_failure:
+             conv_res.status = ConversionStatus.FAILURE
+         elif proc.is_partial_success:
+             conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+         else:
+             conv_res.status = ConversionStatus.SUCCESS
+         if not self.keep_images:
+             for p in conv_res.pages:
+                 p._image_cache = {}
+         if not self.keep_backend:
+             for p in conv_res.pages:
+                 if p._backend is not None:
+                     p._backend.unload()
+
+     # ---------------------------------------------------------------- assemble
+     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+         elements, headers, body = [], [], []
+         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+             for p in conv_res.pages:
+                 if p.assembled:
+                     elements.extend(p.assembled.elements)
+                     headers.extend(p.assembled.headers)
+                     body.extend(p.assembled.body)
+             conv_res.assembled = AssembledUnit(
+                 elements=elements, headers=headers, body=body
+             )
+             conv_res.document = self.reading_order_model(conv_res)
+         return conv_res
+
+     # ---------------------------------------------------------------- misc
+     @classmethod
+     def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+         return ThreadedPdfPipelineOptions()
+
+     @classmethod
+     def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+         return isinstance(backend, PdfDocumentBackend)
+
+     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+         return conv_res.status
+
+     def _unload(self, conv_res: ConversionResult) -> None:
+         for p in conv_res.pages:
+             if p._backend is not None:
+                 p._backend.unload()
+         if conv_res.input._backend:
+             conv_res.input._backend.unload()
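`ThreadedQueue` is self-contained, so its feed/drain contract is easy to exercise outside the pipeline. A sketch (Python annotations are not enforced at runtime, so plain strings stand in for `ThreadedItem` here):

```python
import threading

from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedQueue

q = ThreadedQueue(max_size=2)


def producer() -> None:
    for word in ("alpha", "beta", "gamma"):
        q.put(word)  # blocks while the queue is full (back-pressure)
    q.close()  # lets the consumer's get_batch() return empty once drained


t = threading.Thread(target=producer)
t.start()
while True:
    batch = q.get_batch(size=2)
    if not batch:
        break  # queue closed and fully drained
    print(batch)
t.join()
```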

docling-2.42.1.dist-info/METADATA → docling-2.43.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docling
- Version: 2.42.1
+ Version: 2.43.0
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
  License-Expression: MIT
@@ -28,9 +28,9 @@ License-File: LICENSE
  Requires-Dist: pydantic<3.0.0,>=2.0.0
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
- Requires-Dist: docling-ibm-models<4,>=3.6.0
+ Requires-Dist: docling-ibm-models<4,>=3.9.0
  Requires-Dist: filetype<2.0.0,>=1.2.0
- Requires-Dist: pypdfium2<5.0.0,>=4.30.0
+ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
  Requires-Dist: huggingface_hub<1,>=0.23
  Requires-Dist: requests<3.0.0,>=2.32.2
@@ -89,6 +89,7 @@ Dynamic: license-file
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)


docling-2.42.1.dist-info/RECORD → docling-2.43.0.dist-info/RECORD
@@ -1,5 +1,5 @@
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- docling/document_converter.py,sha256=9aH8B30_jOYN4P_ySCCvtgEb3GoIpec15r7lEAFlMDU,14469
+ docling/document_converter.py,sha256=pYlozCp6X1iGO75m3KSudMfrSCrXihTlRpKARFN67BI,14757
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,13 +9,13 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
  docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
- docling/backend/html_backend.py,sha256=gGkm3i7FpW2WCJ-_GPpOJNh1LUq1_-vRGyGURuPagck,19284
- docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
+ docling/backend/html_backend.py,sha256=Nuzyp6kyjd0g_MsBEPiWdFWU5w9UM60yWSluwU5C0M4,20310
+ docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
- docling/backend/msword_backend.py,sha256=7mzPCF4bGWZPst5ntoV3aSxH5WUu2nBP-l8lgQT3tdw,44544
+ docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
- docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
+ docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
  docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,10 +37,10 @@ docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF
  docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
  docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
- docling/datamodel/pipeline_options.py,sha256=nlejeQjnJx2RBMkCukDECHGuVEOol9hbsSLUi2ee9hY,10134
+ docling/datamodel/pipeline_options.py,sha256=TaBmCBRjSxyoh79UkpEkPzokLYS8BA2QJam86g9pT5g,10544
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
  docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
- docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
+ docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
  docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/models/api_vlm_model.py,sha256=foBvzaWeHFH1t-VdvRWLdiXiiofhvhjvHqRI0eNA_3w,2923
@@ -49,7 +49,7 @@ docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDW
  docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
  docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
  docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
- docling/models/layout_model.py,sha256=8bfLVKCS2A-ePTQK-T4M2K_Ah-jUVj71YOtwZvZ9rsU,8825
+ docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
  docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
  docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
  docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
@@ -74,9 +74,10 @@ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6D
  docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
- docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
+ docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
  docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
+ docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=Rjdq1x2fRHBA0rMHJ6rqqHzxVVzgTEALBBj5d30oOZ8,26018
  docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
@@ -91,9 +92,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
- docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
- docling-2.42.1.dist-info/METADATA,sha256=d46NOPDEps6dVLLMh3tWBCEQv7b_bwQQ46ndyqVO-ag,10310
- docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
- docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
- docling-2.42.1.dist-info/RECORD,,
+ docling-2.43.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+ docling-2.43.0.dist-info/METADATA,sha256=HS5J6rDKaZ_G_d4p10XgAwrNe-FjmHV-u5EmoTP4hro,10458
+ docling-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ docling-2.43.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+ docling-2.43.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+ docling-2.43.0.dist-info/RECORD,,