docling 2.28.0__py3-none-any.whl → 2.28.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -206,9 +206,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
206
206
  hlevel = int(element.name.replace("h", ""))
207
207
  text = element.text.strip()
208
208
 
209
- if hlevel == 1:
210
- self.content_layer = ContentLayer.BODY
209
+ self.content_layer = ContentLayer.BODY
211
210
 
211
+ if hlevel == 1:
212
212
  for key in self.parents.keys():
213
213
  self.parents[key] = None
214
214
 
@@ -243,7 +243,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
243
243
  self.parents[hlevel] = doc.add_heading(
244
244
  parent=self.parents[hlevel - 1],
245
245
  text=text,
246
- level=hlevel,
246
+ level=hlevel - 1,
247
247
  content_layer=self.content_layer,
248
248
  )
249
249
 
@@ -212,9 +212,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
212
212
  traverse(element)
213
213
  snippet_text = "".join(strings)
214
214
  if len(snippet_text) > 0:
215
- parent_item = doc.add_text(
216
- label=doc_label, parent=parent_item, text=snippet_text
217
- )
215
+ if doc_label == DocItemLabel.SECTION_HEADER:
216
+ parent_item = doc.add_heading(
217
+ text=snippet_text,
218
+ level=element.level - 1,
219
+ parent=parent_item,
220
+ )
221
+ else:
222
+ parent_item = doc.add_text(
223
+ label=doc_label, parent=parent_item, text=snippet_text
224
+ )
218
225
 
219
226
  elif isinstance(element, marko.block.List):
220
227
  has_non_empty_list_items = False
@@ -232,12 +239,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
232
239
  label=label, name=f"list", parent=parent_item
233
240
  )
234
241
 
235
- elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
242
+ elif (
243
+ isinstance(element, marko.block.ListItem)
244
+ and len(element.children) > 0
245
+ and isinstance((first_child := element.children[0]), marko.block.Paragraph)
246
+ ):
236
247
  self._close_table(doc)
237
248
  self._process_inline_text(parent_item, doc)
238
249
  _log.debug(" - List item")
239
250
 
240
- first_child = element.children[0]
241
251
  snippet_text = str(first_child.children[0].children) # type: ignore
242
252
  is_numbered = False
243
253
  if (
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
53
53
  self.max_levels: int = 10
54
54
  self.level_at_new_list: Optional[int] = None
55
55
  self.parents: dict[int, Optional[NodeItem]] = {}
56
+ self.numbered_headers: dict[int, int] = {}
56
57
  for i in range(-1, self.max_levels):
57
58
  self.parents[i] = None
58
59
 
@@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
346
347
  parent=None, label=DocItemLabel.TITLE, text=text
347
348
  )
348
349
  elif "Heading" in p_style_id:
349
- self.add_header(doc, p_level, text)
350
+ style_element = getattr(paragraph.style, "element", None)
351
+ if style_element:
352
+ is_numbered_style = (
353
+ "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
354
+ )
355
+ else:
356
+ is_numbered_style = False
357
+ self.add_header(doc, p_level, text, is_numbered_style)
350
358
 
351
359
  elif len(equations) > 0:
352
360
  if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
415
423
  return
416
424
 
417
425
  def add_header(
418
- self, doc: DoclingDocument, curr_level: Optional[int], text: str
426
+ self,
427
+ doc: DoclingDocument,
428
+ curr_level: Optional[int],
429
+ text: str,
430
+ is_numbered_style: bool = False,
419
431
  ) -> None:
420
432
  level = self.get_level()
421
433
  if isinstance(curr_level, int):
@@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
433
445
  if key >= curr_level:
434
446
  self.parents[key] = None
435
447
 
436
- self.parents[curr_level] = doc.add_heading(
437
- parent=self.parents[curr_level - 1],
438
- text=text,
439
- level=curr_level,
440
- )
448
+ current_level = curr_level
449
+ parent_level = curr_level - 1
450
+ add_level = curr_level
441
451
  else:
442
- self.parents[self.level] = doc.add_heading(
443
- parent=self.parents[self.level - 1],
444
- text=text,
445
- level=1,
446
- )
452
+ current_level = self.level
453
+ parent_level = self.level - 1
454
+ add_level = 1
455
+
456
+ if is_numbered_style:
457
+ if add_level in self.numbered_headers:
458
+ self.numbered_headers[add_level] += 1
459
+ else:
460
+ self.numbered_headers[add_level] = 1
461
+ text = f"{self.numbered_headers[add_level]} {text}"
462
+
463
+ # Reset deeper levels
464
+ next_level = add_level + 1
465
+ while next_level in self.numbered_headers:
466
+ self.numbered_headers[next_level] = 0
467
+ next_level += 1
468
+
469
+ # Scan upper levels
470
+ previous_level = add_level - 1
471
+ while previous_level in self.numbered_headers:
472
+ # MSWord convention: no empty sublevels
473
+ # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
474
+ # is processed as 2.1.1
475
+ if self.numbered_headers[previous_level] == 0:
476
+ self.numbered_headers[previous_level] += 1
477
+
478
+ text = f"{self.numbered_headers[previous_level]}.{text}"
479
+ previous_level -= 1
480
+
481
+ self.parents[current_level] = doc.add_heading(
482
+ parent=self.parents[parent_level],
483
+ text=text,
484
+ level=add_level,
485
+ )
447
486
  return
448
487
 
449
488
  def add_listitem(
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  import math
3
4
  import sys
@@ -181,7 +182,14 @@ class DocumentConverter:
181
182
  )
182
183
  for format in self.allowed_formats
183
184
  }
184
- self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
185
+ self.initialized_pipelines: Dict[
186
+ Tuple[Type[BasePipeline], str], BasePipeline
187
+ ] = {}
188
+
189
+ def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
190
+ """Generate a hash of pipeline options to use as part of the cache key."""
191
+ options_str = str(pipeline_options.model_dump())
192
+ return hashlib.md5(options_str.encode("utf-8")).hexdigest()
185
193
 
186
194
  def initialize_pipeline(self, format: InputFormat):
187
195
  """Initialize the conversion pipeline for the selected format."""
@@ -279,31 +287,36 @@ class DocumentConverter:
279
287
  yield item
280
288
 
281
289
  def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
290
+ """Retrieve or initialize a pipeline, reusing instances based on class and options."""
282
291
  fopt = self.format_to_options.get(doc_format)
283
292
 
284
- if fopt is None:
293
+ if fopt is None or fopt.pipeline_options is None:
285
294
  return None
286
- else:
287
- pipeline_class = fopt.pipeline_cls
288
- pipeline_options = fopt.pipeline_options
289
295
 
290
- if pipeline_options is None:
291
- return None
292
- # TODO this will ignore if different options have been defined for the same pipeline class.
293
- if (
294
- pipeline_class not in self.initialized_pipelines
295
- or self.initialized_pipelines[pipeline_class].pipeline_options
296
- != pipeline_options
297
- ):
298
- self.initialized_pipelines[pipeline_class] = pipeline_class(
296
+ pipeline_class = fopt.pipeline_cls
297
+ pipeline_options = fopt.pipeline_options
298
+ options_hash = self._get_pipeline_options_hash(pipeline_options)
299
+
300
+ # Use a composite key to cache pipelines
301
+ cache_key = (pipeline_class, options_hash)
302
+
303
+ if cache_key not in self.initialized_pipelines:
304
+ _log.info(
305
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
306
+ )
307
+ self.initialized_pipelines[cache_key] = pipeline_class(
299
308
  pipeline_options=pipeline_options
300
309
  )
301
- return self.initialized_pipelines[pipeline_class]
310
+ else:
311
+ _log.debug(
312
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
313
+ )
314
+
315
+ return self.initialized_pipelines[cache_key]
302
316
 
303
317
  def _process_document(
304
318
  self, in_doc: InputDocument, raises_on_error: bool
305
319
  ) -> ConversionResult:
306
-
307
320
  valid = (
308
321
  self.allowed_formats is not None and in_doc.format in self.allowed_formats
309
322
  )
@@ -345,7 +358,6 @@ class DocumentConverter:
345
358
  else:
346
359
  if raises_on_error:
347
360
  raise ConversionError(f"Input document {in_doc.file} is not valid.")
348
-
349
361
  else:
350
362
  # invalid doc or not of desired format
351
363
  conv_res = ConversionResult(
@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
63
63
  def draw_text_boxes(image, cells, show: bool = False):
64
64
  draw = ImageDraw.Draw(image)
65
65
  for c in cells:
66
- x0, y0, x1, y1 = c.bbox.as_tuple()
66
+ x0, y0, x1, y1 = (
67
+ c.to_bounding_box().l,
68
+ c.to_bounding_box().t,
69
+ c.to_bounding_box().r,
70
+ c.to_bounding_box().b,
71
+ )
72
+
67
73
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
68
74
  if show:
69
75
  image.show()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.0
3
+ Version: 2.28.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -10,13 +10,13 @@ docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
10
10
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/backend/docx/latex/latex_dict.py,sha256=a0UC3VLmG1BLN-hGmEaQamzKbDB10fCz0U8qRU--aBw,6613
12
12
  docling/backend/docx/latex/omml.py,sha256=U-mQXNCI9ObUyHDxv6ItvaHlObIEu77PiXS1Vaaah6U,12012
13
- docling/backend/html_backend.py,sha256=i9a5ucsIuf-sn6M8tmKt9Kg_qWqc5OJxhARb6ZNS3wI,19448
13
+ docling/backend/html_backend.py,sha256=FRtzsXjlvHISLa2jZ8_zpGqBN6uAaXpuPpLcLLNY_k4,19448
14
14
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
16
- docling/backend/md_backend.py,sha256=v230PXShYJo2QaabwUHiBpE-EGScHIerjL78zPaJpZM,16837
16
+ docling/backend/md_backend.py,sha256=lqDiKIBHGsA0u-H1n9oVpPlrcpVT4gYRuNXXcyGlftM,17219
17
17
  docling/backend/msexcel_backend.py,sha256=_ZVZFKRRijpg-Xz10xNxu2m-NpDaYvoiBqEZP6GbrgE,11095
18
18
  docling/backend/mspowerpoint_backend.py,sha256=zXdXr8nGJJbPGTgR5_dqq5WmNL1wDCaK0RqFqtuHPqs,17213
19
- docling/backend/msword_backend.py,sha256=VjTvJe249FjHJDBpK0RC4iyosMzmpJLTuFIAPNEdReU,23259
19
+ docling/backend/msword_backend.py,sha256=Eyv-owZrudGFpD6QCgr3wBBAbhUaH6QSb2ifaLsoAzY,24752
20
20
  docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
21
21
  docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
22
22
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -32,7 +32,7 @@ docling/datamodel/base_models.py,sha256=MAHr8LlffZ2uIXZ3AXOsikh_-oQIEYTiwwjsz-dQ
32
32
  docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
33
33
  docling/datamodel/pipeline_options.py,sha256=TpRf_-7UuCjjaytFWA0nL2m-KP4no9jeAjaXRjBLMLE,12593
34
34
  docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
35
- docling/document_converter.py,sha256=LwbnfGzma937EmSrNWMzM-dldI9Cbu4DUgY8gL1OVHo,13184
35
+ docling/document_converter.py,sha256=LCX92FzgmXNJLFVSQfjqH9SGe3zA7FGwARedSigFIpY,13798
36
36
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
37
37
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  docling/models/base_model.py,sha256=9xJ0VIlpR2BzqoEWMC8LYp5Y96QAEKip4b_HCwCDltY,2931
@@ -49,7 +49,7 @@ docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHB
49
49
  docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
50
50
  docling/models/ocr_mac_model.py,sha256=2pZaUWg19go_u88mKWr5y_52PAYEN__GsbyUYLdY4zo,5353
51
51
  docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
52
- docling/models/page_preprocessing_model.py,sha256=wAN2WlW7YnpqyETq6MpEWgUAokUwqGaX_g59sPUQsXo,2903
52
+ docling/models/page_preprocessing_model.py,sha256=Ja7RE1K-2fWxWrxOzNm6QDSGqFf-MY6_uY5OAZ7AQSo,3078
53
53
  docling/models/picture_description_api_model.py,sha256=SRjOkCTBYa1pTIaQffDLUPabljjYrLOQ916MywESEXk,3715
54
54
  docling/models/picture_description_base_model.py,sha256=uRpjBXC2qjpPyWFUt600N1GvmvF-vWwB8f-OTQ7PfDg,2305
55
55
  docling/models/picture_description_vlm_model.py,sha256=I2Un3vfhQVeWEyZ3Sd3Kygw9la2QSZCwDfl_7XVlMm4,4042
@@ -77,8 +77,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
77
77
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
78
78
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
79
79
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
80
- docling-2.28.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
81
- docling-2.28.0.dist-info/METADATA,sha256=miIkWRX5hgrOeGbyYDAiQaymAR6PxK6Qdlss5DR1YhM,9982
82
- docling-2.28.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
- docling-2.28.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
84
- docling-2.28.0.dist-info/RECORD,,
80
+ docling-2.28.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
81
+ docling-2.28.2.dist-info/METADATA,sha256=ZeYjkP0ZzlpqoseGod2_iuJPW9d4B16JCeSo2b61KIw,9982
82
+ docling-2.28.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
+ docling-2.28.2.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
84
+ docling-2.28.2.dist-info/RECORD,,