docling 2.28.0__tar.gz → 2.28.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {docling-2.28.0 → docling-2.28.1}/PKG-INFO +1 -1
  2. {docling-2.28.0 → docling-2.28.1}/docling/backend/msword_backend.py +51 -12
  3. {docling-2.28.0 → docling-2.28.1}/docling/document_converter.py +29 -17
  4. {docling-2.28.0 → docling-2.28.1}/docling/models/page_preprocessing_model.py +7 -1
  5. {docling-2.28.0 → docling-2.28.1}/pyproject.toml +1 -1
  6. {docling-2.28.0 → docling-2.28.1}/LICENSE +0 -0
  7. {docling-2.28.0 → docling-2.28.1}/README.md +0 -0
  8. {docling-2.28.0 → docling-2.28.1}/docling/__init__.py +0 -0
  9. {docling-2.28.0 → docling-2.28.1}/docling/backend/__init__.py +0 -0
  10. {docling-2.28.0 → docling-2.28.1}/docling/backend/abstract_backend.py +0 -0
  11. {docling-2.28.0 → docling-2.28.1}/docling/backend/asciidoc_backend.py +0 -0
  12. {docling-2.28.0 → docling-2.28.1}/docling/backend/csv_backend.py +0 -0
  13. {docling-2.28.0 → docling-2.28.1}/docling/backend/docling_parse_backend.py +0 -0
  14. {docling-2.28.0 → docling-2.28.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  15. {docling-2.28.0 → docling-2.28.1}/docling/backend/docling_parse_v4_backend.py +0 -0
  16. {docling-2.28.0 → docling-2.28.1}/docling/backend/docx/__init__.py +0 -0
  17. {docling-2.28.0 → docling-2.28.1}/docling/backend/docx/latex/__init__.py +0 -0
  18. {docling-2.28.0 → docling-2.28.1}/docling/backend/docx/latex/latex_dict.py +0 -0
  19. {docling-2.28.0 → docling-2.28.1}/docling/backend/docx/latex/omml.py +0 -0
  20. {docling-2.28.0 → docling-2.28.1}/docling/backend/html_backend.py +0 -0
  21. {docling-2.28.0 → docling-2.28.1}/docling/backend/json/__init__.py +0 -0
  22. {docling-2.28.0 → docling-2.28.1}/docling/backend/json/docling_json_backend.py +0 -0
  23. {docling-2.28.0 → docling-2.28.1}/docling/backend/md_backend.py +0 -0
  24. {docling-2.28.0 → docling-2.28.1}/docling/backend/msexcel_backend.py +0 -0
  25. {docling-2.28.0 → docling-2.28.1}/docling/backend/mspowerpoint_backend.py +0 -0
  26. {docling-2.28.0 → docling-2.28.1}/docling/backend/pdf_backend.py +0 -0
  27. {docling-2.28.0 → docling-2.28.1}/docling/backend/pypdfium2_backend.py +0 -0
  28. {docling-2.28.0 → docling-2.28.1}/docling/backend/xml/__init__.py +0 -0
  29. {docling-2.28.0 → docling-2.28.1}/docling/backend/xml/jats_backend.py +0 -0
  30. {docling-2.28.0 → docling-2.28.1}/docling/backend/xml/uspto_backend.py +0 -0
  31. {docling-2.28.0 → docling-2.28.1}/docling/chunking/__init__.py +0 -0
  32. {docling-2.28.0 → docling-2.28.1}/docling/cli/__init__.py +0 -0
  33. {docling-2.28.0 → docling-2.28.1}/docling/cli/main.py +0 -0
  34. {docling-2.28.0 → docling-2.28.1}/docling/cli/models.py +0 -0
  35. {docling-2.28.0 → docling-2.28.1}/docling/cli/tools.py +0 -0
  36. {docling-2.28.0 → docling-2.28.1}/docling/datamodel/__init__.py +0 -0
  37. {docling-2.28.0 → docling-2.28.1}/docling/datamodel/base_models.py +0 -0
  38. {docling-2.28.0 → docling-2.28.1}/docling/datamodel/document.py +0 -0
  39. {docling-2.28.0 → docling-2.28.1}/docling/datamodel/pipeline_options.py +0 -0
  40. {docling-2.28.0 → docling-2.28.1}/docling/datamodel/settings.py +0 -0
  41. {docling-2.28.0 → docling-2.28.1}/docling/exceptions.py +0 -0
  42. {docling-2.28.0 → docling-2.28.1}/docling/models/__init__.py +0 -0
  43. {docling-2.28.0 → docling-2.28.1}/docling/models/base_model.py +0 -0
  44. {docling-2.28.0 → docling-2.28.1}/docling/models/base_ocr_model.py +0 -0
  45. {docling-2.28.0 → docling-2.28.1}/docling/models/code_formula_model.py +0 -0
  46. {docling-2.28.0 → docling-2.28.1}/docling/models/document_picture_classifier.py +0 -0
  47. {docling-2.28.0 → docling-2.28.1}/docling/models/easyocr_model.py +0 -0
  48. {docling-2.28.0 → docling-2.28.1}/docling/models/factories/__init__.py +0 -0
  49. {docling-2.28.0 → docling-2.28.1}/docling/models/factories/base_factory.py +0 -0
  50. {docling-2.28.0 → docling-2.28.1}/docling/models/factories/ocr_factory.py +0 -0
  51. {docling-2.28.0 → docling-2.28.1}/docling/models/factories/picture_description_factory.py +0 -0
  52. {docling-2.28.0 → docling-2.28.1}/docling/models/hf_mlx_model.py +0 -0
  53. {docling-2.28.0 → docling-2.28.1}/docling/models/hf_vlm_model.py +0 -0
  54. {docling-2.28.0 → docling-2.28.1}/docling/models/layout_model.py +0 -0
  55. {docling-2.28.0 → docling-2.28.1}/docling/models/ocr_mac_model.py +0 -0
  56. {docling-2.28.0 → docling-2.28.1}/docling/models/page_assemble_model.py +0 -0
  57. {docling-2.28.0 → docling-2.28.1}/docling/models/picture_description_api_model.py +0 -0
  58. {docling-2.28.0 → docling-2.28.1}/docling/models/picture_description_base_model.py +0 -0
  59. {docling-2.28.0 → docling-2.28.1}/docling/models/picture_description_vlm_model.py +0 -0
  60. {docling-2.28.0 → docling-2.28.1}/docling/models/plugins/__init__.py +0 -0
  61. {docling-2.28.0 → docling-2.28.1}/docling/models/plugins/defaults.py +0 -0
  62. {docling-2.28.0 → docling-2.28.1}/docling/models/rapid_ocr_model.py +0 -0
  63. {docling-2.28.0 → docling-2.28.1}/docling/models/readingorder_model.py +0 -0
  64. {docling-2.28.0 → docling-2.28.1}/docling/models/table_structure_model.py +0 -0
  65. {docling-2.28.0 → docling-2.28.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  66. {docling-2.28.0 → docling-2.28.1}/docling/models/tesseract_ocr_model.py +0 -0
  67. {docling-2.28.0 → docling-2.28.1}/docling/pipeline/__init__.py +0 -0
  68. {docling-2.28.0 → docling-2.28.1}/docling/pipeline/base_pipeline.py +0 -0
  69. {docling-2.28.0 → docling-2.28.1}/docling/pipeline/simple_pipeline.py +0 -0
  70. {docling-2.28.0 → docling-2.28.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  71. {docling-2.28.0 → docling-2.28.1}/docling/pipeline/vlm_pipeline.py +0 -0
  72. {docling-2.28.0 → docling-2.28.1}/docling/py.typed +0 -0
  73. {docling-2.28.0 → docling-2.28.1}/docling/utils/__init__.py +0 -0
  74. {docling-2.28.0 → docling-2.28.1}/docling/utils/accelerator_utils.py +0 -0
  75. {docling-2.28.0 → docling-2.28.1}/docling/utils/export.py +0 -0
  76. {docling-2.28.0 → docling-2.28.1}/docling/utils/glm_utils.py +0 -0
  77. {docling-2.28.0 → docling-2.28.1}/docling/utils/layout_postprocessor.py +0 -0
  78. {docling-2.28.0 → docling-2.28.1}/docling/utils/locks.py +0 -0
  79. {docling-2.28.0 → docling-2.28.1}/docling/utils/model_downloader.py +0 -0
  80. {docling-2.28.0 → docling-2.28.1}/docling/utils/ocr_utils.py +0 -0
  81. {docling-2.28.0 → docling-2.28.1}/docling/utils/profiling.py +0 -0
  82. {docling-2.28.0 → docling-2.28.1}/docling/utils/utils.py +0 -0
  83. {docling-2.28.0 → docling-2.28.1}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.0
3
+ Version: 2.28.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
53
53
  self.max_levels: int = 10
54
54
  self.level_at_new_list: Optional[int] = None
55
55
  self.parents: dict[int, Optional[NodeItem]] = {}
56
+ self.numbered_headers: dict[int, int] = {}
56
57
  for i in range(-1, self.max_levels):
57
58
  self.parents[i] = None
58
59
 
@@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
346
347
  parent=None, label=DocItemLabel.TITLE, text=text
347
348
  )
348
349
  elif "Heading" in p_style_id:
349
- self.add_header(doc, p_level, text)
350
+ style_element = getattr(paragraph.style, "element", None)
351
+ if style_element:
352
+ is_numbered_style = (
353
+ "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
354
+ )
355
+ else:
356
+ is_numbered_style = False
357
+ self.add_header(doc, p_level, text, is_numbered_style)
350
358
 
351
359
  elif len(equations) > 0:
352
360
  if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
415
423
  return
416
424
 
417
425
  def add_header(
418
- self, doc: DoclingDocument, curr_level: Optional[int], text: str
426
+ self,
427
+ doc: DoclingDocument,
428
+ curr_level: Optional[int],
429
+ text: str,
430
+ is_numbered_style: bool = False,
419
431
  ) -> None:
420
432
  level = self.get_level()
421
433
  if isinstance(curr_level, int):
@@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
433
445
  if key >= curr_level:
434
446
  self.parents[key] = None
435
447
 
436
- self.parents[curr_level] = doc.add_heading(
437
- parent=self.parents[curr_level - 1],
438
- text=text,
439
- level=curr_level,
440
- )
448
+ current_level = curr_level
449
+ parent_level = curr_level - 1
450
+ add_level = curr_level
441
451
  else:
442
- self.parents[self.level] = doc.add_heading(
443
- parent=self.parents[self.level - 1],
444
- text=text,
445
- level=1,
446
- )
452
+ current_level = self.level
453
+ parent_level = self.level - 1
454
+ add_level = 1
455
+
456
+ if is_numbered_style:
457
+ if add_level in self.numbered_headers:
458
+ self.numbered_headers[add_level] += 1
459
+ else:
460
+ self.numbered_headers[add_level] = 1
461
+ text = f"{self.numbered_headers[add_level]} {text}"
462
+
463
+ # Reset deeper levels
464
+ next_level = add_level + 1
465
+ while next_level in self.numbered_headers:
466
+ self.numbered_headers[next_level] = 0
467
+ next_level += 1
468
+
469
+ # Scan upper levels
470
+ previous_level = add_level - 1
471
+ while previous_level in self.numbered_headers:
472
+ # MSWord convention: no empty sublevels
473
+ # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
474
+ # is processed as 2.1.1
475
+ if self.numbered_headers[previous_level] == 0:
476
+ self.numbered_headers[previous_level] += 1
477
+
478
+ text = f"{self.numbered_headers[previous_level]}.{text}"
479
+ previous_level -= 1
480
+
481
+ self.parents[current_level] = doc.add_heading(
482
+ parent=self.parents[parent_level],
483
+ text=text,
484
+ level=add_level,
485
+ )
447
486
  return
448
487
 
449
488
  def add_listitem(
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  import math
3
4
  import sys
@@ -181,7 +182,14 @@ class DocumentConverter:
181
182
  )
182
183
  for format in self.allowed_formats
183
184
  }
184
- self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
185
+ self.initialized_pipelines: Dict[
186
+ Tuple[Type[BasePipeline], str], BasePipeline
187
+ ] = {}
188
+
189
+ def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
190
+ """Generate a hash of pipeline options to use as part of the cache key."""
191
+ options_str = str(pipeline_options.model_dump())
192
+ return hashlib.md5(options_str.encode("utf-8")).hexdigest()
185
193
 
186
194
  def initialize_pipeline(self, format: InputFormat):
187
195
  """Initialize the conversion pipeline for the selected format."""
@@ -279,31 +287,36 @@ class DocumentConverter:
279
287
  yield item
280
288
 
281
289
  def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
290
+ """Retrieve or initialize a pipeline, reusing instances based on class and options."""
282
291
  fopt = self.format_to_options.get(doc_format)
283
292
 
284
- if fopt is None:
293
+ if fopt is None or fopt.pipeline_options is None:
285
294
  return None
286
- else:
287
- pipeline_class = fopt.pipeline_cls
288
- pipeline_options = fopt.pipeline_options
289
295
 
290
- if pipeline_options is None:
291
- return None
292
- # TODO this will ignore if different options have been defined for the same pipeline class.
293
- if (
294
- pipeline_class not in self.initialized_pipelines
295
- or self.initialized_pipelines[pipeline_class].pipeline_options
296
- != pipeline_options
297
- ):
298
- self.initialized_pipelines[pipeline_class] = pipeline_class(
296
+ pipeline_class = fopt.pipeline_cls
297
+ pipeline_options = fopt.pipeline_options
298
+ options_hash = self._get_pipeline_options_hash(pipeline_options)
299
+
300
+ # Use a composite key to cache pipelines
301
+ cache_key = (pipeline_class, options_hash)
302
+
303
+ if cache_key not in self.initialized_pipelines:
304
+ _log.info(
305
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
306
+ )
307
+ self.initialized_pipelines[cache_key] = pipeline_class(
299
308
  pipeline_options=pipeline_options
300
309
  )
301
- return self.initialized_pipelines[pipeline_class]
310
+ else:
311
+ _log.debug(
312
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
313
+ )
314
+
315
+ return self.initialized_pipelines[cache_key]
302
316
 
303
317
  def _process_document(
304
318
  self, in_doc: InputDocument, raises_on_error: bool
305
319
  ) -> ConversionResult:
306
-
307
320
  valid = (
308
321
  self.allowed_formats is not None and in_doc.format in self.allowed_formats
309
322
  )
@@ -345,7 +358,6 @@ class DocumentConverter:
345
358
  else:
346
359
  if raises_on_error:
347
360
  raise ConversionError(f"Input document {in_doc.file} is not valid.")
348
-
349
361
  else:
350
362
  # invalid doc or not of desired format
351
363
  conv_res = ConversionResult(
@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
63
63
  def draw_text_boxes(image, cells, show: bool = False):
64
64
  draw = ImageDraw.Draw(image)
65
65
  for c in cells:
66
- x0, y0, x1, y1 = c.bbox.as_tuple()
66
+ x0, y0, x1, y1 = (
67
+ c.to_bounding_box().l,
68
+ c.to_bounding_box().t,
69
+ c.to_bounding_box().r,
70
+ c.to_bounding_box().b,
71
+ )
72
+
67
73
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
68
74
  if show:
69
75
  image.show()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.28.0" # DO NOT EDIT, updated automatically
3
+ version = "2.28.1" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = [
6
6
  "Christoph Auer <cau@zurich.ibm.com>",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes