docling 2.41.0__tar.gz → 2.42.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. {docling-2.41.0 → docling-2.42.0}/PKG-INFO +2 -1
  2. {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/latex/omml.py +9 -1
  3. {docling-2.41.0 → docling-2.42.0}/docling/backend/html_backend.py +25 -17
  4. {docling-2.41.0 → docling-2.42.0}/docling/backend/xml/jats_backend.py +12 -4
  5. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/pipeline_options.py +4 -1
  6. {docling-2.41.0 → docling-2.42.0}/docling/document_converter.py +14 -11
  7. {docling-2.41.0 → docling-2.42.0}/docling/models/picture_description_vlm_model.py +2 -1
  8. {docling-2.41.0 → docling-2.42.0}/docling/utils/layout_postprocessor.py +3 -2
  9. {docling-2.41.0 → docling-2.42.0}/docling.egg-info/PKG-INFO +2 -1
  10. {docling-2.41.0 → docling-2.42.0}/docling.egg-info/requires.txt +1 -0
  11. {docling-2.41.0 → docling-2.42.0}/pyproject.toml +2 -1
  12. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_jats.py +14 -14
  13. {docling-2.41.0 → docling-2.42.0}/LICENSE +0 -0
  14. {docling-2.41.0 → docling-2.42.0}/README.md +0 -0
  15. {docling-2.41.0 → docling-2.42.0}/docling/__init__.py +0 -0
  16. {docling-2.41.0 → docling-2.42.0}/docling/backend/__init__.py +0 -0
  17. {docling-2.41.0 → docling-2.42.0}/docling/backend/abstract_backend.py +0 -0
  18. {docling-2.41.0 → docling-2.42.0}/docling/backend/asciidoc_backend.py +0 -0
  19. {docling-2.41.0 → docling-2.42.0}/docling/backend/csv_backend.py +0 -0
  20. {docling-2.41.0 → docling-2.42.0}/docling/backend/docling_parse_backend.py +0 -0
  21. {docling-2.41.0 → docling-2.42.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  22. {docling-2.41.0 → docling-2.42.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  23. {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/__init__.py +0 -0
  24. {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/latex/__init__.py +0 -0
  25. {docling-2.41.0 → docling-2.42.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  26. {docling-2.41.0 → docling-2.42.0}/docling/backend/json/__init__.py +0 -0
  27. {docling-2.41.0 → docling-2.42.0}/docling/backend/json/docling_json_backend.py +0 -0
  28. {docling-2.41.0 → docling-2.42.0}/docling/backend/md_backend.py +0 -0
  29. {docling-2.41.0 → docling-2.42.0}/docling/backend/msexcel_backend.py +0 -0
  30. {docling-2.41.0 → docling-2.42.0}/docling/backend/mspowerpoint_backend.py +0 -0
  31. {docling-2.41.0 → docling-2.42.0}/docling/backend/msword_backend.py +0 -0
  32. {docling-2.41.0 → docling-2.42.0}/docling/backend/noop_backend.py +0 -0
  33. {docling-2.41.0 → docling-2.42.0}/docling/backend/pdf_backend.py +0 -0
  34. {docling-2.41.0 → docling-2.42.0}/docling/backend/pypdfium2_backend.py +0 -0
  35. {docling-2.41.0 → docling-2.42.0}/docling/backend/xml/__init__.py +0 -0
  36. {docling-2.41.0 → docling-2.42.0}/docling/backend/xml/uspto_backend.py +0 -0
  37. {docling-2.41.0 → docling-2.42.0}/docling/chunking/__init__.py +0 -0
  38. {docling-2.41.0 → docling-2.42.0}/docling/cli/__init__.py +0 -0
  39. {docling-2.41.0 → docling-2.42.0}/docling/cli/main.py +0 -0
  40. {docling-2.41.0 → docling-2.42.0}/docling/cli/models.py +0 -0
  41. {docling-2.41.0 → docling-2.42.0}/docling/cli/tools.py +0 -0
  42. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/__init__.py +0 -0
  43. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/accelerator_options.py +0 -0
  44. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/asr_model_specs.py +0 -0
  45. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/base_models.py +0 -0
  46. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/document.py +0 -0
  47. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/layout_model_specs.py +0 -0
  48. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  49. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  50. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/settings.py +0 -0
  51. {docling-2.41.0 → docling-2.42.0}/docling/datamodel/vlm_model_specs.py +0 -0
  52. {docling-2.41.0 → docling-2.42.0}/docling/exceptions.py +0 -0
  53. {docling-2.41.0 → docling-2.42.0}/docling/models/__init__.py +0 -0
  54. {docling-2.41.0 → docling-2.42.0}/docling/models/api_vlm_model.py +0 -0
  55. {docling-2.41.0 → docling-2.42.0}/docling/models/base_model.py +0 -0
  56. {docling-2.41.0 → docling-2.42.0}/docling/models/base_ocr_model.py +0 -0
  57. {docling-2.41.0 → docling-2.42.0}/docling/models/code_formula_model.py +0 -0
  58. {docling-2.41.0 → docling-2.42.0}/docling/models/document_picture_classifier.py +0 -0
  59. {docling-2.41.0 → docling-2.42.0}/docling/models/easyocr_model.py +0 -0
  60. {docling-2.41.0 → docling-2.42.0}/docling/models/factories/__init__.py +0 -0
  61. {docling-2.41.0 → docling-2.42.0}/docling/models/factories/base_factory.py +0 -0
  62. {docling-2.41.0 → docling-2.42.0}/docling/models/factories/ocr_factory.py +0 -0
  63. {docling-2.41.0 → docling-2.42.0}/docling/models/factories/picture_description_factory.py +0 -0
  64. {docling-2.41.0 → docling-2.42.0}/docling/models/layout_model.py +0 -0
  65. {docling-2.41.0 → docling-2.42.0}/docling/models/ocr_mac_model.py +0 -0
  66. {docling-2.41.0 → docling-2.42.0}/docling/models/page_assemble_model.py +0 -0
  67. {docling-2.41.0 → docling-2.42.0}/docling/models/page_preprocessing_model.py +0 -0
  68. {docling-2.41.0 → docling-2.42.0}/docling/models/picture_description_api_model.py +0 -0
  69. {docling-2.41.0 → docling-2.42.0}/docling/models/picture_description_base_model.py +0 -0
  70. {docling-2.41.0 → docling-2.42.0}/docling/models/plugins/__init__.py +0 -0
  71. {docling-2.41.0 → docling-2.42.0}/docling/models/plugins/defaults.py +0 -0
  72. {docling-2.41.0 → docling-2.42.0}/docling/models/rapid_ocr_model.py +0 -0
  73. {docling-2.41.0 → docling-2.42.0}/docling/models/readingorder_model.py +0 -0
  74. {docling-2.41.0 → docling-2.42.0}/docling/models/table_structure_model.py +0 -0
  75. {docling-2.41.0 → docling-2.42.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  76. {docling-2.41.0 → docling-2.42.0}/docling/models/tesseract_ocr_model.py +0 -0
  77. {docling-2.41.0 → docling-2.42.0}/docling/models/utils/__init__.py +0 -0
  78. {docling-2.41.0 → docling-2.42.0}/docling/models/utils/hf_model_download.py +0 -0
  79. {docling-2.41.0 → docling-2.42.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  80. {docling-2.41.0 → docling-2.42.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  81. {docling-2.41.0 → docling-2.42.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  82. {docling-2.41.0 → docling-2.42.0}/docling/pipeline/__init__.py +0 -0
  83. {docling-2.41.0 → docling-2.42.0}/docling/pipeline/asr_pipeline.py +0 -0
  84. {docling-2.41.0 → docling-2.42.0}/docling/pipeline/base_pipeline.py +0 -0
  85. {docling-2.41.0 → docling-2.42.0}/docling/pipeline/simple_pipeline.py +0 -0
  86. {docling-2.41.0 → docling-2.42.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  87. {docling-2.41.0 → docling-2.42.0}/docling/pipeline/vlm_pipeline.py +0 -0
  88. {docling-2.41.0 → docling-2.42.0}/docling/py.typed +0 -0
  89. {docling-2.41.0 → docling-2.42.0}/docling/utils/__init__.py +0 -0
  90. {docling-2.41.0 → docling-2.42.0}/docling/utils/accelerator_utils.py +0 -0
  91. {docling-2.41.0 → docling-2.42.0}/docling/utils/api_image_request.py +0 -0
  92. {docling-2.41.0 → docling-2.42.0}/docling/utils/export.py +0 -0
  93. {docling-2.41.0 → docling-2.42.0}/docling/utils/glm_utils.py +0 -0
  94. {docling-2.41.0 → docling-2.42.0}/docling/utils/locks.py +0 -0
  95. {docling-2.41.0 → docling-2.42.0}/docling/utils/model_downloader.py +0 -0
  96. {docling-2.41.0 → docling-2.42.0}/docling/utils/ocr_utils.py +0 -0
  97. {docling-2.41.0 → docling-2.42.0}/docling/utils/orientation.py +0 -0
  98. {docling-2.41.0 → docling-2.42.0}/docling/utils/profiling.py +0 -0
  99. {docling-2.41.0 → docling-2.42.0}/docling/utils/utils.py +0 -0
  100. {docling-2.41.0 → docling-2.42.0}/docling/utils/visualization.py +0 -0
  101. {docling-2.41.0 → docling-2.42.0}/docling.egg-info/SOURCES.txt +0 -0
  102. {docling-2.41.0 → docling-2.42.0}/docling.egg-info/dependency_links.txt +0 -0
  103. {docling-2.41.0 → docling-2.42.0}/docling.egg-info/entry_points.txt +0 -0
  104. {docling-2.41.0 → docling-2.42.0}/docling.egg-info/top_level.txt +0 -0
  105. {docling-2.41.0 → docling-2.42.0}/setup.cfg +0 -0
  106. {docling-2.41.0 → docling-2.42.0}/tests/test_asr_pipeline.py +0 -0
  107. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_asciidoc.py +0 -0
  108. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_csv.py +0 -0
  109. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_json.py +0 -0
  110. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_parse.py +0 -0
  111. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_parse_v2.py +0 -0
  112. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_docling_parse_v4.py +0 -0
  113. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_html.py +0 -0
  114. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_markdown.py +0 -0
  115. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_msexcel.py +0 -0
  116. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_msword.py +0 -0
  117. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_patent_uspto.py +0 -0
  118. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_pdfium.py +0 -0
  119. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_pptx.py +0 -0
  120. {docling-2.41.0 → docling-2.42.0}/tests/test_backend_webp.py +0 -0
  121. {docling-2.41.0 → docling-2.42.0}/tests/test_cli.py +0 -0
  122. {docling-2.41.0 → docling-2.42.0}/tests/test_code_formula.py +0 -0
  123. {docling-2.41.0 → docling-2.42.0}/tests/test_data_gen_flag.py +0 -0
  124. {docling-2.41.0 → docling-2.42.0}/tests/test_document_picture_classifier.py +0 -0
  125. {docling-2.41.0 → docling-2.42.0}/tests/test_e2e_conversion.py +0 -0
  126. {docling-2.41.0 → docling-2.42.0}/tests/test_e2e_ocr_conversion.py +0 -0
  127. {docling-2.41.0 → docling-2.42.0}/tests/test_input_doc.py +0 -0
  128. {docling-2.41.0 → docling-2.42.0}/tests/test_interfaces.py +0 -0
  129. {docling-2.41.0 → docling-2.42.0}/tests/test_invalid_input.py +0 -0
  130. {docling-2.41.0 → docling-2.42.0}/tests/test_legacy_format_transform.py +0 -0
  131. {docling-2.41.0 → docling-2.42.0}/tests/test_ocr_utils.py +0 -0
  132. {docling-2.41.0 → docling-2.42.0}/tests/test_options.py +0 -0
  133. {docling-2.41.0 → docling-2.42.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.41.0
3
+ Version: 2.42.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
50
50
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
+ Requires-Dist: accelerate<2,>=1.0.0
53
54
  Provides-Extra: tesserocr
54
55
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
55
56
  Provides-Extra: ocrmac
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
260
260
  the fraction object
261
261
  """
262
262
  c_dict = self.process_children_dict(elm)
263
- pr = c_dict["fPr"]
263
+ pr = c_dict.get("fPr")
264
+ if pr is None:
265
+ # Handle missing fPr element gracefully
266
+ _log.debug("Missing fPr element in fraction, using default formatting")
267
+ latex_s = F_DEFAULT
268
+ return latex_s.format(
269
+ num=c_dict.get("num"),
270
+ den=c_dict.get("den"),
271
+ )
264
272
  latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
265
273
  return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
266
274
 
@@ -379,6 +379,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
379
379
  else:
380
380
  _log.debug(f"list-item has no text: {element}")
381
381
 
382
+ @staticmethod
383
+ def _get_cell_spans(cell: Tag) -> tuple[int, int]:
384
+ """Extract colspan and rowspan values from a table cell tag.
385
+
386
+ This function retrieves the 'colspan' and 'rowspan' attributes from a given
387
+ table cell tag.
388
+ If the attribute does not exist or it is not numeric, it defaults to 1.
389
+ """
390
+ raw_spans: tuple[str, str] = (
391
+ str(cell.get("colspan", "1")),
392
+ str(cell.get("rowspan", "1")),
393
+ )
394
+ int_spans: tuple[int, int] = (
395
+ int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
396
+ int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
397
+ )
398
+
399
+ return int_spans
400
+
382
401
  @staticmethod
383
402
  def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
384
403
  nested_tables = element.find("table")
@@ -398,10 +417,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
398
417
  if not isinstance(row, Tag):
399
418
  continue
400
419
  cell_tag = cast(Tag, cell)
401
- val = cell_tag.get("colspan", "1")
402
- colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
403
- col_count += colspan
404
- if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
420
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
421
+ col_count += col_span
422
+ if cell_tag.name == "td" or row_span == 1:
405
423
  is_row_header = False
406
424
  num_cols = max(num_cols, col_count)
407
425
  if not is_row_header:
@@ -428,10 +446,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
428
446
  row_header = True
429
447
  for html_cell in cells:
430
448
  if isinstance(html_cell, Tag):
449
+ _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
431
450
  if html_cell.name == "td":
432
451
  col_header = False
433
452
  row_header = False
434
- elif html_cell.get("rowspan") is None:
453
+ elif row_span == 1:
435
454
  row_header = False
436
455
  if not row_header:
437
456
  row_idx += 1
@@ -456,18 +475,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
456
475
  text = html_cell.text
457
476
 
458
477
  # label = html_cell.name
459
- col_val = html_cell.get("colspan", "1")
460
- col_span = (
461
- int(col_val)
462
- if isinstance(col_val, str) and col_val.isnumeric()
463
- else 1
464
- )
465
- row_val = html_cell.get("rowspan", "1")
466
- row_span = (
467
- int(row_val)
468
- if isinstance(row_val, str) and row_val.isnumeric()
469
- else 1
470
- )
478
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
471
479
  if row_header:
472
480
  row_span -= 1
473
481
  while (
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
93
93
 
94
94
  # Initialize the root of the document hierarchy
95
95
  self.root: Optional[NodeItem] = None
96
-
97
- self.valid = False
96
+ self.hlevel: int = 0
97
+ self.valid: bool = False
98
98
  try:
99
99
  if isinstance(self.path_or_stream, BytesIO):
100
100
  self.path_or_stream.seek(0)
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
147
147
  binary_hash=self.document_hash,
148
148
  )
149
149
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
150
+ self.hlevel = 0
150
151
 
151
152
  # Get metadata XML components
152
153
  xml_components: XMLComponents = self._parse_metadata()
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
304
305
  title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
305
306
  if not text:
306
307
  continue
307
- parent = doc.add_heading(parent=self.root, text=title)
308
+ parent = doc.add_heading(
309
+ parent=self.root, text=title, level=self.hlevel + 1
310
+ )
308
311
  doc.add_text(
309
312
  parent=parent,
310
313
  text=text,
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
637
640
  elif child.tag == "ack":
638
641
  text = DEFAULT_HEADER_ACKNOWLEDGMENTS
639
642
  if text:
640
- new_parent = doc.add_heading(text=text, parent=parent)
643
+ self.hlevel += 1
644
+ new_parent = doc.add_heading(
645
+ text=text, parent=parent, level=self.hlevel
646
+ )
641
647
  elif child.tag == "list":
642
648
  new_parent = doc.add_group(
643
649
  label=GroupLabel.LIST, name="list", parent=parent
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
694
700
  new_text = self._walk_linear(doc, new_parent, child)
695
701
  if not (node.getparent().tag == "p" and node.tag in flush_tags):
696
702
  node_text += new_text
703
+ if child.tag in ("sec", "ack") and text:
704
+ self.hlevel -= 1
697
705
 
698
706
  # pick up the tail text
699
707
  node_text += child.tail.replace("\n", " ") if child.tail else ""
@@ -217,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
217
217
 
218
218
  # GraniteVision
219
219
  granite_picture_description = PictureDescriptionVlmOptions(
220
- repo_id="ibm-granite/granite-vision-3.2-2b-preview",
220
+ repo_id="ibm-granite/granite-vision-3.3-2b",
221
221
  prompt="What is shown in this image?",
222
222
  )
223
223
 
@@ -279,6 +279,9 @@ class LayoutOptions(BaseModel):
279
279
  """Options for layout processing."""
280
280
 
281
281
  create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
282
+ keep_empty_clusters: bool = (
283
+ False # Whether to keep clusters that contain no text cells
284
+ )
282
285
  model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
283
286
 
284
287
 
@@ -1,6 +1,7 @@
1
1
  import hashlib
2
2
  import logging
3
3
  import sys
4
+ import threading
4
5
  import time
5
6
  from collections.abc import Iterable, Iterator
6
7
  from functools import partial
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
49
50
  from docling.utils.utils import chunkify
50
51
 
51
52
  _log = logging.getLogger(__name__)
53
+ _PIPELINE_CACHE_LOCK = threading.Lock()
52
54
 
53
55
 
54
56
  class FormatOption(BaseModel):
@@ -315,17 +317,18 @@ class DocumentConverter:
315
317
  # Use a composite key to cache pipelines
316
318
  cache_key = (pipeline_class, options_hash)
317
319
 
318
- if cache_key not in self.initialized_pipelines:
319
- _log.info(
320
- f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
321
- )
322
- self.initialized_pipelines[cache_key] = pipeline_class(
323
- pipeline_options=pipeline_options
324
- )
325
- else:
326
- _log.debug(
327
- f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
328
- )
320
+ with _PIPELINE_CACHE_LOCK:
321
+ if cache_key not in self.initialized_pipelines:
322
+ _log.info(
323
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
324
+ )
325
+ self.initialized_pipelines[cache_key] = pipeline_class(
326
+ pipeline_options=pipeline_options
327
+ )
328
+ else:
329
+ _log.debug(
330
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
331
+ )
329
332
 
330
333
  return self.initialized_pipelines[cache_key]
331
334
 
@@ -65,6 +65,7 @@ class PictureDescriptionVlmModel(
65
65
  self.processor = AutoProcessor.from_pretrained(artifacts_path)
66
66
  self.model = AutoModelForVision2Seq.from_pretrained(
67
67
  artifacts_path,
68
+ device_map=self.device,
68
69
  torch_dtype=torch.bfloat16,
69
70
  _attn_implementation=(
70
71
  "flash_attention_2"
@@ -72,7 +73,7 @@ class PictureDescriptionVlmModel(
72
73
  and accelerator_options.cuda_use_flash_attention2
73
74
  else "eager"
74
75
  ),
75
- ).to(self.device)
76
+ )
76
77
 
77
78
  self.provenance = f"{self.options.repo_id}"
78
79
 
@@ -267,8 +267,9 @@ class LayoutPostprocessor:
267
267
  # Initial cell assignment
268
268
  clusters = self._assign_cells_to_clusters(clusters)
269
269
 
270
- # Remove clusters with no cells
271
- clusters = [cluster for cluster in clusters if cluster.cells]
270
+ # Remove clusters with no cells (if keep_empty_clusters is False)
271
+ if not self.options.keep_empty_clusters:
272
+ clusters = [cluster for cluster in clusters if cluster.cells]
272
273
 
273
274
  # Handle orphaned cells
274
275
  unassigned = self._find_unassigned_cells(clusters)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.41.0
3
+ Version: 2.42.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
50
50
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
+ Requires-Dist: accelerate<2,>=1.0.0
53
54
  Provides-Extra: tesserocr
54
55
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
55
56
  Provides-Extra: ocrmac
@@ -23,6 +23,7 @@ tqdm<5.0.0,>=4.65.0
23
23
  pluggy<2.0.0,>=1.0.0
24
24
  pylatexenc<3.0,>=2.10
25
25
  scipy<2.0.0,>=1.6.0
26
+ accelerate<2,>=1.0.0
26
27
 
27
28
  [asr]
28
29
  openai-whisper>=20250625
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.41.0" # DO NOT EDIT, updated automatically
3
+ version = "2.42.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -70,6 +70,7 @@ dependencies = [
70
70
  'scipy (>=1.6.0,<2.0.0)',
71
71
  # 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
72
72
  # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
73
+ "accelerate>=1.0.0,<2",
73
74
  ]
74
75
 
75
76
  [project.urls]
@@ -14,9 +14,9 @@ from .verify_utils import verify_document, verify_export
14
14
  GENERATE = GEN_TEST_DATA
15
15
 
16
16
 
17
- def get_pubmed_paths():
18
- directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
19
- xml_files = sorted(directory.rglob("*.xml"))
17
+ def get_jats_paths():
18
+ directory = Path(os.path.dirname(__file__) + "/data/jats/")
19
+ xml_files = sorted(directory.rglob("*.nxml"))
20
20
  return xml_files
21
21
 
22
22
 
@@ -25,20 +25,20 @@ def get_converter():
25
25
  return converter
26
26
 
27
27
 
28
- def test_e2e_pubmed_conversions(use_stream=False):
29
- pubmed_paths = get_pubmed_paths()
28
+ def test_e2e_jats_conversions(use_stream=False):
29
+ jats_paths = get_jats_paths()
30
30
  converter = get_converter()
31
31
 
32
- for pubmed_path in pubmed_paths:
32
+ for jats_path in jats_paths:
33
33
  gt_path = (
34
- pubmed_path.parent.parent / "groundtruth" / "docling_v2" / pubmed_path.name
34
+ jats_path.parent.parent / "groundtruth" / "docling_v2" / jats_path.name
35
35
  )
36
36
  if use_stream:
37
- buf = BytesIO(pubmed_path.open("rb").read())
38
- stream = DocumentStream(name=pubmed_path.name, stream=buf)
37
+ buf = BytesIO(jats_path.open("rb").read())
38
+ stream = DocumentStream(name=jats_path.name, stream=buf)
39
39
  conv_result: ConversionResult = converter.convert(stream)
40
40
  else:
41
- conv_result: ConversionResult = converter.convert(pubmed_path)
41
+ conv_result: ConversionResult = converter.convert(jats_path)
42
42
  doc: DoclingDocument = conv_result.document
43
43
 
44
44
  pred_md: str = doc.export_to_markdown()
@@ -54,9 +54,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
54
54
  assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
55
55
 
56
56
 
57
- def test_e2e_pubmed_conversions_stream():
58
- test_e2e_pubmed_conversions(use_stream=True)
57
+ def test_e2e_jats_conversions_stream():
58
+ test_e2e_jats_conversions(use_stream=True)
59
59
 
60
60
 
61
- def test_e2e_pubmed_conversions_no_stream():
62
- test_e2e_pubmed_conversions(use_stream=False)
61
+ def test_e2e_jats_conversions_no_stream():
62
+ test_e2e_jats_conversions(use_stream=False)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes