docling 2.28.1__tar.gz → 2.28.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {docling-2.28.1 → docling-2.28.2}/PKG-INFO +1 -1
  2. {docling-2.28.1 → docling-2.28.2}/docling/backend/html_backend.py +3 -3
  3. {docling-2.28.1 → docling-2.28.2}/docling/backend/md_backend.py +15 -5
  4. {docling-2.28.1 → docling-2.28.2}/pyproject.toml +1 -1
  5. {docling-2.28.1 → docling-2.28.2}/LICENSE +0 -0
  6. {docling-2.28.1 → docling-2.28.2}/README.md +0 -0
  7. {docling-2.28.1 → docling-2.28.2}/docling/__init__.py +0 -0
  8. {docling-2.28.1 → docling-2.28.2}/docling/backend/__init__.py +0 -0
  9. {docling-2.28.1 → docling-2.28.2}/docling/backend/abstract_backend.py +0 -0
  10. {docling-2.28.1 → docling-2.28.2}/docling/backend/asciidoc_backend.py +0 -0
  11. {docling-2.28.1 → docling-2.28.2}/docling/backend/csv_backend.py +0 -0
  12. {docling-2.28.1 → docling-2.28.2}/docling/backend/docling_parse_backend.py +0 -0
  13. {docling-2.28.1 → docling-2.28.2}/docling/backend/docling_parse_v2_backend.py +0 -0
  14. {docling-2.28.1 → docling-2.28.2}/docling/backend/docling_parse_v4_backend.py +0 -0
  15. {docling-2.28.1 → docling-2.28.2}/docling/backend/docx/__init__.py +0 -0
  16. {docling-2.28.1 → docling-2.28.2}/docling/backend/docx/latex/__init__.py +0 -0
  17. {docling-2.28.1 → docling-2.28.2}/docling/backend/docx/latex/latex_dict.py +0 -0
  18. {docling-2.28.1 → docling-2.28.2}/docling/backend/docx/latex/omml.py +0 -0
  19. {docling-2.28.1 → docling-2.28.2}/docling/backend/json/__init__.py +0 -0
  20. {docling-2.28.1 → docling-2.28.2}/docling/backend/json/docling_json_backend.py +0 -0
  21. {docling-2.28.1 → docling-2.28.2}/docling/backend/msexcel_backend.py +0 -0
  22. {docling-2.28.1 → docling-2.28.2}/docling/backend/mspowerpoint_backend.py +0 -0
  23. {docling-2.28.1 → docling-2.28.2}/docling/backend/msword_backend.py +0 -0
  24. {docling-2.28.1 → docling-2.28.2}/docling/backend/pdf_backend.py +0 -0
  25. {docling-2.28.1 → docling-2.28.2}/docling/backend/pypdfium2_backend.py +0 -0
  26. {docling-2.28.1 → docling-2.28.2}/docling/backend/xml/__init__.py +0 -0
  27. {docling-2.28.1 → docling-2.28.2}/docling/backend/xml/jats_backend.py +0 -0
  28. {docling-2.28.1 → docling-2.28.2}/docling/backend/xml/uspto_backend.py +0 -0
  29. {docling-2.28.1 → docling-2.28.2}/docling/chunking/__init__.py +0 -0
  30. {docling-2.28.1 → docling-2.28.2}/docling/cli/__init__.py +0 -0
  31. {docling-2.28.1 → docling-2.28.2}/docling/cli/main.py +0 -0
  32. {docling-2.28.1 → docling-2.28.2}/docling/cli/models.py +0 -0
  33. {docling-2.28.1 → docling-2.28.2}/docling/cli/tools.py +0 -0
  34. {docling-2.28.1 → docling-2.28.2}/docling/datamodel/__init__.py +0 -0
  35. {docling-2.28.1 → docling-2.28.2}/docling/datamodel/base_models.py +0 -0
  36. {docling-2.28.1 → docling-2.28.2}/docling/datamodel/document.py +0 -0
  37. {docling-2.28.1 → docling-2.28.2}/docling/datamodel/pipeline_options.py +0 -0
  38. {docling-2.28.1 → docling-2.28.2}/docling/datamodel/settings.py +0 -0
  39. {docling-2.28.1 → docling-2.28.2}/docling/document_converter.py +0 -0
  40. {docling-2.28.1 → docling-2.28.2}/docling/exceptions.py +0 -0
  41. {docling-2.28.1 → docling-2.28.2}/docling/models/__init__.py +0 -0
  42. {docling-2.28.1 → docling-2.28.2}/docling/models/base_model.py +0 -0
  43. {docling-2.28.1 → docling-2.28.2}/docling/models/base_ocr_model.py +0 -0
  44. {docling-2.28.1 → docling-2.28.2}/docling/models/code_formula_model.py +0 -0
  45. {docling-2.28.1 → docling-2.28.2}/docling/models/document_picture_classifier.py +0 -0
  46. {docling-2.28.1 → docling-2.28.2}/docling/models/easyocr_model.py +0 -0
  47. {docling-2.28.1 → docling-2.28.2}/docling/models/factories/__init__.py +0 -0
  48. {docling-2.28.1 → docling-2.28.2}/docling/models/factories/base_factory.py +0 -0
  49. {docling-2.28.1 → docling-2.28.2}/docling/models/factories/ocr_factory.py +0 -0
  50. {docling-2.28.1 → docling-2.28.2}/docling/models/factories/picture_description_factory.py +0 -0
  51. {docling-2.28.1 → docling-2.28.2}/docling/models/hf_mlx_model.py +0 -0
  52. {docling-2.28.1 → docling-2.28.2}/docling/models/hf_vlm_model.py +0 -0
  53. {docling-2.28.1 → docling-2.28.2}/docling/models/layout_model.py +0 -0
  54. {docling-2.28.1 → docling-2.28.2}/docling/models/ocr_mac_model.py +0 -0
  55. {docling-2.28.1 → docling-2.28.2}/docling/models/page_assemble_model.py +0 -0
  56. {docling-2.28.1 → docling-2.28.2}/docling/models/page_preprocessing_model.py +0 -0
  57. {docling-2.28.1 → docling-2.28.2}/docling/models/picture_description_api_model.py +0 -0
  58. {docling-2.28.1 → docling-2.28.2}/docling/models/picture_description_base_model.py +0 -0
  59. {docling-2.28.1 → docling-2.28.2}/docling/models/picture_description_vlm_model.py +0 -0
  60. {docling-2.28.1 → docling-2.28.2}/docling/models/plugins/__init__.py +0 -0
  61. {docling-2.28.1 → docling-2.28.2}/docling/models/plugins/defaults.py +0 -0
  62. {docling-2.28.1 → docling-2.28.2}/docling/models/rapid_ocr_model.py +0 -0
  63. {docling-2.28.1 → docling-2.28.2}/docling/models/readingorder_model.py +0 -0
  64. {docling-2.28.1 → docling-2.28.2}/docling/models/table_structure_model.py +0 -0
  65. {docling-2.28.1 → docling-2.28.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
  66. {docling-2.28.1 → docling-2.28.2}/docling/models/tesseract_ocr_model.py +0 -0
  67. {docling-2.28.1 → docling-2.28.2}/docling/pipeline/__init__.py +0 -0
  68. {docling-2.28.1 → docling-2.28.2}/docling/pipeline/base_pipeline.py +0 -0
  69. {docling-2.28.1 → docling-2.28.2}/docling/pipeline/simple_pipeline.py +0 -0
  70. {docling-2.28.1 → docling-2.28.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  71. {docling-2.28.1 → docling-2.28.2}/docling/pipeline/vlm_pipeline.py +0 -0
  72. {docling-2.28.1 → docling-2.28.2}/docling/py.typed +0 -0
  73. {docling-2.28.1 → docling-2.28.2}/docling/utils/__init__.py +0 -0
  74. {docling-2.28.1 → docling-2.28.2}/docling/utils/accelerator_utils.py +0 -0
  75. {docling-2.28.1 → docling-2.28.2}/docling/utils/export.py +0 -0
  76. {docling-2.28.1 → docling-2.28.2}/docling/utils/glm_utils.py +0 -0
  77. {docling-2.28.1 → docling-2.28.2}/docling/utils/layout_postprocessor.py +0 -0
  78. {docling-2.28.1 → docling-2.28.2}/docling/utils/locks.py +0 -0
  79. {docling-2.28.1 → docling-2.28.2}/docling/utils/model_downloader.py +0 -0
  80. {docling-2.28.1 → docling-2.28.2}/docling/utils/ocr_utils.py +0 -0
  81. {docling-2.28.1 → docling-2.28.2}/docling/utils/profiling.py +0 -0
  82. {docling-2.28.1 → docling-2.28.2}/docling/utils/utils.py +0 -0
  83. {docling-2.28.1 → docling-2.28.2}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.1
3
+ Version: 2.28.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -206,9 +206,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
206
206
  hlevel = int(element.name.replace("h", ""))
207
207
  text = element.text.strip()
208
208
 
209
- if hlevel == 1:
210
- self.content_layer = ContentLayer.BODY
209
+ self.content_layer = ContentLayer.BODY
211
210
 
211
+ if hlevel == 1:
212
212
  for key in self.parents.keys():
213
213
  self.parents[key] = None
214
214
 
@@ -243,7 +243,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
243
243
  self.parents[hlevel] = doc.add_heading(
244
244
  parent=self.parents[hlevel - 1],
245
245
  text=text,
246
- level=hlevel,
246
+ level=hlevel - 1,
247
247
  content_layer=self.content_layer,
248
248
  )
249
249
 
@@ -212,9 +212,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
212
212
  traverse(element)
213
213
  snippet_text = "".join(strings)
214
214
  if len(snippet_text) > 0:
215
- parent_item = doc.add_text(
216
- label=doc_label, parent=parent_item, text=snippet_text
217
- )
215
+ if doc_label == DocItemLabel.SECTION_HEADER:
216
+ parent_item = doc.add_heading(
217
+ text=snippet_text,
218
+ level=element.level - 1,
219
+ parent=parent_item,
220
+ )
221
+ else:
222
+ parent_item = doc.add_text(
223
+ label=doc_label, parent=parent_item, text=snippet_text
224
+ )
218
225
 
219
226
  elif isinstance(element, marko.block.List):
220
227
  has_non_empty_list_items = False
@@ -232,12 +239,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
232
239
  label=label, name=f"list", parent=parent_item
233
240
  )
234
241
 
235
- elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
242
+ elif (
243
+ isinstance(element, marko.block.ListItem)
244
+ and len(element.children) > 0
245
+ and isinstance((first_child := element.children[0]), marko.block.Paragraph)
246
+ ):
236
247
  self._close_table(doc)
237
248
  self._process_inline_text(parent_item, doc)
238
249
  _log.debug(" - List item")
239
250
 
240
- first_child = element.children[0]
241
251
  snippet_text = str(first_child.children[0].children) # type: ignore
242
252
  is_numbered = False
243
253
  if (
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.28.1" # DO NOT EDIT, updated automatically
3
+ version = "2.28.2" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = [
6
6
  "Christoph Auer <cau@zurich.ibm.com>",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes