docling 2.44.0__tar.gz → 2.46.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. {docling-2.44.0 → docling-2.46.0}/PKG-INFO +2 -2
  2. {docling-2.44.0 → docling-2.46.0}/docling/backend/docling_parse_v4_backend.py +61 -27
  3. {docling-2.44.0 → docling-2.46.0}/docling/backend/html_backend.py +356 -80
  4. docling-2.46.0/docling/backend/mets_gbs_backend.py +399 -0
  5. {docling-2.44.0 → docling-2.46.0}/docling/backend/pdf_backend.py +3 -3
  6. {docling-2.44.0 → docling-2.46.0}/docling/cli/main.py +10 -0
  7. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/base_models.py +3 -0
  8. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/document.py +26 -0
  9. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/pipeline_options.py +1 -3
  10. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/pipeline_options_vlm_model.py +8 -2
  11. {docling-2.44.0 → docling-2.46.0}/docling/document_converter.py +4 -0
  12. {docling-2.44.0 → docling-2.46.0}/docling/models/api_vlm_model.py +2 -5
  13. {docling-2.44.0 → docling-2.46.0}/docling/models/code_formula_model.py +87 -76
  14. {docling-2.44.0 → docling-2.46.0}/docling/models/tesseract_ocr_cli_model.py +4 -2
  15. {docling-2.44.0 → docling-2.46.0}/docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
  16. {docling-2.44.0 → docling-2.46.0}/docling/models/vlm_models_inline/mlx_model.py +2 -4
  17. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/base_pipeline.py +14 -5
  18. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +6 -4
  19. {docling-2.44.0 → docling-2.46.0}/docling.egg-info/PKG-INFO +2 -2
  20. {docling-2.44.0 → docling-2.46.0}/docling.egg-info/SOURCES.txt +2 -0
  21. {docling-2.44.0 → docling-2.46.0}/docling.egg-info/requires.txt +1 -1
  22. {docling-2.44.0 → docling-2.46.0}/pyproject.toml +2 -2
  23. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_html.py +52 -0
  24. docling-2.46.0/tests/test_backend_mets_gbs.py +77 -0
  25. {docling-2.44.0 → docling-2.46.0}/tests/test_e2e_conversion.py +1 -0
  26. {docling-2.44.0 → docling-2.46.0}/tests/test_interfaces.py +3 -0
  27. {docling-2.44.0 → docling-2.46.0}/LICENSE +0 -0
  28. {docling-2.44.0 → docling-2.46.0}/README.md +0 -0
  29. {docling-2.44.0 → docling-2.46.0}/docling/__init__.py +0 -0
  30. {docling-2.44.0 → docling-2.46.0}/docling/backend/__init__.py +0 -0
  31. {docling-2.44.0 → docling-2.46.0}/docling/backend/abstract_backend.py +0 -0
  32. {docling-2.44.0 → docling-2.46.0}/docling/backend/asciidoc_backend.py +0 -0
  33. {docling-2.44.0 → docling-2.46.0}/docling/backend/csv_backend.py +0 -0
  34. {docling-2.44.0 → docling-2.46.0}/docling/backend/docling_parse_backend.py +0 -0
  35. {docling-2.44.0 → docling-2.46.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  36. {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/__init__.py +0 -0
  37. {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/latex/__init__.py +0 -0
  38. {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  39. {docling-2.44.0 → docling-2.46.0}/docling/backend/docx/latex/omml.py +0 -0
  40. {docling-2.44.0 → docling-2.46.0}/docling/backend/json/__init__.py +0 -0
  41. {docling-2.44.0 → docling-2.46.0}/docling/backend/json/docling_json_backend.py +0 -0
  42. {docling-2.44.0 → docling-2.46.0}/docling/backend/md_backend.py +0 -0
  43. {docling-2.44.0 → docling-2.46.0}/docling/backend/msexcel_backend.py +0 -0
  44. {docling-2.44.0 → docling-2.46.0}/docling/backend/mspowerpoint_backend.py +0 -0
  45. {docling-2.44.0 → docling-2.46.0}/docling/backend/msword_backend.py +0 -0
  46. {docling-2.44.0 → docling-2.46.0}/docling/backend/noop_backend.py +0 -0
  47. {docling-2.44.0 → docling-2.46.0}/docling/backend/pypdfium2_backend.py +0 -0
  48. {docling-2.44.0 → docling-2.46.0}/docling/backend/xml/__init__.py +0 -0
  49. {docling-2.44.0 → docling-2.46.0}/docling/backend/xml/jats_backend.py +0 -0
  50. {docling-2.44.0 → docling-2.46.0}/docling/backend/xml/uspto_backend.py +0 -0
  51. {docling-2.44.0 → docling-2.46.0}/docling/chunking/__init__.py +0 -0
  52. {docling-2.44.0 → docling-2.46.0}/docling/cli/__init__.py +0 -0
  53. {docling-2.44.0 → docling-2.46.0}/docling/cli/models.py +0 -0
  54. {docling-2.44.0 → docling-2.46.0}/docling/cli/tools.py +0 -0
  55. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/__init__.py +0 -0
  56. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/accelerator_options.py +0 -0
  57. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/asr_model_specs.py +0 -0
  58. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/layout_model_specs.py +0 -0
  59. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  60. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/settings.py +0 -0
  61. {docling-2.44.0 → docling-2.46.0}/docling/datamodel/vlm_model_specs.py +0 -0
  62. {docling-2.44.0 → docling-2.46.0}/docling/exceptions.py +0 -0
  63. {docling-2.44.0 → docling-2.46.0}/docling/models/__init__.py +0 -0
  64. {docling-2.44.0 → docling-2.46.0}/docling/models/base_model.py +0 -0
  65. {docling-2.44.0 → docling-2.46.0}/docling/models/base_ocr_model.py +0 -0
  66. {docling-2.44.0 → docling-2.46.0}/docling/models/document_picture_classifier.py +0 -0
  67. {docling-2.44.0 → docling-2.46.0}/docling/models/easyocr_model.py +0 -0
  68. {docling-2.44.0 → docling-2.46.0}/docling/models/factories/__init__.py +0 -0
  69. {docling-2.44.0 → docling-2.46.0}/docling/models/factories/base_factory.py +0 -0
  70. {docling-2.44.0 → docling-2.46.0}/docling/models/factories/ocr_factory.py +0 -0
  71. {docling-2.44.0 → docling-2.46.0}/docling/models/factories/picture_description_factory.py +0 -0
  72. {docling-2.44.0 → docling-2.46.0}/docling/models/layout_model.py +0 -0
  73. {docling-2.44.0 → docling-2.46.0}/docling/models/ocr_mac_model.py +0 -0
  74. {docling-2.44.0 → docling-2.46.0}/docling/models/page_assemble_model.py +0 -0
  75. {docling-2.44.0 → docling-2.46.0}/docling/models/page_preprocessing_model.py +0 -0
  76. {docling-2.44.0 → docling-2.46.0}/docling/models/picture_description_api_model.py +0 -0
  77. {docling-2.44.0 → docling-2.46.0}/docling/models/picture_description_base_model.py +0 -0
  78. {docling-2.44.0 → docling-2.46.0}/docling/models/picture_description_vlm_model.py +0 -0
  79. {docling-2.44.0 → docling-2.46.0}/docling/models/plugins/__init__.py +0 -0
  80. {docling-2.44.0 → docling-2.46.0}/docling/models/plugins/defaults.py +0 -0
  81. {docling-2.44.0 → docling-2.46.0}/docling/models/rapid_ocr_model.py +0 -0
  82. {docling-2.44.0 → docling-2.46.0}/docling/models/readingorder_model.py +0 -0
  83. {docling-2.44.0 → docling-2.46.0}/docling/models/table_structure_model.py +0 -0
  84. {docling-2.44.0 → docling-2.46.0}/docling/models/tesseract_ocr_model.py +0 -0
  85. {docling-2.44.0 → docling-2.46.0}/docling/models/utils/__init__.py +0 -0
  86. {docling-2.44.0 → docling-2.46.0}/docling/models/utils/hf_model_download.py +0 -0
  87. {docling-2.44.0 → docling-2.46.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  88. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/__init__.py +0 -0
  89. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/asr_pipeline.py +0 -0
  90. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/simple_pipeline.py +0 -0
  91. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  92. {docling-2.44.0 → docling-2.46.0}/docling/pipeline/vlm_pipeline.py +0 -0
  93. {docling-2.44.0 → docling-2.46.0}/docling/py.typed +0 -0
  94. {docling-2.44.0 → docling-2.46.0}/docling/utils/__init__.py +0 -0
  95. {docling-2.44.0 → docling-2.46.0}/docling/utils/accelerator_utils.py +0 -0
  96. {docling-2.44.0 → docling-2.46.0}/docling/utils/api_image_request.py +0 -0
  97. {docling-2.44.0 → docling-2.46.0}/docling/utils/export.py +0 -0
  98. {docling-2.44.0 → docling-2.46.0}/docling/utils/glm_utils.py +0 -0
  99. {docling-2.44.0 → docling-2.46.0}/docling/utils/layout_postprocessor.py +0 -0
  100. {docling-2.44.0 → docling-2.46.0}/docling/utils/locks.py +0 -0
  101. {docling-2.44.0 → docling-2.46.0}/docling/utils/model_downloader.py +0 -0
  102. {docling-2.44.0 → docling-2.46.0}/docling/utils/ocr_utils.py +0 -0
  103. {docling-2.44.0 → docling-2.46.0}/docling/utils/orientation.py +0 -0
  104. {docling-2.44.0 → docling-2.46.0}/docling/utils/profiling.py +0 -0
  105. {docling-2.44.0 → docling-2.46.0}/docling/utils/utils.py +0 -0
  106. {docling-2.44.0 → docling-2.46.0}/docling/utils/visualization.py +0 -0
  107. {docling-2.44.0 → docling-2.46.0}/docling.egg-info/dependency_links.txt +0 -0
  108. {docling-2.44.0 → docling-2.46.0}/docling.egg-info/entry_points.txt +0 -0
  109. {docling-2.44.0 → docling-2.46.0}/docling.egg-info/top_level.txt +0 -0
  110. {docling-2.44.0 → docling-2.46.0}/setup.cfg +0 -0
  111. {docling-2.44.0 → docling-2.46.0}/tests/test_asr_pipeline.py +0 -0
  112. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_asciidoc.py +0 -0
  113. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_csv.py +0 -0
  114. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_json.py +0 -0
  115. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_parse.py +0 -0
  116. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_parse_v2.py +0 -0
  117. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_docling_parse_v4.py +0 -0
  118. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_jats.py +0 -0
  119. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_markdown.py +0 -0
  120. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_msexcel.py +0 -0
  121. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_msword.py +0 -0
  122. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_patent_uspto.py +0 -0
  123. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_pdfium.py +0 -0
  124. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_pptx.py +0 -0
  125. {docling-2.44.0 → docling-2.46.0}/tests/test_backend_webp.py +0 -0
  126. {docling-2.44.0 → docling-2.46.0}/tests/test_cli.py +0 -0
  127. {docling-2.44.0 → docling-2.46.0}/tests/test_code_formula.py +0 -0
  128. {docling-2.44.0 → docling-2.46.0}/tests/test_data_gen_flag.py +0 -0
  129. {docling-2.44.0 → docling-2.46.0}/tests/test_document_picture_classifier.py +0 -0
  130. {docling-2.44.0 → docling-2.46.0}/tests/test_e2e_ocr_conversion.py +0 -0
  131. {docling-2.44.0 → docling-2.46.0}/tests/test_input_doc.py +0 -0
  132. {docling-2.44.0 → docling-2.46.0}/tests/test_invalid_input.py +0 -0
  133. {docling-2.44.0 → docling-2.46.0}/tests/test_legacy_format_transform.py +0 -0
  134. {docling-2.44.0 → docling-2.46.0}/tests/test_ocr_utils.py +0 -0
  135. {docling-2.44.0 → docling-2.46.0}/tests/test_options.py +0 -0
  136. {docling-2.44.0 → docling-2.46.0}/tests/test_settings_load.py +0 -0
  137. {docling-2.44.0 → docling-2.46.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.44.0
3
+ Version: 2.46.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.0.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
22
22
 
23
23
 
24
24
  class DoclingParseV4PageBackend(PdfPageBackend):
25
- def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
25
+ def __init__(
26
+ self,
27
+ *,
28
+ dp_doc: PdfDocument,
29
+ page_obj: PdfPage,
30
+ page_no: int,
31
+ create_words: bool = True,
32
+ create_textlines: bool = True,
33
+ ):
26
34
  self._ppage = page_obj
27
- self._dpage = parsed_page
28
- self.valid = parsed_page is not None
35
+ self._dp_doc = dp_doc
36
+ self._page_no = page_no
37
+ self._create_words = create_words
38
+ self._create_textlines = create_textlines
39
+
40
+ self._dpage: Optional[SegmentedPdfPage] = None
41
+ self._unloaded = False
42
+ self.valid = (self._ppage is not None) and (self._dp_doc is not None)
43
+
44
+ def _ensure_parsed(self) -> None:
45
+ if self._dpage is not None:
46
+ return
47
+
48
+ seg_page = self._dp_doc.get_page(
49
+ self._page_no + 1,
50
+ create_words=self._create_words,
51
+ create_textlines=self._create_textlines,
52
+ )
53
+
54
+ # In Docling, all TextCell instances are expected with top-left origin.
55
+ [
56
+ tc.to_top_left_origin(seg_page.dimension.height)
57
+ for tc in seg_page.textline_cells
58
+ ]
59
+ [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
60
+ [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
61
+
62
+ self._dpage = seg_page
29
63
 
30
64
  def is_valid(self) -> bool:
31
65
  return self.valid
32
66
 
33
67
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
68
+ self._ensure_parsed()
69
+ assert self._dpage is not None
70
+
34
71
  # Find intersecting cells on the page
35
72
  text_piece = ""
36
73
  page_size = self.get_size()
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
56
93
  return text_piece
57
94
 
58
95
  def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
96
+ self._ensure_parsed()
59
97
  return self._dpage
60
98
 
61
99
  def get_text_cells(self) -> Iterable[TextCell]:
100
+ self._ensure_parsed()
101
+ assert self._dpage is not None
102
+
62
103
  return self._dpage.textline_cells
63
104
 
64
105
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
106
+ self._ensure_parsed()
107
+ assert self._dpage is not None
108
+
65
109
  AREA_THRESHOLD = 0 # 32 * 32
66
110
 
67
111
  images = self._dpage.bitmap_resources
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
123
167
  # )
124
168
 
125
169
  def unload(self):
170
+ if not self._unloaded and self._dp_doc is not None:
171
+ self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
172
+ self._unloaded = True
173
+
126
174
  self._ppage = None
127
175
  self._dpage = None
176
+ self._dp_doc = None
128
177
 
129
178
 
130
179
  class DoclingParseV4DocumentBackend(PdfDocumentBackend):
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
157
206
  self, page_no: int, create_words: bool = True, create_textlines: bool = True
158
207
  ) -> DoclingParseV4PageBackend:
159
208
  with pypdfium2_lock:
160
- seg_page = self.dp_doc.get_page(
161
- page_no + 1,
162
- create_words=create_words,
163
- create_textlines=create_textlines,
164
- )
165
-
166
- # In Docling, all TextCell instances are expected with top-left origin.
167
- [
168
- tc.to_top_left_origin(seg_page.dimension.height)
169
- for tc in seg_page.textline_cells
170
- ]
171
- [
172
- tc.to_top_left_origin(seg_page.dimension.height)
173
- for tc in seg_page.char_cells
174
- ]
175
- [
176
- tc.to_top_left_origin(seg_page.dimension.height)
177
- for tc in seg_page.word_cells
178
- ]
179
-
180
- return DoclingParseV4PageBackend(
181
- seg_page,
182
- self._pdoc[page_no],
183
- )
209
+ ppage = self._pdoc[page_no]
210
+
211
+ return DoclingParseV4PageBackend(
212
+ dp_doc=self.dp_doc,
213
+ page_obj=ppage,
214
+ page_no=page_no,
215
+ create_words=create_words,
216
+ create_textlines=create_textlines,
217
+ )
184
218
 
185
219
  def is_valid(self) -> bool:
186
220
  return self.page_count() > 0