docling 2.45.0__tar.gz → 2.46.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. {docling-2.45.0 → docling-2.46.0}/PKG-INFO +2 -2
  2. {docling-2.45.0 → docling-2.46.0}/docling/backend/docling_parse_v4_backend.py +61 -27
  3. {docling-2.45.0 → docling-2.46.0}/docling/backend/html_backend.py +8 -4
  4. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/pipeline_options.py +1 -3
  5. {docling-2.45.0 → docling-2.46.0}/docling/models/code_formula_model.py +87 -76
  6. {docling-2.45.0 → docling-2.46.0}/docling/models/tesseract_ocr_cli_model.py +4 -2
  7. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/base_pipeline.py +7 -1
  8. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +6 -4
  9. {docling-2.45.0 → docling-2.46.0}/docling.egg-info/PKG-INFO +2 -2
  10. {docling-2.45.0 → docling-2.46.0}/docling.egg-info/requires.txt +1 -1
  11. {docling-2.45.0 → docling-2.46.0}/pyproject.toml +2 -2
  12. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_html.py +32 -0
  13. {docling-2.45.0 → docling-2.46.0}/tests/test_e2e_conversion.py +1 -0
  14. {docling-2.45.0 → docling-2.46.0}/tests/test_interfaces.py +3 -0
  15. {docling-2.45.0 → docling-2.46.0}/LICENSE +0 -0
  16. {docling-2.45.0 → docling-2.46.0}/README.md +0 -0
  17. {docling-2.45.0 → docling-2.46.0}/docling/__init__.py +0 -0
  18. {docling-2.45.0 → docling-2.46.0}/docling/backend/__init__.py +0 -0
  19. {docling-2.45.0 → docling-2.46.0}/docling/backend/abstract_backend.py +0 -0
  20. {docling-2.45.0 → docling-2.46.0}/docling/backend/asciidoc_backend.py +0 -0
  21. {docling-2.45.0 → docling-2.46.0}/docling/backend/csv_backend.py +0 -0
  22. {docling-2.45.0 → docling-2.46.0}/docling/backend/docling_parse_backend.py +0 -0
  23. {docling-2.45.0 → docling-2.46.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  24. {docling-2.45.0 → docling-2.46.0}/docling/backend/docx/__init__.py +0 -0
  25. {docling-2.45.0 → docling-2.46.0}/docling/backend/docx/latex/__init__.py +0 -0
  26. {docling-2.45.0 → docling-2.46.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  27. {docling-2.45.0 → docling-2.46.0}/docling/backend/docx/latex/omml.py +0 -0
  28. {docling-2.45.0 → docling-2.46.0}/docling/backend/json/__init__.py +0 -0
  29. {docling-2.45.0 → docling-2.46.0}/docling/backend/json/docling_json_backend.py +0 -0
  30. {docling-2.45.0 → docling-2.46.0}/docling/backend/md_backend.py +0 -0
  31. {docling-2.45.0 → docling-2.46.0}/docling/backend/mets_gbs_backend.py +0 -0
  32. {docling-2.45.0 → docling-2.46.0}/docling/backend/msexcel_backend.py +0 -0
  33. {docling-2.45.0 → docling-2.46.0}/docling/backend/mspowerpoint_backend.py +0 -0
  34. {docling-2.45.0 → docling-2.46.0}/docling/backend/msword_backend.py +0 -0
  35. {docling-2.45.0 → docling-2.46.0}/docling/backend/noop_backend.py +0 -0
  36. {docling-2.45.0 → docling-2.46.0}/docling/backend/pdf_backend.py +0 -0
  37. {docling-2.45.0 → docling-2.46.0}/docling/backend/pypdfium2_backend.py +0 -0
  38. {docling-2.45.0 → docling-2.46.0}/docling/backend/xml/__init__.py +0 -0
  39. {docling-2.45.0 → docling-2.46.0}/docling/backend/xml/jats_backend.py +0 -0
  40. {docling-2.45.0 → docling-2.46.0}/docling/backend/xml/uspto_backend.py +0 -0
  41. {docling-2.45.0 → docling-2.46.0}/docling/chunking/__init__.py +0 -0
  42. {docling-2.45.0 → docling-2.46.0}/docling/cli/__init__.py +0 -0
  43. {docling-2.45.0 → docling-2.46.0}/docling/cli/main.py +0 -0
  44. {docling-2.45.0 → docling-2.46.0}/docling/cli/models.py +0 -0
  45. {docling-2.45.0 → docling-2.46.0}/docling/cli/tools.py +0 -0
  46. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/__init__.py +0 -0
  47. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/accelerator_options.py +0 -0
  48. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/asr_model_specs.py +0 -0
  49. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/base_models.py +0 -0
  50. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/document.py +0 -0
  51. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/layout_model_specs.py +0 -0
  52. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  53. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  54. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/settings.py +0 -0
  55. {docling-2.45.0 → docling-2.46.0}/docling/datamodel/vlm_model_specs.py +0 -0
  56. {docling-2.45.0 → docling-2.46.0}/docling/document_converter.py +0 -0
  57. {docling-2.45.0 → docling-2.46.0}/docling/exceptions.py +0 -0
  58. {docling-2.45.0 → docling-2.46.0}/docling/models/__init__.py +0 -0
  59. {docling-2.45.0 → docling-2.46.0}/docling/models/api_vlm_model.py +0 -0
  60. {docling-2.45.0 → docling-2.46.0}/docling/models/base_model.py +0 -0
  61. {docling-2.45.0 → docling-2.46.0}/docling/models/base_ocr_model.py +0 -0
  62. {docling-2.45.0 → docling-2.46.0}/docling/models/document_picture_classifier.py +0 -0
  63. {docling-2.45.0 → docling-2.46.0}/docling/models/easyocr_model.py +0 -0
  64. {docling-2.45.0 → docling-2.46.0}/docling/models/factories/__init__.py +0 -0
  65. {docling-2.45.0 → docling-2.46.0}/docling/models/factories/base_factory.py +0 -0
  66. {docling-2.45.0 → docling-2.46.0}/docling/models/factories/ocr_factory.py +0 -0
  67. {docling-2.45.0 → docling-2.46.0}/docling/models/factories/picture_description_factory.py +0 -0
  68. {docling-2.45.0 → docling-2.46.0}/docling/models/layout_model.py +0 -0
  69. {docling-2.45.0 → docling-2.46.0}/docling/models/ocr_mac_model.py +0 -0
  70. {docling-2.45.0 → docling-2.46.0}/docling/models/page_assemble_model.py +0 -0
  71. {docling-2.45.0 → docling-2.46.0}/docling/models/page_preprocessing_model.py +0 -0
  72. {docling-2.45.0 → docling-2.46.0}/docling/models/picture_description_api_model.py +0 -0
  73. {docling-2.45.0 → docling-2.46.0}/docling/models/picture_description_base_model.py +0 -0
  74. {docling-2.45.0 → docling-2.46.0}/docling/models/picture_description_vlm_model.py +0 -0
  75. {docling-2.45.0 → docling-2.46.0}/docling/models/plugins/__init__.py +0 -0
  76. {docling-2.45.0 → docling-2.46.0}/docling/models/plugins/defaults.py +0 -0
  77. {docling-2.45.0 → docling-2.46.0}/docling/models/rapid_ocr_model.py +0 -0
  78. {docling-2.45.0 → docling-2.46.0}/docling/models/readingorder_model.py +0 -0
  79. {docling-2.45.0 → docling-2.46.0}/docling/models/table_structure_model.py +0 -0
  80. {docling-2.45.0 → docling-2.46.0}/docling/models/tesseract_ocr_model.py +0 -0
  81. {docling-2.45.0 → docling-2.46.0}/docling/models/utils/__init__.py +0 -0
  82. {docling-2.45.0 → docling-2.46.0}/docling/models/utils/hf_model_download.py +0 -0
  83. {docling-2.45.0 → docling-2.46.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  84. {docling-2.45.0 → docling-2.46.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  85. {docling-2.45.0 → docling-2.46.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  86. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/__init__.py +0 -0
  87. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/asr_pipeline.py +0 -0
  88. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/simple_pipeline.py +0 -0
  89. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  90. {docling-2.45.0 → docling-2.46.0}/docling/pipeline/vlm_pipeline.py +0 -0
  91. {docling-2.45.0 → docling-2.46.0}/docling/py.typed +0 -0
  92. {docling-2.45.0 → docling-2.46.0}/docling/utils/__init__.py +0 -0
  93. {docling-2.45.0 → docling-2.46.0}/docling/utils/accelerator_utils.py +0 -0
  94. {docling-2.45.0 → docling-2.46.0}/docling/utils/api_image_request.py +0 -0
  95. {docling-2.45.0 → docling-2.46.0}/docling/utils/export.py +0 -0
  96. {docling-2.45.0 → docling-2.46.0}/docling/utils/glm_utils.py +0 -0
  97. {docling-2.45.0 → docling-2.46.0}/docling/utils/layout_postprocessor.py +0 -0
  98. {docling-2.45.0 → docling-2.46.0}/docling/utils/locks.py +0 -0
  99. {docling-2.45.0 → docling-2.46.0}/docling/utils/model_downloader.py +0 -0
  100. {docling-2.45.0 → docling-2.46.0}/docling/utils/ocr_utils.py +0 -0
  101. {docling-2.45.0 → docling-2.46.0}/docling/utils/orientation.py +0 -0
  102. {docling-2.45.0 → docling-2.46.0}/docling/utils/profiling.py +0 -0
  103. {docling-2.45.0 → docling-2.46.0}/docling/utils/utils.py +0 -0
  104. {docling-2.45.0 → docling-2.46.0}/docling/utils/visualization.py +0 -0
  105. {docling-2.45.0 → docling-2.46.0}/docling.egg-info/SOURCES.txt +0 -0
  106. {docling-2.45.0 → docling-2.46.0}/docling.egg-info/dependency_links.txt +0 -0
  107. {docling-2.45.0 → docling-2.46.0}/docling.egg-info/entry_points.txt +0 -0
  108. {docling-2.45.0 → docling-2.46.0}/docling.egg-info/top_level.txt +0 -0
  109. {docling-2.45.0 → docling-2.46.0}/setup.cfg +0 -0
  110. {docling-2.45.0 → docling-2.46.0}/tests/test_asr_pipeline.py +0 -0
  111. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_asciidoc.py +0 -0
  112. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_csv.py +0 -0
  113. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_json.py +0 -0
  114. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_parse.py +0 -0
  115. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_parse_v2.py +0 -0
  116. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_parse_v4.py +0 -0
  117. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_jats.py +0 -0
  118. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_markdown.py +0 -0
  119. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_mets_gbs.py +0 -0
  120. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_msexcel.py +0 -0
  121. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_msword.py +0 -0
  122. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_patent_uspto.py +0 -0
  123. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_pdfium.py +0 -0
  124. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_pptx.py +0 -0
  125. {docling-2.45.0 → docling-2.46.0}/tests/test_backend_webp.py +0 -0
  126. {docling-2.45.0 → docling-2.46.0}/tests/test_cli.py +0 -0
  127. {docling-2.45.0 → docling-2.46.0}/tests/test_code_formula.py +0 -0
  128. {docling-2.45.0 → docling-2.46.0}/tests/test_data_gen_flag.py +0 -0
  129. {docling-2.45.0 → docling-2.46.0}/tests/test_document_picture_classifier.py +0 -0
  130. {docling-2.45.0 → docling-2.46.0}/tests/test_e2e_ocr_conversion.py +0 -0
  131. {docling-2.45.0 → docling-2.46.0}/tests/test_input_doc.py +0 -0
  132. {docling-2.45.0 → docling-2.46.0}/tests/test_invalid_input.py +0 -0
  133. {docling-2.45.0 → docling-2.46.0}/tests/test_legacy_format_transform.py +0 -0
  134. {docling-2.45.0 → docling-2.46.0}/tests/test_ocr_utils.py +0 -0
  135. {docling-2.45.0 → docling-2.46.0}/tests/test_options.py +0 -0
  136. {docling-2.45.0 → docling-2.46.0}/tests/test_settings_load.py +0 -0
  137. {docling-2.45.0 → docling-2.46.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.45.0
3
+ Version: 2.46.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.0.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
22
22
 
23
23
 
24
24
  class DoclingParseV4PageBackend(PdfPageBackend):
25
- def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
25
+ def __init__(
26
+ self,
27
+ *,
28
+ dp_doc: PdfDocument,
29
+ page_obj: PdfPage,
30
+ page_no: int,
31
+ create_words: bool = True,
32
+ create_textlines: bool = True,
33
+ ):
26
34
  self._ppage = page_obj
27
- self._dpage = parsed_page
28
- self.valid = parsed_page is not None
35
+ self._dp_doc = dp_doc
36
+ self._page_no = page_no
37
+ self._create_words = create_words
38
+ self._create_textlines = create_textlines
39
+
40
+ self._dpage: Optional[SegmentedPdfPage] = None
41
+ self._unloaded = False
42
+ self.valid = (self._ppage is not None) and (self._dp_doc is not None)
43
+
44
+ def _ensure_parsed(self) -> None:
45
+ if self._dpage is not None:
46
+ return
47
+
48
+ seg_page = self._dp_doc.get_page(
49
+ self._page_no + 1,
50
+ create_words=self._create_words,
51
+ create_textlines=self._create_textlines,
52
+ )
53
+
54
+ # In Docling, all TextCell instances are expected with top-left origin.
55
+ [
56
+ tc.to_top_left_origin(seg_page.dimension.height)
57
+ for tc in seg_page.textline_cells
58
+ ]
59
+ [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
60
+ [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
61
+
62
+ self._dpage = seg_page
29
63
 
30
64
  def is_valid(self) -> bool:
31
65
  return self.valid
32
66
 
33
67
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
68
+ self._ensure_parsed()
69
+ assert self._dpage is not None
70
+
34
71
  # Find intersecting cells on the page
35
72
  text_piece = ""
36
73
  page_size = self.get_size()
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
56
93
  return text_piece
57
94
 
58
95
  def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
96
+ self._ensure_parsed()
59
97
  return self._dpage
60
98
 
61
99
  def get_text_cells(self) -> Iterable[TextCell]:
100
+ self._ensure_parsed()
101
+ assert self._dpage is not None
102
+
62
103
  return self._dpage.textline_cells
63
104
 
64
105
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
106
+ self._ensure_parsed()
107
+ assert self._dpage is not None
108
+
65
109
  AREA_THRESHOLD = 0 # 32 * 32
66
110
 
67
111
  images = self._dpage.bitmap_resources
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
123
167
  # )
124
168
 
125
169
  def unload(self):
170
+ if not self._unloaded and self._dp_doc is not None:
171
+ self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
172
+ self._unloaded = True
173
+
126
174
  self._ppage = None
127
175
  self._dpage = None
176
+ self._dp_doc = None
128
177
 
129
178
 
130
179
  class DoclingParseV4DocumentBackend(PdfDocumentBackend):
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
157
206
  self, page_no: int, create_words: bool = True, create_textlines: bool = True
158
207
  ) -> DoclingParseV4PageBackend:
159
208
  with pypdfium2_lock:
160
- seg_page = self.dp_doc.get_page(
161
- page_no + 1,
162
- create_words=create_words,
163
- create_textlines=create_textlines,
164
- )
165
-
166
- # In Docling, all TextCell instances are expected with top-left origin.
167
- [
168
- tc.to_top_left_origin(seg_page.dimension.height)
169
- for tc in seg_page.textline_cells
170
- ]
171
- [
172
- tc.to_top_left_origin(seg_page.dimension.height)
173
- for tc in seg_page.char_cells
174
- ]
175
- [
176
- tc.to_top_left_origin(seg_page.dimension.height)
177
- for tc in seg_page.word_cells
178
- ]
179
-
180
- return DoclingParseV4PageBackend(
181
- seg_page,
182
- self._pdoc[page_no],
183
- )
209
+ ppage = self._pdoc[page_no]
210
+
211
+ return DoclingParseV4PageBackend(
212
+ dp_doc=self.dp_doc,
213
+ page_obj=ppage,
214
+ page_no=page_no,
215
+ create_words=create_words,
216
+ create_textlines=create_textlines,
217
+ )
184
218
 
185
219
  def is_valid(self) -> bool:
186
220
  return self.page_count() > 0
@@ -38,6 +38,7 @@ _BLOCK_TAGS: Final = {
38
38
  "address",
39
39
  "details",
40
40
  "figure",
41
+ "footer",
41
42
  "h1",
42
43
  "h2",
43
44
  "h3",
@@ -639,10 +640,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
639
640
  hyperlink=annotated_text.hyperlink,
640
641
  )
641
642
 
642
- elif tag_name == "details":
643
- # handle details and its content.
643
+ elif tag_name in {"details", "footer"}:
644
+ if tag_name == "footer":
645
+ current_layer = self.content_layer
646
+ self.content_layer = ContentLayer.FURNITURE
644
647
  self.parents[self.level + 1] = doc.add_group(
645
- name="details",
648
+ name=tag_name,
646
649
  label=GroupLabel.SECTION,
647
650
  parent=self.parents[self.level],
648
651
  content_layer=self.content_layer,
@@ -651,6 +654,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
651
654
  self._walk(tag, doc)
652
655
  self.parents[self.level + 1] = None
653
656
  self.level -= 1
657
+ if tag_name == "footer":
658
+ self.content_layer = current_layer
654
659
 
655
660
  def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
656
661
  figure = img_tag.find_parent("figure")
@@ -686,7 +691,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
686
691
  text_clean = HTMLDocumentBackend._clean_unicode(
687
692
  caption_anno_text.text.strip()
688
693
  )
689
- print(caption_anno_text)
690
694
  caption_item = doc.add_text(
691
695
  label=DocItemLabel.CAPTION,
692
696
  text=text_clean,
@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
323
323
  ),
324
324
  )
325
325
 
326
- generate_parsed_pages: Literal[True] = (
327
- True # Always True since parsed_page is now mandatory
328
- )
326
+ generate_parsed_pages: bool = False
329
327
 
330
328
 
331
329
  class ProcessingPipeline(str, Enum):
@@ -1,5 +1,4 @@
1
1
  import re
2
- from collections import Counter
3
2
  from collections.abc import Iterable
4
3
  from pathlib import Path
5
4
  from typing import List, Literal, Optional, Tuple, Union
@@ -13,10 +12,11 @@ from docling_core.types.doc import (
13
12
  TextItem,
14
13
  )
15
14
  from docling_core.types.doc.labels import CodeLanguageLabel
16
- from PIL import Image, ImageOps
15
+ from PIL import Image
17
16
  from pydantic import BaseModel
17
+ from transformers import AutoModelForImageTextToText, AutoProcessor
18
18
 
19
- from docling.datamodel.accelerator_options import AcceleratorOptions
19
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
20
20
  from docling.datamodel.base_models import ItemAndImageEnrichmentElement
21
21
  from docling.models.base_model import BaseItemAndImageEnrichmentModel
22
22
  from docling.models.utils.hf_model_download import download_hf_model
@@ -65,9 +65,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
65
65
  Processes the given batch of elements and enriches them with predictions.
66
66
  """
67
67
 
68
- _model_repo_folder = "ds4sd--CodeFormula"
68
+ _model_repo_folder = "ds4sd--CodeFormulaV2"
69
69
  elements_batch_size = 5
70
- images_scale = 1.66 # = 120 dpi, aligned with training data resolution
70
+ images_scale = 1.67 # = 120 dpi, aligned with training data resolution
71
71
  expansion_factor = 0.18
72
72
 
73
73
  def __init__(
@@ -95,10 +95,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
95
95
  self.options = options
96
96
 
97
97
  if self.enabled:
98
- device = decide_device(accelerator_options.device)
99
-
100
- from docling_ibm_models.code_formula_model.code_formula_predictor import (
101
- CodeFormulaPredictor,
98
+ self.device = decide_device(
99
+ accelerator_options.device,
100
+ supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
102
101
  )
103
102
 
104
103
  if artifacts_path is None:
@@ -106,11 +105,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
106
105
  else:
107
106
  artifacts_path = artifacts_path / self._model_repo_folder
108
107
 
109
- self.code_formula_model = CodeFormulaPredictor(
110
- artifacts_path=str(artifacts_path),
111
- device=device,
112
- num_threads=accelerator_options.num_threads,
108
+ self._processor = AutoProcessor.from_pretrained(
109
+ artifacts_path,
110
+ )
111
+ self._model_max_length = self._processor.tokenizer.model_max_length
112
+ self._model = AutoModelForImageTextToText.from_pretrained(
113
+ artifacts_path, device_map=self.device
113
114
  )
115
+ self._model.eval()
114
116
 
115
117
  @staticmethod
116
118
  def download_models(
@@ -119,8 +121,8 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
119
121
  progress: bool = False,
120
122
  ) -> Path:
121
123
  return download_hf_model(
122
- repo_id="ds4sd/CodeFormula",
123
- revision="v1.0.2",
124
+ repo_id="ds4sd/CodeFormulaV2",
125
+ revision="main",
124
126
  local_dir=local_dir,
125
127
  force=force,
126
128
  progress=progress,
@@ -172,7 +174,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
172
174
  - The second element is the extracted language if a match is found;
173
175
  otherwise, `None`.
174
176
  """
175
- pattern = r"^<_([^_>]+)_>\s(.*)"
177
+ pattern = r"^<_([^_>]+)_>\s*(.*)"
176
178
  match = re.match(pattern, input_string, flags=re.DOTALL)
177
179
  if match:
178
180
  language = str(match.group(1)) # the captured programming language
@@ -203,81 +205,74 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
203
205
  except ValueError:
204
206
  return CodeLanguageLabel.UNKNOWN
205
207
 
206
- def _get_most_frequent_edge_color(self, pil_img: Image.Image):
208
+ def _get_prompt(self, label: str) -> str:
207
209
  """
208
- Compute the most frequent color along the outer edges of a PIL image.
210
+ Constructs the prompt for the model based on the input label.
209
211
 
210
212
  Parameters
211
213
  ----------
212
- pil_img : Image.Image
213
- A PIL Image in any mode (L, RGB, RGBA, etc.).
214
+ label : str
215
+ The type of input, either 'code' or 'formula'.
214
216
 
215
217
  Returns
216
218
  -------
217
- (int) or (tuple): The most common edge color as a scalar (for grayscale) or
218
- tuple (for RGB/RGBA).
219
+ str
220
+ The constructed prompt including necessary tokens and query.
221
+
222
+ Raises
223
+ ------
224
+ NotImplementedError
225
+ If the label is not 'code' or 'formula'.
219
226
  """
220
- # Convert to NumPy array for easy pixel access
221
- img_np = np.array(pil_img)
227
+ if label == "code":
228
+ query = "<code>"
229
+ elif label == "formula":
230
+ query = "<formula>"
231
+ else:
232
+ raise NotImplementedError("Label must be either code or formula")
222
233
 
223
- if img_np.ndim == 2:
224
- # Grayscale-like image: shape (H, W)
225
- # Extract edges: top row, bottom row, left col, right col
226
- top = img_np[0, :] # shape (W,)
227
- bottom = img_np[-1, :] # shape (W,)
228
- left = img_np[:, 0] # shape (H,)
229
- right = img_np[:, -1] # shape (H,)
234
+ messages = [
235
+ {
236
+ "role": "user",
237
+ "content": [{"type": "image"}, {"type": "text", "text": query}],
238
+ },
239
+ ]
230
240
 
231
- # Concatenate all edges
232
- edges = np.concatenate([top, bottom, left, right])
241
+ prompt = self._processor.apply_chat_template(
242
+ messages, add_generation_prompt=True
243
+ )
233
244
 
234
- # Count frequencies
235
- freq = Counter(edges.tolist())
236
- most_common_value, _ = freq.most_common(1)[0]
237
- return int(most_common_value) # single channel color
245
+ return prompt
238
246
 
239
- else:
240
- # Color image: shape (H, W, C)
241
- top = img_np[0, :, :] # shape (W, C)
242
- bottom = img_np[-1, :, :] # shape (W, C)
243
- left = img_np[:, 0, :] # shape (H, C)
244
- right = img_np[:, -1, :] # shape (H, C)
245
-
246
- # Concatenate edges along first axis
247
- edges = np.concatenate([top, bottom, left, right], axis=0)
248
-
249
- # Convert each color to a tuple for counting
250
- edges_as_tuples = [tuple(pixel) for pixel in edges]
251
- freq = Counter(edges_as_tuples)
252
- most_common_value, _ = freq.most_common(1)[0]
253
- return most_common_value # e.g. (R, G, B) or (R, G, B, A)
254
-
255
- def _pad_with_most_frequent_edge_color(
256
- self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
257
- ):
247
+ def _post_process(self, texts: list[str]) -> list[str]:
258
248
  """
259
- Pads an image (PIL or NumPy array) using the most frequent edge color.
249
+ Processes a list of text strings by truncating at '<end_of_utterance>' and
250
+ removing a predefined set of unwanted substrings.
260
251
 
261
252
  Parameters
262
253
  ----------
263
- img : Union[Image.Image, np.ndarray]
264
- The original image.
265
- padding : tuple
266
- Padding (left, top, right, bottom) in pixels.
254
+ texts : list[str]
255
+ A list of strings to be post-processed.
267
256
 
268
257
  Returns
269
258
  -------
270
- Image.Image: A new PIL image with the specified padding.
259
+ list[str]
260
+ A list of cleaned strings with specified substrings removed and truncated at
261
+ '<end_of_utterance>' if present.
271
262
  """
272
- if isinstance(img, np.ndarray):
273
- pil_img = Image.fromarray(img)
274
- else:
275
- pil_img = img
263
+ to_remove = ["</code>", "</formula>", "<loc_0><loc_0><loc_500><loc_500>"]
276
264
 
277
- most_freq_color = self._get_most_frequent_edge_color(pil_img)
265
+ def clean_text(text: str) -> str:
266
+ idx = text.find("<end_of_utterance>")
267
+ if idx != -1:
268
+ text = text[:idx]
278
269
 
279
- padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color)
280
- return padded_img
270
+ for token in to_remove:
271
+ if token in text:
272
+ text = text.replace(token, "")
273
+ return text.lstrip()
274
+
275
+ return [clean_text(t) for t in texts]
281
276
 
282
277
  def __call__(
283
278
  self,
@@ -308,14 +303,30 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
308
303
  images: List[Union[Image.Image, np.ndarray]] = []
309
304
  elements: List[TextItem] = []
310
305
  for el in element_batch:
311
- assert isinstance(el.item, TextItem)
312
- elements.append(el.item)
313
- labels.append(el.item.label)
314
- images.append(
315
- self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
316
- )
306
+ elements.append(el.item) # type: ignore[arg-type]
307
+ labels.append(el.item.label) # type: ignore[attr-defined]
308
+ images.append(el.image)
309
+
310
+ prompts = [self._get_prompt(label) for label in labels]
311
+ inputs = self._processor(
312
+ text=prompts,
313
+ images=images,
314
+ return_tensors="pt",
315
+ )
316
+ inputs = inputs.to(self.device)
317
317
 
318
- outputs = self.code_formula_model.predict(images, labels)
318
+ gen_kwargs = dict(
319
+ max_new_tokens=self._model_max_length - inputs.input_ids.shape[1],
320
+ use_cache=True,
321
+ do_sample=False,
322
+ )
323
+
324
+ generated_ids = self._model.generate(**inputs, **gen_kwargs)
325
+
326
+ outputs = self._processor.batch_decode(
327
+ generated_ids[:, inputs.input_ids.shape[1] :], skip_special_tokens=False
328
+ )
329
+ outputs = self._post_process(outputs)
319
330
 
320
331
  for item, output in zip(elements, outputs):
321
332
  if isinstance(item, CodeItem):
@@ -320,6 +320,8 @@ class TesseractOcrCliModel(BaseOcrModel):
320
320
 
321
321
 
322
322
  def _parse_orientation(df_osd: pd.DataFrame) -> int:
323
- orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
324
- orientation = parse_tesseract_orientation(orientations[0].strip())
323
+ # For strictly optimal performance with invariant dataframe format:
324
+ mask = df_osd["key"].to_numpy() == "Orientation in degrees"
325
+ orientation_val = df_osd["value"].to_numpy()[mask][0]
326
+ orientation = parse_tesseract_orientation(orientation_val.strip())
325
327
  return orientation
@@ -20,7 +20,7 @@ from docling.datamodel.base_models import (
20
20
  Page,
21
21
  )
22
22
  from docling.datamodel.document import ConversionResult, InputDocument
23
- from docling.datamodel.pipeline_options import PipelineOptions
23
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
24
24
  from docling.datamodel.settings import settings
25
25
  from docling.models.base_model import GenericEnrichmentModel
26
26
  from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -168,6 +168,12 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
168
168
  # Cleanup page backends
169
169
  if not self.keep_backend and p._backend is not None:
170
170
  p._backend.unload()
171
+ if (
172
+ isinstance(self.pipeline_options, PdfPipelineOptions)
173
+ and not self.pipeline_options.generate_parsed_pages
174
+ ):
175
+ del p.parsed_page
176
+ p.parsed_page = None
171
177
 
172
178
  end_batch_time = time.monotonic()
173
179
  total_elapsed_time += end_batch_time - start_batch_time
@@ -565,10 +565,12 @@ class ThreadedStandardPdfPipeline(BasePipeline):
565
565
  if not self.keep_images:
566
566
  for p in conv_res.pages:
567
567
  p._image_cache = {}
568
- if not self.keep_backend:
569
- for p in conv_res.pages:
570
- if p._backend is not None:
571
- p._backend.unload()
568
+ for p in conv_res.pages:
569
+ if not self.keep_backend and p._backend is not None:
570
+ p._backend.unload()
571
+ if not self.pipeline_options.generate_parsed_pages:
572
+ del p.parsed_page
573
+ p.parsed_page = None
572
574
 
573
575
  # ---------------------------------------------------------------- assemble
574
576
  def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.45.0
3
+ Version: 2.46.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.0.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -1,6 +1,6 @@
1
1
  pydantic<3.0.0,>=2.0.0
2
2
  docling-core[chunking]<3.0.0,>=2.42.0
3
- docling-parse<5.0.0,>=4.0.0
3
+ docling-parse<5.0.0,>=4.2.2
4
4
  docling-ibm-models<4,>=3.9.0
5
5
  filetype<2.0.0,>=1.2.0
6
6
  pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.45.0" # DO NOT EDIT, updated automatically
3
+ version = "2.46.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -45,7 +45,7 @@ requires-python = '>=3.9,<4.0'
45
45
  dependencies = [
46
46
  'pydantic (>=2.0.0,<3.0.0)',
47
47
  'docling-core[chunking] (>=2.42.0,<3.0.0)',
48
- 'docling-parse (>=4.0.0,<5.0.0)',
48
+ 'docling-parse (>=4.2.2,<5.0.0)',
49
49
  "docling-ibm-models>=3.9.0,<4",
50
50
  'filetype (>=1.2.0,<2.0.0)',
51
51
  'pypdfium2 (>=4.30.0,!=4.30.1,<5.0.0)',
@@ -1,6 +1,8 @@
1
1
  from io import BytesIO
2
2
  from pathlib import Path
3
3
 
4
+ from docling_core.types.doc.document import ContentLayer
5
+
4
6
  from docling.backend.html_backend import HTMLDocumentBackend
5
7
  from docling.datamodel.base_models import InputFormat
6
8
  from docling.datamodel.document import (
@@ -179,3 +181,33 @@ def test_e2e_html_conversions():
179
181
  )
180
182
 
181
183
  assert verify_document(doc, str(gt_path) + ".json", GENERATE)
184
+
185
+
186
+ def test_html_furniture():
187
+ raw_html = (
188
+ b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
189
+ b"<h1>Main Heading</h1>"
190
+ b"<p>Some Content</p>"
191
+ b"<footer><p>Some Footer Content</p></footer></body></html"
192
+ )
193
+
194
+ in_doc = InputDocument(
195
+ path_or_stream=BytesIO(raw_html),
196
+ format=InputFormat.HTML,
197
+ backend=HTMLDocumentBackend,
198
+ filename="test",
199
+ )
200
+ backend = HTMLDocumentBackend(
201
+ in_doc=in_doc,
202
+ path_or_stream=BytesIO(raw_html),
203
+ )
204
+ doc: DoclingDocument = backend.convert()
205
+ md_body = doc.export_to_markdown()
206
+ assert md_body == "# Main Heading\n\nSome Content"
207
+ md_all = doc.export_to_markdown(
208
+ included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
209
+ )
210
+ assert md_all == (
211
+ "Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
212
+ "Some Footer Content"
213
+ )
@@ -27,6 +27,7 @@ def get_converter():
27
27
  pipeline_options.do_table_structure = True
28
28
  pipeline_options.table_structure_options.do_cell_matching = True
29
29
  pipeline_options.accelerator_options.device = AcceleratorDevice.CPU
30
+ pipeline_options.generate_parsed_pages = True
30
31
 
31
32
  converter = DocumentConverter(
32
33
  format_options={
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
 
4
4
  import pytest
5
5
 
6
+ from docling.datamodel.accelerator_options import AcceleratorDevice
6
7
  from docling.datamodel.base_models import DocumentStream, InputFormat
7
8
  from docling.datamodel.pipeline_options import PdfPipelineOptions
8
9
  from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -24,6 +25,8 @@ def converter():
24
25
  pipeline_options.do_ocr = False
25
26
  pipeline_options.do_table_structure = True
26
27
  pipeline_options.table_structure_options.do_cell_matching = True
28
+ pipeline_options.accelerator_options.device = AcceleratorDevice.CPU
29
+ pipeline_options.generate_parsed_pages = True
27
30
 
28
31
  converter = DocumentConverter(
29
32
  format_options={
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes