docling 2.15.1__tar.gz → 2.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {docling-2.15.1 → docling-2.16.0}/PKG-INFO +5 -4
  2. {docling-2.15.1 → docling-2.16.0}/docling/backend/abstract_backend.py +0 -1
  3. {docling-2.15.1 → docling-2.16.0}/docling/backend/asciidoc_backend.py +0 -1
  4. {docling-2.15.1 → docling-2.16.0}/docling/backend/docling_parse_backend.py +1 -1
  5. {docling-2.15.1 → docling-2.16.0}/docling/backend/docling_parse_v2_backend.py +1 -1
  6. {docling-2.15.1 → docling-2.16.0}/docling/backend/html_backend.py +1 -1
  7. docling-2.16.0/docling/backend/json/docling_json_backend.py +58 -0
  8. {docling-2.15.1 → docling-2.16.0}/docling/backend/md_backend.py +44 -27
  9. {docling-2.15.1 → docling-2.16.0}/docling/backend/msexcel_backend.py +50 -38
  10. {docling-2.15.1 → docling-2.16.0}/docling/backend/msword_backend.py +0 -1
  11. {docling-2.15.1 → docling-2.16.0}/docling/backend/pdf_backend.py +0 -2
  12. {docling-2.15.1 → docling-2.16.0}/docling/backend/pypdfium2_backend.py +1 -1
  13. {docling-2.15.1 → docling-2.16.0}/docling/datamodel/base_models.py +30 -3
  14. {docling-2.15.1 → docling-2.16.0}/docling/datamodel/document.py +2 -0
  15. {docling-2.15.1 → docling-2.16.0}/docling/datamodel/pipeline_options.py +6 -9
  16. {docling-2.15.1 → docling-2.16.0}/docling/document_converter.py +4 -0
  17. docling-2.16.0/docling/models/base_model.py +84 -0
  18. docling-2.16.0/docling/models/code_formula_model.py +245 -0
  19. docling-2.16.0/docling/models/document_picture_classifier.py +187 -0
  20. {docling-2.15.1 → docling-2.16.0}/docling/models/layout_model.py +10 -86
  21. {docling-2.15.1 → docling-2.16.0}/docling/models/page_assemble_model.py +1 -33
  22. {docling-2.15.1 → docling-2.16.0}/docling/models/tesseract_ocr_cli_model.py +0 -1
  23. {docling-2.15.1 → docling-2.16.0}/docling/models/tesseract_ocr_model.py +63 -15
  24. {docling-2.15.1 → docling-2.16.0}/docling/pipeline/base_pipeline.py +40 -17
  25. {docling-2.15.1 → docling-2.16.0}/docling/pipeline/standard_pdf_pipeline.py +31 -2
  26. docling-2.16.0/docling/utils/__init__.py +0 -0
  27. {docling-2.15.1 → docling-2.16.0}/docling/utils/glm_utils.py +4 -1
  28. docling-2.16.0/docling/utils/visualization.py +80 -0
  29. {docling-2.15.1 → docling-2.16.0}/pyproject.toml +5 -4
  30. docling-2.15.1/docling/models/base_model.py +0 -28
  31. {docling-2.15.1 → docling-2.16.0}/LICENSE +0 -0
  32. {docling-2.15.1 → docling-2.16.0}/README.md +0 -0
  33. {docling-2.15.1 → docling-2.16.0}/docling/__init__.py +0 -0
  34. {docling-2.15.1 → docling-2.16.0}/docling/backend/__init__.py +0 -0
  35. {docling-2.15.1/docling/backend/xml → docling-2.16.0/docling/backend/json}/__init__.py +0 -0
  36. {docling-2.15.1 → docling-2.16.0}/docling/backend/mspowerpoint_backend.py +0 -0
  37. {docling-2.15.1/docling/cli → docling-2.16.0/docling/backend/xml}/__init__.py +0 -0
  38. {docling-2.15.1 → docling-2.16.0}/docling/backend/xml/pubmed_backend.py +0 -0
  39. {docling-2.15.1 → docling-2.16.0}/docling/backend/xml/uspto_backend.py +0 -0
  40. {docling-2.15.1 → docling-2.16.0}/docling/chunking/__init__.py +0 -0
  41. {docling-2.15.1/docling/datamodel → docling-2.16.0/docling/cli}/__init__.py +0 -0
  42. {docling-2.15.1 → docling-2.16.0}/docling/cli/main.py +0 -0
  43. {docling-2.15.1/docling/models → docling-2.16.0/docling/datamodel}/__init__.py +0 -0
  44. {docling-2.15.1 → docling-2.16.0}/docling/datamodel/settings.py +0 -0
  45. {docling-2.15.1 → docling-2.16.0}/docling/exceptions.py +0 -0
  46. {docling-2.15.1/docling/pipeline → docling-2.16.0/docling/models}/__init__.py +0 -0
  47. {docling-2.15.1 → docling-2.16.0}/docling/models/base_ocr_model.py +0 -0
  48. {docling-2.15.1 → docling-2.16.0}/docling/models/ds_glm_model.py +0 -0
  49. {docling-2.15.1 → docling-2.16.0}/docling/models/easyocr_model.py +0 -0
  50. {docling-2.15.1 → docling-2.16.0}/docling/models/ocr_mac_model.py +0 -0
  51. {docling-2.15.1 → docling-2.16.0}/docling/models/page_preprocessing_model.py +0 -0
  52. {docling-2.15.1 → docling-2.16.0}/docling/models/rapid_ocr_model.py +0 -0
  53. {docling-2.15.1 → docling-2.16.0}/docling/models/table_structure_model.py +0 -0
  54. {docling-2.15.1/docling/utils → docling-2.16.0/docling/pipeline}/__init__.py +0 -0
  55. {docling-2.15.1 → docling-2.16.0}/docling/pipeline/simple_pipeline.py +0 -0
  56. {docling-2.15.1 → docling-2.16.0}/docling/py.typed +0 -0
  57. {docling-2.15.1 → docling-2.16.0}/docling/utils/accelerator_utils.py +0 -0
  58. {docling-2.15.1 → docling-2.16.0}/docling/utils/export.py +0 -0
  59. {docling-2.15.1 → docling-2.16.0}/docling/utils/layout_postprocessor.py +0 -0
  60. {docling-2.15.1 → docling-2.16.0}/docling/utils/profiling.py +0 -0
  61. {docling-2.15.1 → docling-2.16.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.15.1
3
+ Version: 2.16.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
30
- Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
31
- Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,6 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
42
43
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
43
44
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
45
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
27
27
  def supports_pagination(cls) -> bool:
28
28
  pass
29
29
 
30
- @abstractmethod
31
30
  def unload(self):
32
31
  if isinstance(self.path_or_stream, BytesIO):
33
32
  self.path_or_stream.close()
@@ -24,7 +24,6 @@ _log = logging.getLogger(__name__)
24
24
 
25
25
 
26
26
  class AsciiDocBackend(DeclarativeDocumentBackend):
27
-
28
27
  def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
29
28
  super().__init__(in_doc, path_or_stream)
30
29
 
@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
163
163
  l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
164
164
  )
165
165
  else:
166
- padbox = cropbox.to_bottom_left_origin(page_size.height)
166
+ padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
167
167
  padbox.r = page_size.width - padbox.r
168
168
  padbox.t = page_size.height - padbox.t
169
169
 
@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
178
178
  l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
179
179
  )
180
180
  else:
181
- padbox = cropbox.to_bottom_left_origin(page_size.height)
181
+ padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
182
182
  padbox.r = page_size.width - padbox.r
183
183
  padbox.t = page_size.height - padbox.t
184
184
 
@@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
215
215
  label = DocItemLabel.CODE
216
216
  if len(text) == 0:
217
217
  return
218
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
218
+ doc.add_code(parent=self.parents[self.level], label=label, text=text)
219
219
 
220
220
  def handle_paragraph(self, element, idx, doc):
221
221
  """Handles paragraph tags (p)."""
@@ -0,0 +1,58 @@
1
+ from io import BytesIO
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from docling_core.types.doc import DoclingDocument
6
+ from typing_extensions import override
7
+
8
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
11
+
12
+
13
+ class DoclingJSONBackend(DeclarativeDocumentBackend):
14
+ @override
15
+ def __init__(
16
+ self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
17
+ ) -> None:
18
+ super().__init__(in_doc, path_or_stream)
19
+
20
+ # given we need to store any actual conversion exception for raising it from
21
+ # convert(), this captures the successful result or the actual error in a
22
+ # mutually exclusive way:
23
+ self._doc_or_err = self._get_doc_or_err()
24
+
25
+ @override
26
+ def is_valid(self) -> bool:
27
+ return isinstance(self._doc_or_err, DoclingDocument)
28
+
29
+ @classmethod
30
+ @override
31
+ def supports_pagination(cls) -> bool:
32
+ return False
33
+
34
+ @classmethod
35
+ @override
36
+ def supported_formats(cls) -> set[InputFormat]:
37
+ return {InputFormat.JSON_DOCLING}
38
+
39
+ def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
40
+ try:
41
+ json_data: Union[str, bytes]
42
+ if isinstance(self.path_or_stream, Path):
43
+ with open(self.path_or_stream, encoding="utf-8") as f:
44
+ json_data = f.read()
45
+ elif isinstance(self.path_or_stream, BytesIO):
46
+ json_data = self.path_or_stream.getvalue()
47
+ else:
48
+ raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
49
+ return DoclingDocument.model_validate_json(json_data=json_data)
50
+ except Exception as e:
51
+ return e
52
+
53
+ @override
54
+ def convert(self) -> DoclingDocument:
55
+ if isinstance(self._doc_or_err, DoclingDocument):
56
+ return self._doc_or_err
57
+ else:
58
+ raise self._doc_or_err
@@ -3,19 +3,22 @@ import re
3
3
  import warnings
4
4
  from io import BytesIO
5
5
  from pathlib import Path
6
- from typing import Set, Union
6
+ from typing import List, Optional, Set, Union
7
7
 
8
8
  import marko
9
9
  import marko.ext
10
10
  import marko.ext.gfm
11
11
  import marko.inline
12
12
  from docling_core.types.doc import (
13
+ DocItem,
13
14
  DocItemLabel,
14
15
  DoclingDocument,
15
16
  DocumentOrigin,
16
17
  GroupLabel,
18
+ NodeItem,
17
19
  TableCell,
18
20
  TableData,
21
+ TextItem,
19
22
  )
20
23
  from marko import Markdown
21
24
 
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
27
30
 
28
31
 
29
32
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
30
-
31
- def shorten_underscore_sequences(self, markdown_text, max_length=10):
33
+ def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
32
34
  # This regex will match any sequence of underscores
33
35
  pattern = r"_+"
34
36
 
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
90
92
  ) from e
91
93
  return
92
94
 
93
- def close_table(self, doc=None):
95
+ def close_table(self, doc: DoclingDocument):
94
96
  if self.in_table:
95
97
  _log.debug("=== TABLE START ===")
96
98
  for md_table_row in self.md_table_buffer:
97
99
  _log.debug(md_table_row)
98
100
  _log.debug("=== TABLE END ===")
99
- tcells = []
101
+ tcells: List[TableCell] = []
100
102
  result_table = []
101
103
  for n, md_table_row in enumerate(self.md_table_buffer):
102
104
  data = []
@@ -137,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
137
139
  self.in_table = False
138
140
  self.md_table_buffer = [] # clean table markdown buffer
139
141
  # Initialize Docling TableData
140
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
142
+ table_data = TableData(
143
+ num_rows=num_rows, num_cols=num_cols, table_cells=tcells
144
+ )
141
145
  # Populate
142
146
  for tcell in tcells:
143
- data.table_cells.append(tcell)
147
+ table_data.table_cells.append(tcell)
144
148
  if len(tcells) > 0:
145
- doc.add_table(data=data)
149
+ doc.add_table(data=table_data)
146
150
  return
147
151
 
148
- def process_inline_text(self, parent_element, doc=None):
152
+ def process_inline_text(
153
+ self, parent_element: Optional[NodeItem], doc: DoclingDocument
154
+ ):
149
155
  # self.inline_text_buffer += str(text_in)
150
156
  txt = self.inline_text_buffer.strip()
151
157
  if len(txt) > 0:
@@ -156,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
156
162
  )
157
163
  self.inline_text_buffer = ""
158
164
 
159
- def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
165
+ def iterate_elements(
166
+ self,
167
+ element: marko.block.Element,
168
+ depth: int,
169
+ doc: DoclingDocument,
170
+ parent_element: Optional[NodeItem] = None,
171
+ ):
160
172
  # Iterates over all elements in the AST
161
173
  # Check for different element types and process relevant details
162
174
  if isinstance(element, marko.block.Heading):
163
175
  self.close_table(doc)
164
176
  self.process_inline_text(parent_element, doc)
165
177
  _log.debug(
166
- f" - Heading level {element.level}, content: {element.children[0].children}"
178
+ f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
167
179
  )
168
180
  if element.level == 1:
169
181
  doc_label = DocItemLabel.TITLE
@@ -172,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
172
184
 
173
185
  # Header could have arbitrary inclusion of bold, italic or emphasis,
174
186
  # hence we need to traverse the tree to get full text of a header
175
- strings = []
187
+ strings: List[str] = []
176
188
 
177
189
  # Define a recursive function to traverse the tree
178
- def traverse(node):
190
+ def traverse(node: marko.block.BlockElement):
179
191
  # Check if the node has a "children" attribute
180
192
  if hasattr(node, "children"):
181
193
  # If "children" is a list, continue traversal
@@ -209,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
209
221
  self.process_inline_text(parent_element, doc)
210
222
  _log.debug(" - List item")
211
223
 
212
- snippet_text = str(element.children[0].children[0].children)
224
+ snippet_text = str(element.children[0].children[0].children) # type: ignore
213
225
  is_numbered = False
214
- if parent_element.label == GroupLabel.ORDERED_LIST:
226
+ if (
227
+ parent_element is not None
228
+ and isinstance(parent_element, DocItem)
229
+ and parent_element.label == GroupLabel.ORDERED_LIST
230
+ ):
215
231
  is_numbered = True
216
232
  doc.add_list_item(
217
233
  enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
221
237
  self.close_table(doc)
222
238
  self.process_inline_text(parent_element, doc)
223
239
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
224
- doc.add_picture(parent=parent_element, caption=element.title)
240
+
241
+ fig_caption: Optional[TextItem] = None
242
+ if element.title is not None and element.title != "":
243
+ fig_caption = doc.add_text(
244
+ label=DocItemLabel.CAPTION, text=element.title
245
+ )
246
+
247
+ doc.add_picture(parent=parent_element, caption=fig_caption)
225
248
 
226
249
  elif isinstance(element, marko.block.Paragraph):
227
250
  self.process_inline_text(parent_element, doc)
@@ -252,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
252
275
  self.process_inline_text(parent_element, doc)
253
276
  _log.debug(f" - Code Span: {element.children}")
254
277
  snippet_text = str(element.children).strip()
255
- doc.add_text(
256
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
257
- )
278
+ doc.add_code(parent=parent_element, text=snippet_text)
258
279
 
259
280
  elif isinstance(element, marko.block.CodeBlock):
260
281
  self.close_table(doc)
261
282
  self.process_inline_text(parent_element, doc)
262
283
  _log.debug(f" - Code Block: {element.children}")
263
- snippet_text = str(element.children[0].children).strip()
264
- doc.add_text(
265
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
266
- )
284
+ snippet_text = str(element.children[0].children).strip() # type: ignore
285
+ doc.add_code(parent=parent_element, text=snippet_text)
267
286
 
268
287
  elif isinstance(element, marko.block.FencedCode):
269
288
  self.close_table(doc)
270
289
  self.process_inline_text(parent_element, doc)
271
290
  _log.debug(f" - Code Block: {element.children}")
272
- snippet_text = str(element.children[0].children).strip()
273
- doc.add_text(
274
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
275
- )
291
+ snippet_text = str(element.children[0].children).strip() # type: ignore
292
+ doc.add_code(parent=parent_element, text=snippet_text)
276
293
 
277
294
  elif isinstance(element, marko.inline.LineBreak):
278
295
  self.process_inline_text(parent_element, doc)
@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
26
26
 
27
27
  from typing import Any, List
28
28
 
29
+ from PIL import Image as PILImage
29
30
  from pydantic import BaseModel
30
31
 
31
32
 
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
44
45
 
45
46
 
46
47
  class MsExcelDocumentBackend(DeclarativeDocumentBackend):
47
-
48
48
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
49
49
  super().__init__(in_doc, path_or_stream)
50
50
 
@@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
326
326
  self, doc: DoclingDocument, sheet: Worksheet
327
327
  ) -> DoclingDocument:
328
328
 
329
- # FIXME: mypy does not agree with _images ...
330
- """
331
- # Iterate over images in the sheet
332
- for idx, image in enumerate(sheet._images): # Access embedded images
329
+ # Iterate over byte images in the sheet
330
+ for idx, image in enumerate(sheet._images): # type: ignore
333
331
 
334
- image_bytes = BytesIO(image.ref.blob)
335
- pil_image = Image.open(image_bytes)
332
+ try:
333
+ pil_image = PILImage.open(image.ref)
336
334
 
337
- doc.add_picture(
338
- parent=self.parents[0],
339
- image=ImageRef.from_pil(image=pil_image, dpi=72),
340
- caption=None,
341
- )
342
- """
335
+ doc.add_picture(
336
+ parent=self.parents[0],
337
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
338
+ caption=None,
339
+ )
340
+ except:
341
+ _log.error("could not extract the image from excel sheets")
343
342
 
344
- # FIXME: mypy does not agree with _charts ...
345
343
  """
346
- for idx, chart in enumerate(sheet._charts): # Access embedded charts
347
- chart_path = f"chart_{idx + 1}.png"
348
- _log.info(
349
- f"Chart found, but dynamic rendering is required for: {chart_path}"
350
- )
344
+ for idx, chart in enumerate(sheet._charts): # type: ignore
345
+ try:
346
+ chart_path = f"chart_{idx + 1}.png"
347
+ _log.info(
348
+ f"Chart found, but dynamic rendering is required for: {chart_path}"
349
+ )
351
350
 
352
- _log.info(f"Chart {idx + 1}:")
353
-
354
- # Chart type
355
- _log.info(f"Type: {type(chart).__name__}")
356
-
357
- # Title
358
- if chart.title:
359
- _log.info(f"Title: {chart.title}")
360
- else:
361
- _log.info("No title")
362
-
363
- # Data series
364
- for series in chart.series:
365
- _log.info(" => series ...")
366
- _log.info(f"Data Series: {series.title}")
367
- _log.info(f"Values: {series.values}")
368
- _log.info(f"Categories: {series.categories}")
351
+ _log.info(f"Chart {idx + 1}:")
369
352
 
370
- # Position
371
- # _log.info(f"Anchor Cell: {chart.anchor}")
353
+ # Chart type
354
+ # _log.info(f"Type: {type(chart).__name__}")
355
+ print(f"Type: {type(chart).__name__}")
356
+
357
+ # Extract series data
358
+ for series_idx, series in enumerate(chart.series):
359
+ #_log.info(f"Series {series_idx + 1}:")
360
+ print(f"Series {series_idx + 1} type: {type(series).__name__}")
361
+ #print(f"x-values: {series.xVal}")
362
+ #print(f"y-values: {series.yVal}")
363
+
364
+ print(f"xval type: {type(series.xVal).__name__}")
365
+
366
+ xvals = []
367
+ for _ in series.xVal.numLit.pt:
368
+ print(f"xval type: {type(_).__name__}")
369
+ if hasattr(_, 'v'):
370
+ xvals.append(_.v)
371
+
372
+ print(f"x-values: {xvals}")
373
+
374
+ yvals = []
375
+ for _ in series.yVal:
376
+ if hasattr(_, 'v'):
377
+ yvals.append(_.v)
378
+
379
+ print(f"y-values: {yvals}")
380
+
381
+ except Exception as exc:
382
+ print(exc)
383
+ continue
372
384
  """
373
385
 
374
386
  return doc
@@ -26,7 +26,6 @@ _log = logging.getLogger(__name__)
26
26
 
27
27
 
28
28
  class MsWordDocumentBackend(DeclarativeDocumentBackend):
29
-
30
29
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
31
30
  super().__init__(in_doc, path_or_stream)
32
31
  self.XML_KEY = (
@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
12
12
 
13
13
 
14
14
  class PdfPageBackend(ABC):
15
-
16
15
  @abstractmethod
17
16
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
18
17
  pass
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
45
44
 
46
45
 
47
46
  class PdfDocumentBackend(PaginatedDocumentBackend):
48
-
49
47
  def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
50
48
  super().__init__(in_doc, path_or_stream)
51
49
 
@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
210
210
  l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
211
211
  )
212
212
  else:
213
- padbox = cropbox.to_bottom_left_origin(page_size.height)
213
+ padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
214
214
  padbox.r = page_size.width - padbox.r
215
215
  padbox.t = page_size.height - padbox.t
216
216
 
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
  from docling_core.types.doc import (
5
5
  BoundingBox,
6
6
  DocItemLabel,
7
+ NodeItem,
7
8
  PictureDataType,
8
9
  Size,
9
10
  TableCell,
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
40
41
  MD = "md"
41
42
  XLSX = "xlsx"
42
43
  XML_USPTO = "xml_uspto"
44
+ JSON_DOCLING = "json_docling"
43
45
 
44
46
 
45
47
  class OutputFormat(str, Enum):
@@ -61,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
61
63
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
62
64
  InputFormat.XLSX: ["xlsx"],
63
65
  InputFormat.XML_USPTO: ["xml", "txt"],
66
+ InputFormat.JSON_DOCLING: ["json"],
64
67
  }
65
68
 
66
69
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -89,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
89
92
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
90
93
  ],
91
94
  InputFormat.XML_USPTO: ["application/xml", "text/plain"],
95
+ InputFormat.JSON_DOCLING: ["application/json"],
92
96
  }
93
97
 
94
98
  MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -201,6 +205,13 @@ class AssembledUnit(BaseModel):
201
205
  headers: List[PageElement] = []
202
206
 
203
207
 
208
+ class ItemAndImageEnrichmentElement(BaseModel):
209
+ model_config = ConfigDict(arbitrary_types_allowed=True)
210
+
211
+ item: NodeItem
212
+ image: Image
213
+
214
+
204
215
  class Page(BaseModel):
205
216
  model_config = ConfigDict(arbitrary_types_allowed=True)
206
217
 
@@ -219,12 +230,28 @@ class Page(BaseModel):
219
230
  {}
220
231
  ) # Cache of images in different scales. By default it is cleared during assembling.
221
232
 
222
- def get_image(self, scale: float = 1.0) -> Optional[Image]:
233
+ def get_image(
234
+ self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
235
+ ) -> Optional[Image]:
223
236
  if self._backend is None:
224
237
  return self._image_cache.get(scale, None)
238
+
225
239
  if not scale in self._image_cache:
226
- self._image_cache[scale] = self._backend.get_page_image(scale=scale)
227
- return self._image_cache[scale]
240
+ if cropbox is None:
241
+ self._image_cache[scale] = self._backend.get_page_image(scale=scale)
242
+ else:
243
+ return self._backend.get_page_image(scale=scale, cropbox=cropbox)
244
+
245
+ if cropbox is None:
246
+ return self._image_cache[scale]
247
+ else:
248
+ page_im = self._image_cache[scale]
249
+ assert self.size is not None
250
+ return page_im.crop(
251
+ cropbox.to_top_left_origin(page_height=self.size.height)
252
+ .scaled(scale=scale)
253
+ .as_tuple()
254
+ )
228
255
 
229
256
  @property
230
257
  def image(self) -> Optional[Image]:
@@ -350,6 +350,8 @@ class _DocumentConversionInput(BaseModel):
350
350
  mime = FormatToMimeType[InputFormat.HTML][0]
351
351
  elif ext in FormatToExtensions[InputFormat.MD]:
352
352
  mime = FormatToMimeType[InputFormat.MD][0]
353
+ elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
354
+ mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
353
355
  return mime
354
356
 
355
357
  @staticmethod
@@ -1,17 +1,11 @@
1
1
  import logging
2
2
  import os
3
- import warnings
4
3
  from enum import Enum
5
4
  from pathlib import Path
6
- from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
5
+ from typing import Any, List, Literal, Optional, Union
7
6
 
8
- from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
9
- from pydantic_settings import (
10
- BaseSettings,
11
- PydanticBaseSettingsSource,
12
- SettingsConfigDict,
13
- )
14
- from typing_extensions import deprecated
7
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
15
9
 
16
10
  _log = logging.getLogger(__name__)
17
11
 
@@ -225,6 +219,9 @@ class PdfPipelineOptions(PipelineOptions):
225
219
  artifacts_path: Optional[Union[Path, str]] = None
226
220
  do_table_structure: bool = True # True: perform table structure extraction
227
221
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
222
+ do_code_enrichment: bool = False # True: perform code OCR
223
+ do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
224
+ do_picture_classification: bool = False # True: classify pictures in documents
228
225
 
229
226
  table_structure_options: TableStructureOptions = TableStructureOptions()
230
227
  ocr_options: Union[
@@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
11
11
  from docling.backend.asciidoc_backend import AsciiDocBackend
12
12
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
+ from docling.backend.json.docling_json_backend import DoclingJSONBackend
14
15
  from docling.backend.md_backend import MarkdownDocumentBackend
15
16
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
17
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
@@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
136
137
  InputFormat.PDF: FormatOption(
137
138
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
138
139
  ),
140
+ InputFormat.JSON_DOCLING: FormatOption(
141
+ pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
142
+ ),
139
143
  }
140
144
  if (options := format_to_default_options.get(format)) is not None:
141
145
  return options