docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. docling/backend/abstract_backend.py +0 -1
  2. docling/backend/asciidoc_backend.py +0 -1
  3. docling/backend/docling_parse_backend.py +1 -1
  4. docling/backend/docling_parse_v2_backend.py +1 -1
  5. docling/backend/html_backend.py +4 -3
  6. docling/backend/json/__init__.py +0 -0
  7. docling/backend/json/docling_json_backend.py +58 -0
  8. docling/backend/md_backend.py +49 -36
  9. docling/backend/msexcel_backend.py +50 -38
  10. docling/backend/msword_backend.py +0 -1
  11. docling/backend/pdf_backend.py +0 -2
  12. docling/backend/pypdfium2_backend.py +1 -1
  13. docling/backend/xml/uspto_backend.py +25 -25
  14. docling/cli/main.py +18 -3
  15. docling/datamodel/base_models.py +30 -3
  16. docling/datamodel/document.py +4 -0
  17. docling/datamodel/pipeline_options.py +7 -9
  18. docling/document_converter.py +4 -0
  19. docling/models/base_model.py +62 -6
  20. docling/models/code_formula_model.py +245 -0
  21. docling/models/document_picture_classifier.py +187 -0
  22. docling/models/layout_model.py +10 -86
  23. docling/models/page_assemble_model.py +1 -33
  24. docling/models/rapid_ocr_model.py +1 -0
  25. docling/models/tesseract_ocr_cli_model.py +72 -5
  26. docling/models/tesseract_ocr_model.py +68 -20
  27. docling/pipeline/base_pipeline.py +40 -17
  28. docling/pipeline/standard_pdf_pipeline.py +31 -2
  29. docling/utils/glm_utils.py +4 -1
  30. docling/utils/ocr_utils.py +9 -0
  31. docling/utils/visualization.py +80 -0
  32. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
  33. docling-2.17.0.dist-info/RECORD +62 -0
  34. docling-2.15.1.dist-info/RECORD +0 -56
  35. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
  36. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
  37. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0
@@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
27
27
  def supports_pagination(cls) -> bool:
28
28
  pass
29
29
 
30
- @abstractmethod
31
30
  def unload(self):
32
31
  if isinstance(self.path_or_stream, BytesIO):
33
32
  self.path_or_stream.close()
@@ -24,7 +24,6 @@ _log = logging.getLogger(__name__)
24
24
 
25
25
 
26
26
  class AsciiDocBackend(DeclarativeDocumentBackend):
27
-
28
27
  def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
29
28
  super().__init__(in_doc, path_or_stream)
30
29
 
@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
163
163
  l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
164
164
  )
165
165
  else:
166
- padbox = cropbox.to_bottom_left_origin(page_size.height)
166
+ padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
167
167
  padbox.r = page_size.width - padbox.r
168
168
  padbox.t = page_size.height - padbox.t
169
169
 
@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
178
178
  l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
179
179
  )
180
180
  else:
181
- padbox = cropbox.to_bottom_left_origin(page_size.height)
181
+ padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
182
182
  padbox.r = page_size.width - padbox.r
183
183
  padbox.t = page_size.height - padbox.t
184
184
 
@@ -78,10 +78,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
78
78
 
79
79
  if self.is_valid():
80
80
  assert self.soup is not None
81
+ content = self.soup.body or self.soup
81
82
  # Replace <br> tags with newline characters
82
- for br in self.soup.body.find_all("br"):
83
+ for br in content.find_all("br"):
83
84
  br.replace_with("\n")
84
- doc = self.walk(self.soup.body, doc)
85
+ doc = self.walk(content, doc)
85
86
  else:
86
87
  raise RuntimeError(
87
88
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
@@ -215,7 +216,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
215
216
  label = DocItemLabel.CODE
216
217
  if len(text) == 0:
217
218
  return
218
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
219
+ doc.add_code(parent=self.parents[self.level], label=label, text=text)
219
220
 
220
221
  def handle_paragraph(self, element, idx, doc):
221
222
  """Handles paragraph tags (p)."""
docling/backend/json/__init__.py: file without changes
@@ -0,0 +1,58 @@
1
+ from io import BytesIO
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from docling_core.types.doc import DoclingDocument
6
+ from typing_extensions import override
7
+
8
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
11
+
12
+
13
+ class DoclingJSONBackend(DeclarativeDocumentBackend):
14
+ @override
15
+ def __init__(
16
+ self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
17
+ ) -> None:
18
+ super().__init__(in_doc, path_or_stream)
19
+
20
+ # given we need to store any actual conversion exception for raising it from
21
+ # convert(), this captures the successful result or the actual error in a
22
+ # mutually exclusive way:
23
+ self._doc_or_err = self._get_doc_or_err()
24
+
25
+ @override
26
+ def is_valid(self) -> bool:
27
+ return isinstance(self._doc_or_err, DoclingDocument)
28
+
29
+ @classmethod
30
+ @override
31
+ def supports_pagination(cls) -> bool:
32
+ return False
33
+
34
+ @classmethod
35
+ @override
36
+ def supported_formats(cls) -> set[InputFormat]:
37
+ return {InputFormat.JSON_DOCLING}
38
+
39
+ def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
40
+ try:
41
+ json_data: Union[str, bytes]
42
+ if isinstance(self.path_or_stream, Path):
43
+ with open(self.path_or_stream, encoding="utf-8") as f:
44
+ json_data = f.read()
45
+ elif isinstance(self.path_or_stream, BytesIO):
46
+ json_data = self.path_or_stream.getvalue()
47
+ else:
48
+ raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
49
+ return DoclingDocument.model_validate_json(json_data=json_data)
50
+ except Exception as e:
51
+ return e
52
+
53
+ @override
54
+ def convert(self) -> DoclingDocument:
55
+ if isinstance(self._doc_or_err, DoclingDocument):
56
+ return self._doc_or_err
57
+ else:
58
+ raise self._doc_or_err
@@ -3,19 +3,22 @@ import re
3
3
  import warnings
4
4
  from io import BytesIO
5
5
  from pathlib import Path
6
- from typing import Set, Union
6
+ from typing import List, Optional, Set, Union
7
7
 
8
8
  import marko
9
9
  import marko.ext
10
10
  import marko.ext.gfm
11
11
  import marko.inline
12
12
  from docling_core.types.doc import (
13
+ DocItem,
13
14
  DocItemLabel,
14
15
  DoclingDocument,
15
16
  DocumentOrigin,
16
17
  GroupLabel,
18
+ NodeItem,
17
19
  TableCell,
18
20
  TableData,
21
+ TextItem,
19
22
  )
20
23
  from marko import Markdown
21
24
 
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
27
30
 
28
31
 
29
32
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
30
-
31
- def shorten_underscore_sequences(self, markdown_text, max_length=10):
33
+ def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
32
34
  # This regex will match any sequence of underscores
33
35
  pattern = r"_+"
34
36
 
@@ -63,7 +65,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
63
65
 
64
66
  self.in_table = False
65
67
  self.md_table_buffer: list[str] = []
66
- self.inline_text_buffer = ""
68
+ self.inline_texts: list[str] = []
67
69
 
68
70
  try:
69
71
  if isinstance(self.path_or_stream, BytesIO):
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
90
92
  ) from e
91
93
  return
92
94
 
93
- def close_table(self, doc=None):
95
+ def close_table(self, doc: DoclingDocument):
94
96
  if self.in_table:
95
97
  _log.debug("=== TABLE START ===")
96
98
  for md_table_row in self.md_table_buffer:
97
99
  _log.debug(md_table_row)
98
100
  _log.debug("=== TABLE END ===")
99
- tcells = []
101
+ tcells: List[TableCell] = []
100
102
  result_table = []
101
103
  for n, md_table_row in enumerate(self.md_table_buffer):
102
104
  data = []
@@ -137,33 +139,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
137
139
  self.in_table = False
138
140
  self.md_table_buffer = [] # clean table markdown buffer
139
141
  # Initialize Docling TableData
140
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
142
+ table_data = TableData(
143
+ num_rows=num_rows, num_cols=num_cols, table_cells=tcells
144
+ )
141
145
  # Populate
142
146
  for tcell in tcells:
143
- data.table_cells.append(tcell)
147
+ table_data.table_cells.append(tcell)
144
148
  if len(tcells) > 0:
145
- doc.add_table(data=data)
149
+ doc.add_table(data=table_data)
146
150
  return
147
151
 
148
- def process_inline_text(self, parent_element, doc=None):
149
- # self.inline_text_buffer += str(text_in)
150
- txt = self.inline_text_buffer.strip()
152
+ def process_inline_text(
153
+ self, parent_element: Optional[NodeItem], doc: DoclingDocument
154
+ ):
155
+ txt = " ".join(self.inline_texts)
151
156
  if len(txt) > 0:
152
157
  doc.add_text(
153
158
  label=DocItemLabel.PARAGRAPH,
154
159
  parent=parent_element,
155
160
  text=txt,
156
161
  )
157
- self.inline_text_buffer = ""
158
-
159
- def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
162
+ self.inline_texts = []
163
+
164
+ def iterate_elements(
165
+ self,
166
+ element: marko.block.Element,
167
+ depth: int,
168
+ doc: DoclingDocument,
169
+ parent_element: Optional[NodeItem] = None,
170
+ ):
160
171
  # Iterates over all elements in the AST
161
172
  # Check for different element types and process relevant details
162
173
  if isinstance(element, marko.block.Heading):
163
174
  self.close_table(doc)
164
175
  self.process_inline_text(parent_element, doc)
165
176
  _log.debug(
166
- f" - Heading level {element.level}, content: {element.children[0].children}"
177
+ f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
167
178
  )
168
179
  if element.level == 1:
169
180
  doc_label = DocItemLabel.TITLE
@@ -172,10 +183,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
172
183
 
173
184
  # Header could have arbitrary inclusion of bold, italic or emphasis,
174
185
  # hence we need to traverse the tree to get full text of a header
175
- strings = []
186
+ strings: List[str] = []
176
187
 
177
188
  # Define a recursive function to traverse the tree
178
- def traverse(node):
189
+ def traverse(node: marko.block.BlockElement):
179
190
  # Check if the node has a "children" attribute
180
191
  if hasattr(node, "children"):
181
192
  # If "children" is a list, continue traversal
@@ -209,9 +220,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
209
220
  self.process_inline_text(parent_element, doc)
210
221
  _log.debug(" - List item")
211
222
 
212
- snippet_text = str(element.children[0].children[0].children)
223
+ snippet_text = str(element.children[0].children[0].children) # type: ignore
213
224
  is_numbered = False
214
- if parent_element.label == GroupLabel.ORDERED_LIST:
225
+ if (
226
+ parent_element is not None
227
+ and isinstance(parent_element, DocItem)
228
+ and parent_element.label == GroupLabel.ORDERED_LIST
229
+ ):
215
230
  is_numbered = True
216
231
  doc.add_list_item(
217
232
  enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +236,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
221
236
  self.close_table(doc)
222
237
  self.process_inline_text(parent_element, doc)
223
238
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
224
- doc.add_picture(parent=parent_element, caption=element.title)
239
+
240
+ fig_caption: Optional[TextItem] = None
241
+ if element.title is not None and element.title != "":
242
+ fig_caption = doc.add_text(
243
+ label=DocItemLabel.CAPTION, text=element.title
244
+ )
245
+
246
+ doc.add_picture(parent=parent_element, caption=fig_caption)
225
247
 
226
248
  elif isinstance(element, marko.block.Paragraph):
227
249
  self.process_inline_text(parent_element, doc)
@@ -243,39 +265,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
243
265
  self.close_table(doc)
244
266
  self.in_table = False
245
267
  # most likely just inline text
246
- self.inline_text_buffer += str(
247
- element.children
248
- ) # do not strip an inline text, as it may contain important spaces
268
+ self.inline_texts.append(str(element.children))
249
269
 
250
270
  elif isinstance(element, marko.inline.CodeSpan):
251
271
  self.close_table(doc)
252
272
  self.process_inline_text(parent_element, doc)
253
273
  _log.debug(f" - Code Span: {element.children}")
254
274
  snippet_text = str(element.children).strip()
255
- doc.add_text(
256
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
257
- )
275
+ doc.add_code(parent=parent_element, text=snippet_text)
258
276
 
259
277
  elif isinstance(element, marko.block.CodeBlock):
260
278
  self.close_table(doc)
261
279
  self.process_inline_text(parent_element, doc)
262
280
  _log.debug(f" - Code Block: {element.children}")
263
- snippet_text = str(element.children[0].children).strip()
264
- doc.add_text(
265
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
266
- )
281
+ snippet_text = str(element.children[0].children).strip() # type: ignore
282
+ doc.add_code(parent=parent_element, text=snippet_text)
267
283
 
268
284
  elif isinstance(element, marko.block.FencedCode):
269
285
  self.close_table(doc)
270
286
  self.process_inline_text(parent_element, doc)
271
287
  _log.debug(f" - Code Block: {element.children}")
272
- snippet_text = str(element.children[0].children).strip()
273
- doc.add_text(
274
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
275
- )
288
+ snippet_text = str(element.children[0].children).strip() # type: ignore
289
+ doc.add_code(parent=parent_element, text=snippet_text)
276
290
 
277
291
  elif isinstance(element, marko.inline.LineBreak):
278
- self.process_inline_text(parent_element, doc)
279
292
  if self.in_table:
280
293
  _log.debug("Line break in a table")
281
294
  self.md_table_buffer.append("")
@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
26
26
 
27
27
  from typing import Any, List
28
28
 
29
+ from PIL import Image as PILImage
29
30
  from pydantic import BaseModel
30
31
 
31
32
 
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
44
45
 
45
46
 
46
47
  class MsExcelDocumentBackend(DeclarativeDocumentBackend):
47
-
48
48
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
49
49
  super().__init__(in_doc, path_or_stream)
50
50
 
@@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
326
326
  self, doc: DoclingDocument, sheet: Worksheet
327
327
  ) -> DoclingDocument:
328
328
 
329
- # FIXME: mypy does not agree with _images ...
330
- """
331
- # Iterate over images in the sheet
332
- for idx, image in enumerate(sheet._images): # Access embedded images
329
+ # Iterate over byte images in the sheet
330
+ for idx, image in enumerate(sheet._images): # type: ignore
333
331
 
334
- image_bytes = BytesIO(image.ref.blob)
335
- pil_image = Image.open(image_bytes)
332
+ try:
333
+ pil_image = PILImage.open(image.ref)
336
334
 
337
- doc.add_picture(
338
- parent=self.parents[0],
339
- image=ImageRef.from_pil(image=pil_image, dpi=72),
340
- caption=None,
341
- )
342
- """
335
+ doc.add_picture(
336
+ parent=self.parents[0],
337
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
338
+ caption=None,
339
+ )
340
+ except:
341
+ _log.error("could not extract the image from excel sheets")
343
342
 
344
- # FIXME: mypy does not agree with _charts ...
345
343
  """
346
- for idx, chart in enumerate(sheet._charts): # Access embedded charts
347
- chart_path = f"chart_{idx + 1}.png"
348
- _log.info(
349
- f"Chart found, but dynamic rendering is required for: {chart_path}"
350
- )
344
+ for idx, chart in enumerate(sheet._charts): # type: ignore
345
+ try:
346
+ chart_path = f"chart_{idx + 1}.png"
347
+ _log.info(
348
+ f"Chart found, but dynamic rendering is required for: {chart_path}"
349
+ )
351
350
 
352
- _log.info(f"Chart {idx + 1}:")
353
-
354
- # Chart type
355
- _log.info(f"Type: {type(chart).__name__}")
356
-
357
- # Title
358
- if chart.title:
359
- _log.info(f"Title: {chart.title}")
360
- else:
361
- _log.info("No title")
362
-
363
- # Data series
364
- for series in chart.series:
365
- _log.info(" => series ...")
366
- _log.info(f"Data Series: {series.title}")
367
- _log.info(f"Values: {series.values}")
368
- _log.info(f"Categories: {series.categories}")
351
+ _log.info(f"Chart {idx + 1}:")
369
352
 
370
- # Position
371
- # _log.info(f"Anchor Cell: {chart.anchor}")
353
+ # Chart type
354
+ # _log.info(f"Type: {type(chart).__name__}")
355
+ print(f"Type: {type(chart).__name__}")
356
+
357
+ # Extract series data
358
+ for series_idx, series in enumerate(chart.series):
359
+ #_log.info(f"Series {series_idx + 1}:")
360
+ print(f"Series {series_idx + 1} type: {type(series).__name__}")
361
+ #print(f"x-values: {series.xVal}")
362
+ #print(f"y-values: {series.yVal}")
363
+
364
+ print(f"xval type: {type(series.xVal).__name__}")
365
+
366
+ xvals = []
367
+ for _ in series.xVal.numLit.pt:
368
+ print(f"xval type: {type(_).__name__}")
369
+ if hasattr(_, 'v'):
370
+ xvals.append(_.v)
371
+
372
+ print(f"x-values: {xvals}")
373
+
374
+ yvals = []
375
+ for _ in series.yVal:
376
+ if hasattr(_, 'v'):
377
+ yvals.append(_.v)
378
+
379
+ print(f"y-values: {yvals}")
380
+
381
+ except Exception as exc:
382
+ print(exc)
383
+ continue
372
384
  """
373
385
 
374
386
  return doc
@@ -26,7 +26,6 @@ _log = logging.getLogger(__name__)
26
26
 
27
27
 
28
28
  class MsWordDocumentBackend(DeclarativeDocumentBackend):
29
-
30
29
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
31
30
  super().__init__(in_doc, path_or_stream)
32
31
  self.XML_KEY = (
@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
12
12
 
13
13
 
14
14
  class PdfPageBackend(ABC):
15
-
16
15
  @abstractmethod
17
16
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
18
17
  pass
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
45
44
 
46
45
 
47
46
  class PdfDocumentBackend(PaginatedDocumentBackend):
48
-
49
47
  def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
50
48
  super().__init__(in_doc, path_or_stream)
51
49
 
@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
210
210
  l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
211
211
  )
212
212
  else:
213
- padbox = cropbox.to_bottom_left_origin(page_size.height)
213
+ padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
214
214
  padbox.r = page_size.width - padbox.r
215
215
  padbox.t = page_size.height - padbox.t
216
216