docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +0 -1
- docling/backend/asciidoc_backend.py +0 -1
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +1 -1
- docling/backend/html_backend.py +4 -3
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +49 -36
- docling/backend/msexcel_backend.py +50 -38
- docling/backend/msword_backend.py +0 -1
- docling/backend/pdf_backend.py +0 -2
- docling/backend/pypdfium2_backend.py +1 -1
- docling/backend/xml/uspto_backend.py +25 -25
- docling/cli/main.py +18 -3
- docling/datamodel/base_models.py +30 -3
- docling/datamodel/document.py +4 -0
- docling/datamodel/pipeline_options.py +7 -9
- docling/document_converter.py +4 -0
- docling/models/base_model.py +62 -6
- docling/models/code_formula_model.py +245 -0
- docling/models/document_picture_classifier.py +187 -0
- docling/models/layout_model.py +10 -86
- docling/models/page_assemble_model.py +1 -33
- docling/models/rapid_ocr_model.py +1 -0
- docling/models/tesseract_ocr_cli_model.py +72 -5
- docling/models/tesseract_ocr_model.py +68 -20
- docling/pipeline/base_pipeline.py +40 -17
- docling/pipeline/standard_pdf_pipeline.py +31 -2
- docling/utils/glm_utils.py +4 -1
- docling/utils/ocr_utils.py +9 -0
- docling/utils/visualization.py +80 -0
- {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
- docling-2.17.0.dist-info/RECORD +62 -0
- docling-2.15.1.dist-info/RECORD +0 -56
- {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
- {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
- {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0
@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
163
163
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
164
164
|
)
|
165
165
|
else:
|
166
|
-
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
166
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
167
167
|
padbox.r = page_size.width - padbox.r
|
168
168
|
padbox.t = page_size.height - padbox.t
|
169
169
|
|
@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
178
178
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
179
179
|
)
|
180
180
|
else:
|
181
|
-
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
181
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
182
182
|
padbox.r = page_size.width - padbox.r
|
183
183
|
padbox.t = page_size.height - padbox.t
|
184
184
|
|
docling/backend/html_backend.py
CHANGED
@@ -78,10 +78,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
78
78
|
|
79
79
|
if self.is_valid():
|
80
80
|
assert self.soup is not None
|
81
|
+
content = self.soup.body or self.soup
|
81
82
|
# Replace <br> tags with newline characters
|
82
|
-
for br in
|
83
|
+
for br in content.find_all("br"):
|
83
84
|
br.replace_with("\n")
|
84
|
-
doc = self.walk(
|
85
|
+
doc = self.walk(content, doc)
|
85
86
|
else:
|
86
87
|
raise RuntimeError(
|
87
88
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
@@ -215,7 +216,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
215
216
|
label = DocItemLabel.CODE
|
216
217
|
if len(text) == 0:
|
217
218
|
return
|
218
|
-
doc.
|
219
|
+
doc.add_code(parent=self.parents[self.level], label=label, text=text)
|
219
220
|
|
220
221
|
def handle_paragraph(self, element, idx, doc):
|
221
222
|
"""Handles paragraph tags (p)."""
|
File without changes
|
@@ -0,0 +1,58 @@
|
|
1
|
+
from io import BytesIO
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Union
|
4
|
+
|
5
|
+
from docling_core.types.doc import DoclingDocument
|
6
|
+
from typing_extensions import override
|
7
|
+
|
8
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
9
|
+
from docling.datamodel.base_models import InputFormat
|
10
|
+
from docling.datamodel.document import InputDocument
|
11
|
+
|
12
|
+
|
13
|
+
class DoclingJSONBackend(DeclarativeDocumentBackend):
|
14
|
+
@override
|
15
|
+
def __init__(
|
16
|
+
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
|
17
|
+
) -> None:
|
18
|
+
super().__init__(in_doc, path_or_stream)
|
19
|
+
|
20
|
+
# given we need to store any actual conversion exception for raising it from
|
21
|
+
# convert(), this captures the successful result or the actual error in a
|
22
|
+
# mutually exclusive way:
|
23
|
+
self._doc_or_err = self._get_doc_or_err()
|
24
|
+
|
25
|
+
@override
|
26
|
+
def is_valid(self) -> bool:
|
27
|
+
return isinstance(self._doc_or_err, DoclingDocument)
|
28
|
+
|
29
|
+
@classmethod
|
30
|
+
@override
|
31
|
+
def supports_pagination(cls) -> bool:
|
32
|
+
return False
|
33
|
+
|
34
|
+
@classmethod
|
35
|
+
@override
|
36
|
+
def supported_formats(cls) -> set[InputFormat]:
|
37
|
+
return {InputFormat.JSON_DOCLING}
|
38
|
+
|
39
|
+
def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
|
40
|
+
try:
|
41
|
+
json_data: Union[str, bytes]
|
42
|
+
if isinstance(self.path_or_stream, Path):
|
43
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
44
|
+
json_data = f.read()
|
45
|
+
elif isinstance(self.path_or_stream, BytesIO):
|
46
|
+
json_data = self.path_or_stream.getvalue()
|
47
|
+
else:
|
48
|
+
raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
|
49
|
+
return DoclingDocument.model_validate_json(json_data=json_data)
|
50
|
+
except Exception as e:
|
51
|
+
return e
|
52
|
+
|
53
|
+
@override
|
54
|
+
def convert(self) -> DoclingDocument:
|
55
|
+
if isinstance(self._doc_or_err, DoclingDocument):
|
56
|
+
return self._doc_or_err
|
57
|
+
else:
|
58
|
+
raise self._doc_or_err
|
docling/backend/md_backend.py
CHANGED
@@ -3,19 +3,22 @@ import re
|
|
3
3
|
import warnings
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Set, Union
|
6
|
+
from typing import List, Optional, Set, Union
|
7
7
|
|
8
8
|
import marko
|
9
9
|
import marko.ext
|
10
10
|
import marko.ext.gfm
|
11
11
|
import marko.inline
|
12
12
|
from docling_core.types.doc import (
|
13
|
+
DocItem,
|
13
14
|
DocItemLabel,
|
14
15
|
DoclingDocument,
|
15
16
|
DocumentOrigin,
|
16
17
|
GroupLabel,
|
18
|
+
NodeItem,
|
17
19
|
TableCell,
|
18
20
|
TableData,
|
21
|
+
TextItem,
|
19
22
|
)
|
20
23
|
from marko import Markdown
|
21
24
|
|
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
|
|
27
30
|
|
28
31
|
|
29
32
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
30
|
-
|
31
|
-
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
33
|
+
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
32
34
|
# This regex will match any sequence of underscores
|
33
35
|
pattern = r"_+"
|
34
36
|
|
@@ -63,7 +65,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
63
65
|
|
64
66
|
self.in_table = False
|
65
67
|
self.md_table_buffer: list[str] = []
|
66
|
-
self.
|
68
|
+
self.inline_texts: list[str] = []
|
67
69
|
|
68
70
|
try:
|
69
71
|
if isinstance(self.path_or_stream, BytesIO):
|
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
90
92
|
) from e
|
91
93
|
return
|
92
94
|
|
93
|
-
def close_table(self, doc
|
95
|
+
def close_table(self, doc: DoclingDocument):
|
94
96
|
if self.in_table:
|
95
97
|
_log.debug("=== TABLE START ===")
|
96
98
|
for md_table_row in self.md_table_buffer:
|
97
99
|
_log.debug(md_table_row)
|
98
100
|
_log.debug("=== TABLE END ===")
|
99
|
-
tcells = []
|
101
|
+
tcells: List[TableCell] = []
|
100
102
|
result_table = []
|
101
103
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
102
104
|
data = []
|
@@ -137,33 +139,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
137
139
|
self.in_table = False
|
138
140
|
self.md_table_buffer = [] # clean table markdown buffer
|
139
141
|
# Initialize Docling TableData
|
140
|
-
|
142
|
+
table_data = TableData(
|
143
|
+
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
|
144
|
+
)
|
141
145
|
# Populate
|
142
146
|
for tcell in tcells:
|
143
|
-
|
147
|
+
table_data.table_cells.append(tcell)
|
144
148
|
if len(tcells) > 0:
|
145
|
-
doc.add_table(data=
|
149
|
+
doc.add_table(data=table_data)
|
146
150
|
return
|
147
151
|
|
148
|
-
def process_inline_text(
|
149
|
-
|
150
|
-
|
152
|
+
def process_inline_text(
|
153
|
+
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
154
|
+
):
|
155
|
+
txt = " ".join(self.inline_texts)
|
151
156
|
if len(txt) > 0:
|
152
157
|
doc.add_text(
|
153
158
|
label=DocItemLabel.PARAGRAPH,
|
154
159
|
parent=parent_element,
|
155
160
|
text=txt,
|
156
161
|
)
|
157
|
-
self.
|
158
|
-
|
159
|
-
def iterate_elements(
|
162
|
+
self.inline_texts = []
|
163
|
+
|
164
|
+
def iterate_elements(
|
165
|
+
self,
|
166
|
+
element: marko.block.Element,
|
167
|
+
depth: int,
|
168
|
+
doc: DoclingDocument,
|
169
|
+
parent_element: Optional[NodeItem] = None,
|
170
|
+
):
|
160
171
|
# Iterates over all elements in the AST
|
161
172
|
# Check for different element types and process relevant details
|
162
173
|
if isinstance(element, marko.block.Heading):
|
163
174
|
self.close_table(doc)
|
164
175
|
self.process_inline_text(parent_element, doc)
|
165
176
|
_log.debug(
|
166
|
-
f" - Heading level {element.level}, content: {element.children[0].children}"
|
177
|
+
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
167
178
|
)
|
168
179
|
if element.level == 1:
|
169
180
|
doc_label = DocItemLabel.TITLE
|
@@ -172,10 +183,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
172
183
|
|
173
184
|
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
174
185
|
# hence we need to traverse the tree to get full text of a header
|
175
|
-
strings = []
|
186
|
+
strings: List[str] = []
|
176
187
|
|
177
188
|
# Define a recursive function to traverse the tree
|
178
|
-
def traverse(node):
|
189
|
+
def traverse(node: marko.block.BlockElement):
|
179
190
|
# Check if the node has a "children" attribute
|
180
191
|
if hasattr(node, "children"):
|
181
192
|
# If "children" is a list, continue traversal
|
@@ -209,9 +220,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
209
220
|
self.process_inline_text(parent_element, doc)
|
210
221
|
_log.debug(" - List item")
|
211
222
|
|
212
|
-
snippet_text = str(element.children[0].children[0].children)
|
223
|
+
snippet_text = str(element.children[0].children[0].children) # type: ignore
|
213
224
|
is_numbered = False
|
214
|
-
if
|
225
|
+
if (
|
226
|
+
parent_element is not None
|
227
|
+
and isinstance(parent_element, DocItem)
|
228
|
+
and parent_element.label == GroupLabel.ORDERED_LIST
|
229
|
+
):
|
215
230
|
is_numbered = True
|
216
231
|
doc.add_list_item(
|
217
232
|
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
@@ -221,7 +236,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
221
236
|
self.close_table(doc)
|
222
237
|
self.process_inline_text(parent_element, doc)
|
223
238
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
224
|
-
|
239
|
+
|
240
|
+
fig_caption: Optional[TextItem] = None
|
241
|
+
if element.title is not None and element.title != "":
|
242
|
+
fig_caption = doc.add_text(
|
243
|
+
label=DocItemLabel.CAPTION, text=element.title
|
244
|
+
)
|
245
|
+
|
246
|
+
doc.add_picture(parent=parent_element, caption=fig_caption)
|
225
247
|
|
226
248
|
elif isinstance(element, marko.block.Paragraph):
|
227
249
|
self.process_inline_text(parent_element, doc)
|
@@ -243,39 +265,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
243
265
|
self.close_table(doc)
|
244
266
|
self.in_table = False
|
245
267
|
# most likely just inline text
|
246
|
-
self.
|
247
|
-
element.children
|
248
|
-
) # do not strip an inline text, as it may contain important spaces
|
268
|
+
self.inline_texts.append(str(element.children))
|
249
269
|
|
250
270
|
elif isinstance(element, marko.inline.CodeSpan):
|
251
271
|
self.close_table(doc)
|
252
272
|
self.process_inline_text(parent_element, doc)
|
253
273
|
_log.debug(f" - Code Span: {element.children}")
|
254
274
|
snippet_text = str(element.children).strip()
|
255
|
-
doc.
|
256
|
-
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
257
|
-
)
|
275
|
+
doc.add_code(parent=parent_element, text=snippet_text)
|
258
276
|
|
259
277
|
elif isinstance(element, marko.block.CodeBlock):
|
260
278
|
self.close_table(doc)
|
261
279
|
self.process_inline_text(parent_element, doc)
|
262
280
|
_log.debug(f" - Code Block: {element.children}")
|
263
|
-
snippet_text = str(element.children[0].children).strip()
|
264
|
-
doc.
|
265
|
-
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
266
|
-
)
|
281
|
+
snippet_text = str(element.children[0].children).strip() # type: ignore
|
282
|
+
doc.add_code(parent=parent_element, text=snippet_text)
|
267
283
|
|
268
284
|
elif isinstance(element, marko.block.FencedCode):
|
269
285
|
self.close_table(doc)
|
270
286
|
self.process_inline_text(parent_element, doc)
|
271
287
|
_log.debug(f" - Code Block: {element.children}")
|
272
|
-
snippet_text = str(element.children[0].children).strip()
|
273
|
-
doc.
|
274
|
-
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
275
|
-
)
|
288
|
+
snippet_text = str(element.children[0].children).strip() # type: ignore
|
289
|
+
doc.add_code(parent=parent_element, text=snippet_text)
|
276
290
|
|
277
291
|
elif isinstance(element, marko.inline.LineBreak):
|
278
|
-
self.process_inline_text(parent_element, doc)
|
279
292
|
if self.in_table:
|
280
293
|
_log.debug("Line break in a table")
|
281
294
|
self.md_table_buffer.append("")
|
@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
|
|
26
26
|
|
27
27
|
from typing import Any, List
|
28
28
|
|
29
|
+
from PIL import Image as PILImage
|
29
30
|
from pydantic import BaseModel
|
30
31
|
|
31
32
|
|
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
|
|
44
45
|
|
45
46
|
|
46
47
|
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
47
|
-
|
48
48
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
49
49
|
super().__init__(in_doc, path_or_stream)
|
50
50
|
|
@@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
326
326
|
self, doc: DoclingDocument, sheet: Worksheet
|
327
327
|
) -> DoclingDocument:
|
328
328
|
|
329
|
-
#
|
330
|
-
|
331
|
-
# Iterate over images in the sheet
|
332
|
-
for idx, image in enumerate(sheet._images): # Access embedded images
|
329
|
+
# Iterate over byte images in the sheet
|
330
|
+
for idx, image in enumerate(sheet._images): # type: ignore
|
333
331
|
|
334
|
-
|
335
|
-
|
332
|
+
try:
|
333
|
+
pil_image = PILImage.open(image.ref)
|
336
334
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
335
|
+
doc.add_picture(
|
336
|
+
parent=self.parents[0],
|
337
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
338
|
+
caption=None,
|
339
|
+
)
|
340
|
+
except:
|
341
|
+
_log.error("could not extract the image from excel sheets")
|
343
342
|
|
344
|
-
# FIXME: mypy does not agree with _charts ...
|
345
343
|
"""
|
346
|
-
for idx, chart in enumerate(sheet._charts): #
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
344
|
+
for idx, chart in enumerate(sheet._charts): # type: ignore
|
345
|
+
try:
|
346
|
+
chart_path = f"chart_{idx + 1}.png"
|
347
|
+
_log.info(
|
348
|
+
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
349
|
+
)
|
351
350
|
|
352
|
-
|
353
|
-
|
354
|
-
# Chart type
|
355
|
-
_log.info(f"Type: {type(chart).__name__}")
|
356
|
-
|
357
|
-
# Title
|
358
|
-
if chart.title:
|
359
|
-
_log.info(f"Title: {chart.title}")
|
360
|
-
else:
|
361
|
-
_log.info("No title")
|
362
|
-
|
363
|
-
# Data series
|
364
|
-
for series in chart.series:
|
365
|
-
_log.info(" => series ...")
|
366
|
-
_log.info(f"Data Series: {series.title}")
|
367
|
-
_log.info(f"Values: {series.values}")
|
368
|
-
_log.info(f"Categories: {series.categories}")
|
351
|
+
_log.info(f"Chart {idx + 1}:")
|
369
352
|
|
370
|
-
|
371
|
-
|
353
|
+
# Chart type
|
354
|
+
# _log.info(f"Type: {type(chart).__name__}")
|
355
|
+
print(f"Type: {type(chart).__name__}")
|
356
|
+
|
357
|
+
# Extract series data
|
358
|
+
for series_idx, series in enumerate(chart.series):
|
359
|
+
#_log.info(f"Series {series_idx + 1}:")
|
360
|
+
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
361
|
+
#print(f"x-values: {series.xVal}")
|
362
|
+
#print(f"y-values: {series.yVal}")
|
363
|
+
|
364
|
+
print(f"xval type: {type(series.xVal).__name__}")
|
365
|
+
|
366
|
+
xvals = []
|
367
|
+
for _ in series.xVal.numLit.pt:
|
368
|
+
print(f"xval type: {type(_).__name__}")
|
369
|
+
if hasattr(_, 'v'):
|
370
|
+
xvals.append(_.v)
|
371
|
+
|
372
|
+
print(f"x-values: {xvals}")
|
373
|
+
|
374
|
+
yvals = []
|
375
|
+
for _ in series.yVal:
|
376
|
+
if hasattr(_, 'v'):
|
377
|
+
yvals.append(_.v)
|
378
|
+
|
379
|
+
print(f"y-values: {yvals}")
|
380
|
+
|
381
|
+
except Exception as exc:
|
382
|
+
print(exc)
|
383
|
+
continue
|
372
384
|
"""
|
373
385
|
|
374
386
|
return doc
|
docling/backend/pdf_backend.py
CHANGED
@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
|
|
12
12
|
|
13
13
|
|
14
14
|
class PdfPageBackend(ABC):
|
15
|
-
|
16
15
|
@abstractmethod
|
17
16
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
18
17
|
pass
|
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
|
|
45
44
|
|
46
45
|
|
47
46
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
48
|
-
|
49
47
|
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
50
48
|
super().__init__(in_doc, path_or_stream)
|
51
49
|
|
@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
210
210
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
211
211
|
)
|
212
212
|
else:
|
213
|
-
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
213
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
214
214
|
padbox.r = page_size.width - padbox.r
|
215
215
|
padbox.t = page_size.height - padbox.t
|
216
216
|
|