docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +1 -7
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +4 -4
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +7 -7
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +7 -5
- docling/datamodel/document.py +11 -10
- docling/datamodel/pipeline_options.py +0 -1
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +1 -2
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +2 -1
- docling/models/picture_description_base_model.py +2 -3
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/pipeline/vlm_pipeline.py +0 -3
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/METADATA +2 -1
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.30.0.dist-info/RECORD +0 -86
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
docling/backend/asciidoc_backend.py
CHANGED
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
34
34
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
35
35
|
self.lines = text_stream.split("\n")
|
36
36
|
if isinstance(self.path_or_stream, Path):
|
37
|
-
with open(self.path_or_stream,
|
37
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
38
38
|
self.lines = f.readlines()
|
39
39
|
self.valid = True
|
40
40
|
|
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
75
75
|
|
76
76
|
return doc
|
77
77
|
|
78
|
-
def _parse(self, doc: DoclingDocument):
|
78
|
+
def _parse(self, doc: DoclingDocument): # noqa: C901
|
79
79
|
"""
|
80
80
|
Main function that orchestrates the parsing by yielding components:
|
81
81
|
title, section headers, text, lists, and tables.
|
82
82
|
"""
|
83
83
|
|
84
|
-
content = ""
|
85
|
-
|
86
84
|
in_list = False
|
87
85
|
in_table = False
|
88
86
|
|
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
95
93
|
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
96
94
|
indents: dict[int, Union[GroupItem, None]] = {}
|
97
95
|
|
98
|
-
for i in range(
|
96
|
+
for i in range(10):
|
99
97
|
parents[i] = None
|
100
98
|
indents[i] = None
|
101
99
|
|
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
125
123
|
|
126
124
|
# Lists
|
127
125
|
elif self._is_list_item(line):
|
128
|
-
|
129
126
|
_log.debug(f"line: {line}")
|
130
127
|
item = self._parse_list_item(line)
|
131
128
|
_log.debug(f"parsed list-item: {item}")
|
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
147
144
|
indents[level + 1] = item["indent"]
|
148
145
|
|
149
146
|
elif in_list and item["indent"] < indents[level]:
|
150
|
-
|
151
147
|
# print(item["indent"], " => ", indents[level])
|
152
148
|
while item["indent"] < indents[level]:
|
153
149
|
# print(item["indent"], " => ", indents[level])
|
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
176
172
|
elif in_table and (
|
177
173
|
(not self._is_table_line(line)) or line.strip() == "|==="
|
178
174
|
): # end of table
|
179
|
-
|
180
175
|
caption = None
|
181
176
|
if len(caption_data) > 0:
|
182
177
|
caption = doc.add_text(
|
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
195
190
|
|
196
191
|
# Picture
|
197
192
|
elif self._is_picture(line):
|
198
|
-
|
199
193
|
caption = None
|
200
194
|
if len(caption_data) > 0:
|
201
195
|
caption = doc.add_text(
|
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
250
244
|
text_data = []
|
251
245
|
|
252
246
|
elif len(line.strip()) > 0: # allow multiline texts
|
253
|
-
|
254
247
|
item = self._parse_text(line)
|
255
248
|
text_data.append(item["text"])
|
256
249
|
|
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
273
266
|
|
274
267
|
def _get_current_level(self, parents):
|
275
268
|
for k, v in parents.items():
|
276
|
-
if v
|
269
|
+
if v is None and k > 0:
|
277
270
|
return k - 1
|
278
271
|
|
279
272
|
return 0
|
280
273
|
|
281
274
|
def _get_current_parent(self, parents):
|
282
275
|
for k, v in parents.items():
|
283
|
-
if v
|
276
|
+
if v is None and k > 0:
|
284
277
|
return parents[k - 1]
|
285
278
|
|
286
279
|
return None
|
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
328
321
|
"marker": marker,
|
329
322
|
"text": text.strip(),
|
330
323
|
"numbered": False,
|
331
|
-
"indent": 0 if indent
|
324
|
+
"indent": 0 if indent is None else len(indent),
|
332
325
|
}
|
333
326
|
else:
|
334
327
|
return {
|
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
336
329
|
"marker": marker,
|
337
330
|
"text": text.strip(),
|
338
331
|
"numbered": True,
|
339
|
-
"indent": 0 if indent
|
332
|
+
"indent": 0 if indent is None else len(indent),
|
340
333
|
}
|
341
334
|
else:
|
342
335
|
# Fallback if no match
|
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
357
350
|
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
358
351
|
|
359
352
|
def _populate_table_as_grid(self, table_data):
|
360
|
-
|
361
353
|
num_rows = len(table_data)
|
362
354
|
|
363
355
|
# Adjust the table data into a grid format
|
docling/backend/csv_backend.py
CHANGED
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
|
58
58
|
head = self.content.readline()
|
59
59
|
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
60
60
|
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
61
|
-
if
|
61
|
+
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
|
62
62
|
raise RuntimeError(
|
63
63
|
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
64
64
|
)
|
docling/backend/docling_parse_backend.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
156
157
|
def get_page_image(
|
157
158
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
158
159
|
) -> Image.Image:
|
159
|
-
|
160
160
|
page_size = self.get_size()
|
161
161
|
|
162
162
|
if not cropbox:
|
docling/backend/docling_parse_v2_backend.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
172
173
|
def get_page_image(
|
173
174
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
174
175
|
) -> Image.Image:
|
175
|
-
|
176
176
|
page_size = self.get_size()
|
177
177
|
|
178
178
|
if not cropbox:
|
docling/backend/docling_parse_v4_backend.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
import logging
|
2
|
-
import
|
2
|
+
from collections.abc import Iterable
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
5
|
+
from typing import TYPE_CHECKING, Optional, Union
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
10
10
|
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
11
|
-
from PIL import Image
|
11
|
+
from PIL import Image
|
12
12
|
from pypdfium2 import PdfPage
|
13
13
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
93
93
|
def get_page_image(
|
94
94
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
95
95
|
) -> Image.Image:
|
96
|
-
|
97
96
|
page_size = self.get_size()
|
98
97
|
|
99
98
|
if not cropbox:
|
docling/backend/docx/latex/latex_dict.py
CHANGED
@@ -1,12 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
1
|
"""
|
4
2
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
5
3
|
On 23/01/2025
|
6
4
|
"""
|
7
5
|
|
8
|
-
from __future__ import unicode_literals
|
9
|
-
|
10
6
|
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
11
7
|
|
12
8
|
BLANK = ""
|
@@ -79,7 +75,6 @@ CHR_BO = {
|
|
79
75
|
}
|
80
76
|
|
81
77
|
T = {
|
82
|
-
"\u2192": "\\rightarrow ",
|
83
78
|
# Greek letters
|
84
79
|
"\U0001d6fc": "\\alpha ",
|
85
80
|
"\U0001d6fd": "\\beta ",
|
docling/backend/docx/latex/omml.py
CHANGED
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
|
|
76
76
|
return default
|
77
77
|
|
78
78
|
|
79
|
-
class Tag2Method
|
80
|
-
|
79
|
+
class Tag2Method:
|
81
80
|
def call_method(self, elm, stag=None):
|
82
81
|
getmethod = self.tag2meth.get
|
83
82
|
if stag is None:
|
@@ -130,7 +129,6 @@ class Tag2Method(object):
|
|
130
129
|
|
131
130
|
|
132
131
|
class Pr(Tag2Method):
|
133
|
-
|
134
132
|
text = ""
|
135
133
|
|
136
134
|
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
|
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
|
|
159
157
|
def do_common(self, elm):
|
160
158
|
stag = elm.tag.replace(OMML_NS, "")
|
161
159
|
if stag in self.__val_tags:
|
162
|
-
t = elm.get("{
|
160
|
+
t = elm.get(f"{OMML_NS}val")
|
163
161
|
self.__innerdict[stag] = t
|
164
162
|
return None
|
165
163
|
|
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
|
|
248
246
|
"""
|
249
247
|
the Pre-Sub-Superscript object -- Not support yet
|
250
248
|
"""
|
251
|
-
pass
|
252
249
|
|
253
250
|
def do_sub(self, elm):
|
254
251
|
text = self.process_children(elm)
|
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
|
|
331
328
|
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
332
329
|
latex_s = LIM_FUNC.get(t_dict["e"])
|
333
330
|
if not latex_s:
|
334
|
-
raise
|
331
|
+
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
|
335
332
|
else:
|
336
333
|
return latex_s.format(lim=t_dict.get("lim"))
|
337
334
|
|
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
|
|
413
410
|
"""
|
414
411
|
_str = []
|
415
412
|
_base_str = []
|
416
|
-
found_text = elm.findtext("./{
|
413
|
+
found_text = elm.findtext(f"./{OMML_NS}t")
|
417
414
|
if found_text:
|
418
415
|
for s in found_text:
|
419
416
|
out_latex_str = self.process_unicode(s)
|
docling/backend/html_backend.py
CHANGED
@@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
|
|
26
26
|
|
27
27
|
# tags that generate NodeItem elements
|
28
28
|
TAGS_FOR_NODE_ITEMS: Final = [
|
29
|
+
"address",
|
30
|
+
"details",
|
29
31
|
"h1",
|
30
32
|
"h2",
|
31
33
|
"h3",
|
@@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
38
40
|
"ul",
|
39
41
|
"ol",
|
40
42
|
"li",
|
43
|
+
"summary",
|
41
44
|
"table",
|
42
45
|
"figure",
|
43
46
|
"img",
|
@@ -55,7 +58,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
55
58
|
self.max_levels = 10
|
56
59
|
self.level = 0
|
57
60
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
58
|
-
for i in range(
|
61
|
+
for i in range(self.max_levels):
|
59
62
|
self.parents[i] = None
|
60
63
|
|
61
64
|
try:
|
@@ -126,7 +129,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
126
129
|
return doc
|
127
130
|
|
128
131
|
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
129
|
-
|
130
132
|
# Iterate over elements in the body of the document
|
131
133
|
text: str = ""
|
132
134
|
for element in tag.children:
|
@@ -135,7 +137,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
135
137
|
self.analyze_tag(cast(Tag, element), doc)
|
136
138
|
except Exception as exc_child:
|
137
139
|
_log.error(
|
138
|
-
f"Error processing child from tag {tag.name}: {
|
140
|
+
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
139
141
|
)
|
140
142
|
raise exc_child
|
141
143
|
elif isinstance(element, NavigableString) and not isinstance(
|
@@ -147,7 +149,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
147
149
|
item for item in element.next_siblings if isinstance(item, Tag)
|
148
150
|
]
|
149
151
|
if element.next_sibling is None or any(
|
150
|
-
|
152
|
+
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
|
151
153
|
):
|
152
154
|
text = text.strip()
|
153
155
|
if text and tag.name in ["div"]:
|
@@ -164,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
164
166
|
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
165
167
|
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
166
168
|
self.handle_header(tag, doc)
|
167
|
-
elif tag.name in ["p"]:
|
169
|
+
elif tag.name in ["p", "address", "summary"]:
|
168
170
|
self.handle_paragraph(tag, doc)
|
169
171
|
elif tag.name in ["pre", "code"]:
|
170
172
|
self.handle_code(tag, doc)
|
@@ -178,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
178
180
|
self.handle_figure(tag, doc)
|
179
181
|
elif tag.name == "img":
|
180
182
|
self.handle_image(tag, doc)
|
183
|
+
elif tag.name == "details":
|
184
|
+
self.handle_details(tag, doc)
|
181
185
|
else:
|
182
186
|
self.walk(tag, doc)
|
183
187
|
|
@@ -202,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
202
206
|
|
203
207
|
return ["".join(result) + " "]
|
204
208
|
|
209
|
+
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
|
210
|
+
"""Handle details tag (details) and its content."""
|
211
|
+
|
212
|
+
self.parents[self.level + 1] = doc.add_group(
|
213
|
+
name="details",
|
214
|
+
label=GroupLabel.SECTION,
|
215
|
+
parent=self.parents[self.level],
|
216
|
+
content_layer=self.content_layer,
|
217
|
+
)
|
218
|
+
|
219
|
+
self.level += 1
|
220
|
+
self.walk(element, doc)
|
221
|
+
self.parents[self.level + 1] = None
|
222
|
+
self.level -= 1
|
223
|
+
|
205
224
|
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
206
225
|
"""Handles header tags (h1, h2, etc.)."""
|
207
226
|
hlevel = int(element.name.replace("h", ""))
|
@@ -222,7 +241,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
222
241
|
)
|
223
242
|
else:
|
224
243
|
if hlevel > self.level:
|
225
|
-
|
226
244
|
# add invisible group
|
227
245
|
for i in range(self.level + 1, hlevel):
|
228
246
|
self.parents[i] = doc.add_group(
|
@@ -234,7 +252,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
234
252
|
self.level = hlevel
|
235
253
|
|
236
254
|
elif hlevel < self.level:
|
237
|
-
|
238
255
|
# remove the tail
|
239
256
|
for key in self.parents.keys():
|
240
257
|
if key > hlevel:
|
@@ -261,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
261
278
|
)
|
262
279
|
|
263
280
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
264
|
-
"""Handles paragraph tags (p)."""
|
281
|
+
"""Handles paragraph tags (p) or equivalent ones."""
|
265
282
|
if element.text is None:
|
266
283
|
return
|
267
284
|
text = element.text.strip()
|
@@ -360,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
360
377
|
marker = ""
|
361
378
|
enumerated = False
|
362
379
|
if parent_label == GroupLabel.ORDERED_LIST:
|
363
|
-
marker = f"{
|
380
|
+
marker = f"{index_in_list!s}."
|
364
381
|
enumerated = True
|
365
382
|
doc.add_list_item(
|
366
383
|
text=text,
|
docling/backend/md_backend.py
CHANGED
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
83
83
|
# otherwise they represent emphasis (bold or italic)
|
84
84
|
self.markdown = self._shorten_underscore_sequences(text_stream)
|
85
85
|
if isinstance(self.path_or_stream, Path):
|
86
|
-
with open(self.path_or_stream,
|
86
|
+
with open(self.path_or_stream, encoding="utf-8") as f:
|
87
87
|
md_content = f.read()
|
88
88
|
# remove invalid sequences
|
89
89
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
168
168
|
)
|
169
169
|
self.inline_texts = []
|
170
170
|
|
171
|
-
def _iterate_elements(
|
171
|
+
def _iterate_elements( # noqa: C901
|
172
172
|
self,
|
173
173
|
element: marko.element.Element,
|
174
174
|
depth: int,
|
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
176
176
|
visited: Set[marko.element.Element],
|
177
177
|
parent_item: Optional[NodeItem] = None,
|
178
178
|
):
|
179
|
-
|
180
179
|
if element in visited:
|
181
180
|
return
|
182
181
|
|
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
236
235
|
if has_non_empty_list_items:
|
237
236
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
238
237
|
parent_item = doc.add_group(
|
239
|
-
label=label, name=
|
238
|
+
label=label, name="list", parent=parent_item
|
240
239
|
)
|
241
240
|
|
242
241
|
elif (
|
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
320
319
|
self._html_blocks += 1
|
321
320
|
self._process_inline_text(parent_item, doc)
|
322
321
|
self._close_table(doc)
|
323
|
-
_log.debug("HTML Block: {}"
|
322
|
+
_log.debug(f"HTML Block: {element}")
|
324
323
|
if (
|
325
324
|
len(element.body) > 0
|
326
325
|
): # If Marko doesn't return any content for HTML block, skip it
|
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
332
331
|
else:
|
333
332
|
if not isinstance(element, str):
|
334
333
|
self._close_table(doc)
|
335
|
-
_log.debug("Some other element: {}"
|
334
|
+
_log.debug(f"Some other element: {element}")
|
336
335
|
|
337
336
|
processed_block_types = (
|
338
337
|
marko.block.Heading,
|
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
398
397
|
|
399
398
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
400
399
|
if self._html_blocks > 0:
|
401
|
-
|
402
400
|
# export to HTML
|
403
401
|
html_backend_cls = HTMLDocumentBackend
|
404
402
|
html_str = doc.export_to_html()
|
docling/backend/msexcel_backend.py
CHANGED
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
184
184
|
"""
|
185
185
|
|
186
186
|
if self.workbook is not None:
|
187
|
-
|
188
187
|
# Iterate over all sheets
|
189
188
|
for sheet_name in self.workbook.sheetnames:
|
190
189
|
_log.info(f"Processing sheet: {sheet_name}")
|
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
253
252
|
)
|
254
253
|
|
255
254
|
for excel_cell in excel_table.data:
|
256
|
-
|
257
255
|
cell = TableCell(
|
258
256
|
text=excel_cell.text,
|
259
257
|
row_span=excel_cell.row_span,
|
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
303
301
|
# Iterate over all cells in the sheet
|
304
302
|
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
305
303
|
for rj, cell in enumerate(row):
|
306
|
-
|
307
304
|
# Skip empty or already visited cells
|
308
305
|
if cell.value is None or (ri, rj) in visited:
|
309
306
|
continue
|
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
342
339
|
visited_cells: set[tuple[int, int]] = set()
|
343
340
|
for ri in range(start_row, max_row + 1):
|
344
341
|
for rj in range(start_col, max_col + 1):
|
345
|
-
|
346
342
|
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
347
343
|
|
348
344
|
# Check if the cell belongs to a merged range
|
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
350
346
|
col_span = 1
|
351
347
|
|
352
348
|
for merged_range in sheet.merged_cells.ranges:
|
353
|
-
|
354
349
|
if (
|
355
350
|
merged_range.min_row <= ri + 1
|
356
351
|
and ri + 1 <= merged_range.max_row
|
357
352
|
and merged_range.min_col <= rj + 1
|
358
353
|
and rj + 1 <= merged_range.max_col
|
359
354
|
):
|
360
|
-
|
361
355
|
row_span = merged_range.max_row - merged_range.min_row + 1
|
362
356
|
col_span = merged_range.max_col - merged_range.min_col + 1
|
363
357
|
break
|
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
499
493
|
),
|
500
494
|
),
|
501
495
|
)
|
502
|
-
except:
|
496
|
+
except Exception:
|
503
497
|
_log.error("could not extract the image from excel sheets")
|
504
498
|
|
505
499
|
return doc
|
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
120
|
|
121
121
|
return prov
|
122
122
|
|
123
|
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
123
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
|
124
124
|
is_a_list = False
|
125
125
|
is_list_group_created = False
|
126
126
|
enum_list_item_value = 0
|
127
127
|
new_list = None
|
128
128
|
bullet_type = "None"
|
129
|
-
list_text = ""
|
130
129
|
list_label = GroupLabel.LIST
|
131
130
|
doc_label = DocItemLabel.LIST_ITEM
|
132
131
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
243
242
|
enum_marker = str(enum_list_item_value) + "."
|
244
243
|
if not is_list_group_created:
|
245
244
|
new_list = doc.add_group(
|
246
|
-
label=list_label, name=
|
245
|
+
label=list_label, name="list", parent=parent_slide
|
247
246
|
)
|
248
247
|
is_list_group_created = True
|
249
248
|
doc.add_list_item(
|
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
368
367
|
slide_width = pptx_obj.slide_width
|
369
368
|
slide_height = pptx_obj.slide_height
|
370
369
|
|
371
|
-
text_content = [] # type: ignore
|
372
|
-
|
373
370
|
max_levels = 10
|
374
371
|
parents = {} # type: ignore
|
375
|
-
for i in range(
|
372
|
+
for i in range(max_levels):
|
376
373
|
parents[i] = None
|
377
374
|
|
378
375
|
# Loop through each slide
|
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
383
380
|
)
|
384
381
|
|
385
382
|
slide_size = Size(width=slide_width, height=slide_height)
|
386
|
-
|
383
|
+
doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
387
384
|
|
388
385
|
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
389
386
|
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
docling/backend/msword_backend.py
CHANGED
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
158
158
|
def _get_level(self) -> int:
|
159
159
|
"""Return the first None index."""
|
160
160
|
for k, v in self.parents.items():
|
161
|
-
if k >= 0 and v
|
161
|
+
if k >= 0 and v is None:
|
162
162
|
return k
|
163
163
|
return 0
|
164
164
|
|
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
418
418
|
else prev_parent
|
419
419
|
)
|
420
420
|
|
421
|
-
def _handle_text_elements(
|
421
|
+
def _handle_text_elements( # noqa: C901
|
422
422
|
self,
|
423
423
|
element: BaseOxmlElement,
|
424
424
|
docx_obj: DocxDocument,
|
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
812
812
|
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
813
813
|
)
|
814
814
|
if cell is None or cell._tc in cell_set:
|
815
|
-
_log.debug(
|
815
|
+
_log.debug(" skipped since repeated content")
|
816
816
|
col_idx += cell.grid_span
|
817
817
|
continue
|
818
818
|
else:
|
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
879
879
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
880
880
|
caption=None,
|
881
881
|
)
|
882
|
-
except (UnidentifiedImageError, OSError)
|
882
|
+
except (UnidentifiedImageError, OSError):
|
883
883
|
_log.warning("Warning: image cannot be loaded by Pillow")
|
884
884
|
doc.add_picture(
|
885
885
|
parent=self.parents[level - 1],
|
docling/backend/pdf_backend.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
+
from collections.abc import Iterable
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional, Set, Union
|
5
6
|
|
6
7
|
from docling_core.types.doc import BoundingBox, Size
|
7
8
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
docling/backend/pypdfium2_backend.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
import pypdfium2.raw as pdfium_c
|
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
29
30
|
self.valid = True # No better way to tell from pypdfium.
|
30
31
|
try:
|
31
32
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
32
|
-
except PdfiumError
|
33
|
+
except PdfiumError:
|
33
34
|
_log.info(
|
34
35
|
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
35
36
|
exc_info=True,
|
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
225
226
|
def get_page_image(
|
226
227
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
227
228
|
) -> Image.Image:
|
228
|
-
|
229
229
|
page_size = self.get_size()
|
230
230
|
|
231
231
|
if not cropbox:
|