docling 2.28.4__py3-none-any.whl → 2.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docx/latex/latex_dict.py +3 -0
- docling/backend/docx/latex/omml.py +14 -14
- docling/backend/html_backend.py +2 -1
- docling/backend/mspowerpoint_backend.py +4 -3
- docling/backend/msword_backend.py +300 -106
- docling/cli/main.py +50 -0
- docling/models/tesseract_ocr_cli_model.py +1 -1
- {docling-2.28.4.dist-info → docling-2.29.0.dist-info}/METADATA +1 -1
- {docling-2.28.4.dist-info → docling-2.29.0.dist-info}/RECORD +12 -12
- {docling-2.28.4.dist-info → docling-2.29.0.dist-info}/LICENSE +0 -0
- {docling-2.28.4.dist-info → docling-2.29.0.dist-info}/WHEEL +0 -0
- {docling-2.28.4.dist-info → docling-2.29.0.dist-info}/entry_points.txt +0 -0
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
|
5
5
|
On 23/01/2025
|
6
6
|
"""
|
7
7
|
|
8
|
+
import logging
|
9
|
+
|
8
10
|
import lxml.etree as ET
|
9
11
|
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
10
12
|
|
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
|
|
39
41
|
|
40
42
|
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
41
43
|
|
44
|
+
_log = logging.getLogger(__name__)
|
45
|
+
|
42
46
|
|
43
47
|
def load(stream):
|
44
48
|
tree = ET.parse(stream)
|
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
|
|
281
285
|
if FUNC.get(t):
|
282
286
|
latex_chars.append(FUNC[t])
|
283
287
|
else:
|
284
|
-
|
285
|
-
|
288
|
+
_log.warning("Function not supported, will default to text: %s", t)
|
289
|
+
if isinstance(t, str):
|
290
|
+
latex_chars.append(t)
|
291
|
+
elif isinstance(t, str):
|
286
292
|
latex_chars.append(t)
|
287
293
|
t = BLANK.join(latex_chars)
|
288
294
|
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
|
|
382
388
|
|
383
389
|
out_latex_str = self.u.unicode_to_latex(s)
|
384
390
|
|
385
|
-
# print(s, out_latex_str)
|
386
|
-
|
387
391
|
if (
|
388
392
|
s.startswith("{") is False
|
389
393
|
and out_latex_str.startswith("{")
|
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
|
|
392
396
|
):
|
393
397
|
out_latex_str = f" {out_latex_str[1:-1]} "
|
394
398
|
|
395
|
-
# print(s, out_latex_str)
|
396
|
-
|
397
399
|
if "ensuremath" in out_latex_str:
|
398
400
|
out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
|
399
401
|
out_latex_str = out_latex_str.replace("}", " ")
|
400
402
|
|
401
|
-
# print(s, out_latex_str)
|
402
|
-
|
403
403
|
if out_latex_str.strip().startswith("\\text"):
|
404
404
|
out_latex_str = f" \\text{{{out_latex_str}}} "
|
405
405
|
|
406
|
-
# print(s, out_latex_str)
|
407
|
-
|
408
406
|
return out_latex_str
|
409
407
|
|
410
408
|
def do_r(self, elm):
|
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
|
|
415
413
|
"""
|
416
414
|
_str = []
|
417
415
|
_base_str = []
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
416
|
+
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
417
|
+
if found_text:
|
418
|
+
for s in found_text:
|
419
|
+
out_latex_str = self.process_unicode(s)
|
420
|
+
_str.append(out_latex_str)
|
421
|
+
_base_str.append(s)
|
422
422
|
|
423
423
|
proc_str = escape_latex(BLANK.join(_str))
|
424
424
|
base_proc_str = BLANK.join(_base_str)
|
docling/backend/html_backend.py
CHANGED
@@ -34,6 +34,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
34
34
|
"h6",
|
35
35
|
"p",
|
36
36
|
"pre",
|
37
|
+
"code",
|
37
38
|
"ul",
|
38
39
|
"ol",
|
39
40
|
"li",
|
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
165
166
|
self.handle_header(tag, doc)
|
166
167
|
elif tag.name in ["p"]:
|
167
168
|
self.handle_paragraph(tag, doc)
|
168
|
-
elif tag.name in ["pre"]:
|
169
|
+
elif tag.name in ["pre", "code"]:
|
169
170
|
self.handle_code(tag, doc)
|
170
171
|
elif tag.name in ["ul", "ol"]:
|
171
172
|
self.handle_list(tag, doc)
|
@@ -392,9 +392,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
392
392
|
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
393
393
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
394
394
|
# Handle Pictures
|
395
|
-
|
396
|
-
|
397
|
-
|
395
|
+
if hasattr(shape, "image"):
|
396
|
+
self.handle_pictures(
|
397
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
398
|
+
)
|
398
399
|
# If shape doesn't have any text, move on to the next shape
|
399
400
|
if not hasattr(shape, "text"):
|
400
401
|
return
|
@@ -14,15 +14,19 @@ from docling_core.types.doc import (
|
|
14
14
|
TableCell,
|
15
15
|
TableData,
|
16
16
|
)
|
17
|
+
from docling_core.types.doc.document import Formatting
|
17
18
|
from docx import Document
|
18
19
|
from docx.document import Document as DocxDocument
|
19
20
|
from docx.oxml.table import CT_Tc
|
20
21
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
21
22
|
from docx.table import Table, _Cell
|
23
|
+
from docx.text.hyperlink import Hyperlink
|
22
24
|
from docx.text.paragraph import Paragraph
|
25
|
+
from docx.text.run import Run
|
23
26
|
from lxml import etree
|
24
27
|
from lxml.etree import XPath
|
25
28
|
from PIL import Image, UnidentifiedImageError
|
29
|
+
from pydantic import AnyUrl
|
26
30
|
from typing_extensions import override
|
27
31
|
|
28
32
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -54,6 +58,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
54
58
|
self.level_at_new_list: Optional[int] = None
|
55
59
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
56
60
|
self.numbered_headers: dict[int, int] = {}
|
61
|
+
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
57
62
|
for i in range(-1, self.max_levels):
|
58
63
|
self.parents[i] = None
|
59
64
|
|
@@ -118,14 +123,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
118
123
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
119
124
|
if self.is_valid():
|
120
125
|
assert self.docx_obj is not None
|
121
|
-
doc = self.
|
126
|
+
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
122
127
|
return doc
|
123
128
|
else:
|
124
129
|
raise RuntimeError(
|
125
130
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
126
131
|
)
|
127
132
|
|
128
|
-
def
|
133
|
+
def _update_history(
|
129
134
|
self,
|
130
135
|
name: str,
|
131
136
|
level: Optional[int],
|
@@ -138,26 +143,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
138
143
|
self.history["numids"].append(numid)
|
139
144
|
self.history["indents"].append(ilevel)
|
140
145
|
|
141
|
-
def
|
146
|
+
def _prev_name(self) -> Optional[str]:
|
142
147
|
return self.history["names"][-1]
|
143
148
|
|
144
|
-
def
|
149
|
+
def _prev_level(self) -> Optional[int]:
|
145
150
|
return self.history["levels"][-1]
|
146
151
|
|
147
|
-
def
|
152
|
+
def _prev_numid(self) -> Optional[int]:
|
148
153
|
return self.history["numids"][-1]
|
149
154
|
|
150
|
-
def
|
155
|
+
def _prev_indent(self) -> Optional[int]:
|
151
156
|
return self.history["indents"][-1]
|
152
157
|
|
153
|
-
def
|
158
|
+
def _get_level(self) -> int:
|
154
159
|
"""Return the first None index."""
|
155
160
|
for k, v in self.parents.items():
|
156
161
|
if k >= 0 and v == None:
|
157
162
|
return k
|
158
163
|
return 0
|
159
164
|
|
160
|
-
def
|
165
|
+
def _walk_linear(
|
161
166
|
self,
|
162
167
|
body: BaseOxmlElement,
|
163
168
|
docx_obj: DocxDocument,
|
@@ -177,12 +182,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
177
182
|
# Check for Tables
|
178
183
|
if element.tag.endswith("tbl"):
|
179
184
|
try:
|
180
|
-
self.
|
185
|
+
self._handle_tables(element, docx_obj, doc)
|
181
186
|
except Exception:
|
182
187
|
_log.debug("could not parse a table, broken docx table")
|
183
188
|
|
184
189
|
elif drawing_blip:
|
185
|
-
self.
|
190
|
+
self._handle_pictures(docx_obj, drawing_blip, doc)
|
186
191
|
# Check for the sdt containers, like table of contents
|
187
192
|
elif tag_name in ["sdt"]:
|
188
193
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
@@ -190,16 +195,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
190
195
|
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
191
196
|
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
192
197
|
for p in paragraphs:
|
193
|
-
self.
|
198
|
+
self._handle_text_elements(p, docx_obj, doc)
|
194
199
|
# Check for Text
|
195
200
|
elif tag_name in ["p"]:
|
196
201
|
# "tcPr", "sectPr"
|
197
|
-
self.
|
202
|
+
self._handle_text_elements(element, docx_obj, doc)
|
198
203
|
else:
|
199
204
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
200
205
|
return doc
|
201
206
|
|
202
|
-
def
|
207
|
+
def _str_to_int(
|
208
|
+
self, s: Optional[str], default: Optional[int] = 0
|
209
|
+
) -> Optional[int]:
|
203
210
|
if s is None:
|
204
211
|
return None
|
205
212
|
try:
|
@@ -207,7 +214,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
207
214
|
except ValueError:
|
208
215
|
return default
|
209
216
|
|
210
|
-
def
|
217
|
+
def _split_text_and_number(self, input_string: str) -> list[str]:
|
211
218
|
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
212
219
|
if match:
|
213
220
|
parts = list(filter(None, match.groups()))
|
@@ -215,7 +222,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
215
222
|
else:
|
216
223
|
return [input_string]
|
217
224
|
|
218
|
-
def
|
225
|
+
def _get_numId_and_ilvl(
|
219
226
|
self, paragraph: Paragraph
|
220
227
|
) -> tuple[Optional[int], Optional[int]]:
|
221
228
|
# Access the XML element of the paragraph
|
@@ -230,60 +237,188 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
230
237
|
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
231
238
|
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
232
239
|
|
233
|
-
return self.
|
240
|
+
return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
|
234
241
|
|
235
242
|
return None, None # If the paragraph is not part of a list
|
236
243
|
|
237
|
-
def
|
244
|
+
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
245
|
+
parts = self._split_text_and_number(style_label)
|
246
|
+
|
247
|
+
if len(parts) == 2:
|
248
|
+
parts.sort()
|
249
|
+
label_str: str = ""
|
250
|
+
label_level: Optional[int] = 0
|
251
|
+
if parts[0].strip().lower() == "heading":
|
252
|
+
label_str = "Heading"
|
253
|
+
label_level = self._str_to_int(parts[1], None)
|
254
|
+
if parts[1].strip().lower() == "heading":
|
255
|
+
label_str = "Heading"
|
256
|
+
label_level = self._str_to_int(parts[0], None)
|
257
|
+
return label_str, label_level
|
258
|
+
|
259
|
+
return style_label, None
|
260
|
+
|
261
|
+
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
238
262
|
if paragraph.style is None:
|
239
263
|
return "Normal", None
|
264
|
+
|
240
265
|
label = paragraph.style.style_id
|
266
|
+
name = paragraph.style.name
|
267
|
+
base_style_label = None
|
268
|
+
base_style_name = None
|
269
|
+
if base_style := getattr(paragraph.style, "base_style", None):
|
270
|
+
base_style_label = base_style.style_id
|
271
|
+
base_style_name = base_style.name
|
272
|
+
|
241
273
|
if label is None:
|
242
274
|
return "Normal", None
|
275
|
+
|
243
276
|
if ":" in label:
|
244
277
|
parts = label.split(":")
|
245
|
-
|
246
278
|
if len(parts) == 2:
|
247
|
-
return parts[0], self.
|
279
|
+
return parts[0], self._str_to_int(parts[1], None)
|
248
280
|
|
249
|
-
|
281
|
+
if "heading" in label.lower():
|
282
|
+
return self._get_heading_and_level(label)
|
283
|
+
if "heading" in name.lower():
|
284
|
+
return self._get_heading_and_level(name)
|
285
|
+
if base_style_label and "heading" in base_style_label.lower():
|
286
|
+
return self._get_heading_and_level(base_style_label)
|
287
|
+
if base_style_name and "heading" in base_style_name.lower():
|
288
|
+
return self._get_heading_and_level(base_style_name)
|
250
289
|
|
251
|
-
|
252
|
-
parts.sort()
|
253
|
-
label_str: str = ""
|
254
|
-
label_level: Optional[int] = 0
|
255
|
-
if parts[0] == "Heading":
|
256
|
-
label_str = parts[0]
|
257
|
-
label_level = self.str_to_int(parts[1], None)
|
258
|
-
if parts[1] == "Heading":
|
259
|
-
label_str = parts[1]
|
260
|
-
label_level = self.str_to_int(parts[0], None)
|
261
|
-
return label_str, label_level
|
262
|
-
else:
|
263
|
-
return label, None
|
290
|
+
return label, None
|
264
291
|
|
265
|
-
|
292
|
+
@classmethod
|
293
|
+
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
294
|
+
has_any_formatting = run.bold or run.italic or run.underline
|
295
|
+
return (
|
296
|
+
Formatting(
|
297
|
+
bold=run.bold or False,
|
298
|
+
italic=run.italic or False,
|
299
|
+
underline=run.underline or False,
|
300
|
+
)
|
301
|
+
if has_any_formatting
|
302
|
+
else None
|
303
|
+
)
|
304
|
+
|
305
|
+
def _get_paragraph_elements(self, paragraph: Paragraph):
|
306
|
+
"""
|
307
|
+
Extract paragraph elements along with their formatting and hyperlink
|
308
|
+
"""
|
309
|
+
|
310
|
+
# for now retain empty paragraphs for backwards compatibility:
|
311
|
+
if paragraph.text.strip() == "":
|
312
|
+
return [("", None, None)]
|
313
|
+
|
314
|
+
paragraph_elements: list[
|
315
|
+
tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
|
316
|
+
] = []
|
317
|
+
group_text = ""
|
318
|
+
previous_format = None
|
319
|
+
|
320
|
+
# Iterate over the runs of the paragraph and group them by format
|
321
|
+
for c in paragraph.iter_inner_content():
|
322
|
+
if isinstance(c, Hyperlink):
|
323
|
+
text = c.text
|
324
|
+
hyperlink = Path(c.address)
|
325
|
+
format = self._get_format_from_run(c.runs[0])
|
326
|
+
elif isinstance(c, Run):
|
327
|
+
text = c.text
|
328
|
+
hyperlink = None
|
329
|
+
format = self._get_format_from_run(c)
|
330
|
+
else:
|
331
|
+
continue
|
332
|
+
|
333
|
+
if (len(text.strip()) and format != previous_format) or (
|
334
|
+
hyperlink is not None
|
335
|
+
):
|
336
|
+
# If the style changes for a non empty text, add the previous group
|
337
|
+
if len(group_text.strip()) > 0:
|
338
|
+
paragraph_elements.append(
|
339
|
+
(group_text.strip(), previous_format, None)
|
340
|
+
)
|
341
|
+
group_text = ""
|
342
|
+
|
343
|
+
# If there is a hyperlink, add it immediately
|
344
|
+
if hyperlink is not None:
|
345
|
+
paragraph_elements.append((text.strip(), format, hyperlink))
|
346
|
+
text = ""
|
347
|
+
else:
|
348
|
+
previous_format = format
|
349
|
+
|
350
|
+
group_text += text
|
351
|
+
|
352
|
+
# Format the last group
|
353
|
+
if len(group_text.strip()) > 0:
|
354
|
+
paragraph_elements.append((group_text.strip(), format, None))
|
355
|
+
|
356
|
+
return paragraph_elements
|
357
|
+
|
358
|
+
def _handle_equations_in_text(self, element, text):
|
266
359
|
only_texts = []
|
267
360
|
only_equations = []
|
268
361
|
texts_and_equations = []
|
269
362
|
for subt in element.iter():
|
270
363
|
tag_name = etree.QName(subt).localname
|
271
364
|
if tag_name == "t" and "math" not in subt.tag:
|
272
|
-
|
273
|
-
|
365
|
+
if isinstance(subt.text, str):
|
366
|
+
only_texts.append(subt.text)
|
367
|
+
texts_and_equations.append(subt.text)
|
274
368
|
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
275
|
-
latex_equation = str(oMath2Latex(subt))
|
276
|
-
|
277
|
-
|
369
|
+
latex_equation = str(oMath2Latex(subt)).strip()
|
370
|
+
if len(latex_equation) > 0:
|
371
|
+
only_equations.append(
|
372
|
+
self.equation_bookends.format(EQ=latex_equation)
|
373
|
+
)
|
374
|
+
texts_and_equations.append(
|
375
|
+
self.equation_bookends.format(EQ=latex_equation)
|
376
|
+
)
|
278
377
|
|
279
|
-
if
|
378
|
+
if len(only_equations) < 1:
|
379
|
+
return text, []
|
380
|
+
|
381
|
+
if (
|
382
|
+
re.sub(r"\s+", "", "".join(only_texts)).strip()
|
383
|
+
!= re.sub(r"\s+", "", text).strip()
|
384
|
+
):
|
280
385
|
# If we are not able to reconstruct the initial raw text
|
281
386
|
# do not try to parse equations and return the original
|
282
387
|
return text, []
|
283
388
|
|
284
|
-
|
389
|
+
# Insert equations into original text
|
390
|
+
# This is done to preserve white space structure
|
391
|
+
output_text = text[:]
|
392
|
+
init_i = 0
|
393
|
+
for i_substr, substr in enumerate(texts_and_equations):
|
394
|
+
if len(substr) == 0:
|
395
|
+
continue
|
285
396
|
|
286
|
-
|
397
|
+
if substr in output_text[init_i:]:
|
398
|
+
init_i += output_text[init_i:].find(substr) + len(substr)
|
399
|
+
else:
|
400
|
+
if i_substr > 0:
|
401
|
+
output_text = output_text[:init_i] + substr + output_text[init_i:]
|
402
|
+
init_i += len(substr)
|
403
|
+
else:
|
404
|
+
output_text = substr + output_text
|
405
|
+
|
406
|
+
return output_text, only_equations
|
407
|
+
|
408
|
+
def _create_or_reuse_parent(
|
409
|
+
self,
|
410
|
+
*,
|
411
|
+
doc: DoclingDocument,
|
412
|
+
prev_parent: Optional[NodeItem],
|
413
|
+
paragraph_elements: list,
|
414
|
+
) -> Optional[NodeItem]:
|
415
|
+
return (
|
416
|
+
doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
|
417
|
+
if len(paragraph_elements) > 1
|
418
|
+
else prev_parent
|
419
|
+
)
|
420
|
+
|
421
|
+
def _handle_text_elements(
|
287
422
|
self,
|
288
423
|
element: BaseOxmlElement,
|
289
424
|
docx_obj: DocxDocument,
|
@@ -292,10 +427,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
292
427
|
paragraph = Paragraph(element, docx_obj)
|
293
428
|
|
294
429
|
raw_text = paragraph.text
|
295
|
-
text, equations = self.
|
430
|
+
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
296
431
|
|
297
432
|
if text is None:
|
298
433
|
return
|
434
|
+
paragraph_elements = self._get_paragraph_elements(paragraph)
|
299
435
|
text = text.strip()
|
300
436
|
|
301
437
|
# Common styles for bullet and numbered lists.
|
@@ -303,8 +439,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
303
439
|
# Identify wether list is a numbered list or not
|
304
440
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
305
441
|
is_numbered = False
|
306
|
-
p_style_id, p_level = self.
|
307
|
-
numid, ilevel = self.
|
442
|
+
p_style_id, p_level = self._get_label_and_level(paragraph)
|
443
|
+
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
308
444
|
|
309
445
|
if numid == 0:
|
310
446
|
numid = None
|
@@ -315,18 +451,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
315
451
|
and ilevel is not None
|
316
452
|
and p_style_id not in ["Title", "Heading"]
|
317
453
|
):
|
318
|
-
self.
|
319
|
-
doc,
|
320
|
-
numid,
|
321
|
-
ilevel,
|
322
|
-
|
323
|
-
is_numbered,
|
454
|
+
self._add_list_item(
|
455
|
+
doc=doc,
|
456
|
+
numid=numid,
|
457
|
+
ilevel=ilevel,
|
458
|
+
elements=paragraph_elements,
|
459
|
+
is_numbered=is_numbered,
|
324
460
|
)
|
325
|
-
self.
|
461
|
+
self._update_history(p_style_id, p_level, numid, ilevel)
|
326
462
|
return
|
327
463
|
elif (
|
328
464
|
numid is None
|
329
|
-
and self.
|
465
|
+
and self._prev_numid() is not None
|
330
466
|
and p_style_id not in ["Title", "Heading"]
|
331
467
|
): # Close list
|
332
468
|
if self.level_at_new_list:
|
@@ -348,26 +484,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
348
484
|
)
|
349
485
|
elif "Heading" in p_style_id:
|
350
486
|
style_element = getattr(paragraph.style, "element", None)
|
351
|
-
if style_element:
|
487
|
+
if style_element is not None:
|
352
488
|
is_numbered_style = (
|
353
489
|
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
354
490
|
)
|
355
491
|
else:
|
356
492
|
is_numbered_style = False
|
357
|
-
self.
|
493
|
+
self._add_header(doc, p_level, text, is_numbered_style)
|
358
494
|
|
359
495
|
elif len(equations) > 0:
|
360
|
-
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
496
|
+
if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
|
361
497
|
# Standalone equation
|
362
|
-
level = self.
|
498
|
+
level = self._get_level()
|
363
499
|
doc.add_text(
|
364
500
|
label=DocItemLabel.FORMULA,
|
365
501
|
parent=self.parents[level - 1],
|
366
|
-
text=text,
|
502
|
+
text=text.replace("<eq>", "").replace("</eq>", ""),
|
367
503
|
)
|
368
504
|
else:
|
369
505
|
# Inline equation
|
370
|
-
level = self.
|
506
|
+
level = self._get_level()
|
371
507
|
inline_equation = doc.add_group(
|
372
508
|
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
373
509
|
)
|
@@ -376,8 +512,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
376
512
|
if len(text_tmp) == 0:
|
377
513
|
break
|
378
514
|
|
379
|
-
|
380
|
-
|
515
|
+
split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
|
516
|
+
|
517
|
+
pre_eq_text = split_text_tmp[0]
|
518
|
+
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
|
519
|
+
|
381
520
|
if len(pre_eq_text) > 0:
|
382
521
|
doc.add_text(
|
383
522
|
label=DocItemLabel.PARAGRAPH,
|
@@ -387,13 +526,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
387
526
|
doc.add_text(
|
388
527
|
label=DocItemLabel.FORMULA,
|
389
528
|
parent=inline_equation,
|
390
|
-
text=eq,
|
529
|
+
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
391
530
|
)
|
531
|
+
|
392
532
|
if len(text_tmp) > 0:
|
393
533
|
doc.add_text(
|
394
534
|
label=DocItemLabel.PARAGRAPH,
|
395
535
|
parent=inline_equation,
|
396
|
-
text=text_tmp,
|
536
|
+
text=text_tmp.strip(),
|
397
537
|
)
|
398
538
|
|
399
539
|
elif p_style_id in [
|
@@ -406,30 +546,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
406
546
|
"ListBullet",
|
407
547
|
"Quote",
|
408
548
|
]:
|
409
|
-
level = self.
|
410
|
-
|
411
|
-
|
549
|
+
level = self._get_level()
|
550
|
+
parent = self._create_or_reuse_parent(
|
551
|
+
doc=doc,
|
552
|
+
prev_parent=self.parents.get(level - 1),
|
553
|
+
paragraph_elements=paragraph_elements,
|
412
554
|
)
|
555
|
+
for text, format, hyperlink in paragraph_elements:
|
556
|
+
doc.add_text(
|
557
|
+
label=DocItemLabel.PARAGRAPH,
|
558
|
+
parent=parent,
|
559
|
+
text=text,
|
560
|
+
formatting=format,
|
561
|
+
hyperlink=hyperlink,
|
562
|
+
)
|
413
563
|
|
414
564
|
else:
|
415
565
|
# Text style names can, and will have, not only default values but user values too
|
416
566
|
# hence we treat all other labels as pure text
|
417
|
-
level = self.
|
418
|
-
|
419
|
-
|
567
|
+
level = self._get_level()
|
568
|
+
parent = self._create_or_reuse_parent(
|
569
|
+
doc=doc,
|
570
|
+
prev_parent=self.parents.get(level - 1),
|
571
|
+
paragraph_elements=paragraph_elements,
|
420
572
|
)
|
573
|
+
for text, format, hyperlink in paragraph_elements:
|
574
|
+
doc.add_text(
|
575
|
+
label=DocItemLabel.PARAGRAPH,
|
576
|
+
parent=parent,
|
577
|
+
text=text,
|
578
|
+
formatting=format,
|
579
|
+
hyperlink=hyperlink,
|
580
|
+
)
|
421
581
|
|
422
|
-
self.
|
582
|
+
self._update_history(p_style_id, p_level, numid, ilevel)
|
423
583
|
return
|
424
584
|
|
425
|
-
def
|
585
|
+
def _add_header(
|
426
586
|
self,
|
427
587
|
doc: DoclingDocument,
|
428
588
|
curr_level: Optional[int],
|
429
589
|
text: str,
|
430
590
|
is_numbered_style: bool = False,
|
431
591
|
) -> None:
|
432
|
-
level = self.
|
592
|
+
level = self._get_level()
|
433
593
|
if isinstance(curr_level, int):
|
434
594
|
if curr_level > level:
|
435
595
|
# add invisible group
|
@@ -485,19 +645,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
485
645
|
)
|
486
646
|
return
|
487
647
|
|
488
|
-
def
|
648
|
+
def _add_list_item(
|
489
649
|
self,
|
650
|
+
*,
|
490
651
|
doc: DoclingDocument,
|
491
652
|
numid: int,
|
492
653
|
ilevel: int,
|
493
|
-
|
654
|
+
elements: list,
|
494
655
|
is_numbered: bool = False,
|
495
656
|
) -> None:
|
496
657
|
enum_marker = ""
|
497
658
|
|
498
|
-
level = self.
|
499
|
-
prev_indent = self.
|
500
|
-
if self.
|
659
|
+
level = self._get_level()
|
660
|
+
prev_indent = self._prev_indent()
|
661
|
+
if self._prev_numid() is None: # Open new list
|
501
662
|
self.level_at_new_list = level
|
502
663
|
|
503
664
|
self.parents[level] = doc.add_group(
|
@@ -509,15 +670,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
509
670
|
if is_numbered:
|
510
671
|
enum_marker = str(self.listIter) + "."
|
511
672
|
is_numbered = True
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
text=text,
|
673
|
+
new_parent = self._create_or_reuse_parent(
|
674
|
+
doc=doc,
|
675
|
+
prev_parent=self.parents[level],
|
676
|
+
paragraph_elements=elements,
|
517
677
|
)
|
678
|
+
for text, format, hyperlink in elements:
|
679
|
+
doc.add_list_item(
|
680
|
+
marker=enum_marker,
|
681
|
+
enumerated=is_numbered,
|
682
|
+
parent=new_parent,
|
683
|
+
text=text,
|
684
|
+
formatting=format,
|
685
|
+
hyperlink=hyperlink,
|
686
|
+
)
|
518
687
|
|
519
688
|
elif (
|
520
|
-
self.
|
689
|
+
self._prev_numid() == numid
|
521
690
|
and self.level_at_new_list is not None
|
522
691
|
and prev_indent is not None
|
523
692
|
and prev_indent < ilevel
|
@@ -545,15 +714,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
545
714
|
if is_numbered:
|
546
715
|
enum_marker = str(self.listIter) + "."
|
547
716
|
is_numbered = True
|
548
|
-
doc.add_list_item(
|
549
|
-
marker=enum_marker,
|
550
|
-
enumerated=is_numbered,
|
551
|
-
parent=self.parents[self.level_at_new_list + ilevel],
|
552
|
-
text=text,
|
553
|
-
)
|
554
717
|
|
718
|
+
new_parent = self._create_or_reuse_parent(
|
719
|
+
doc=doc,
|
720
|
+
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
721
|
+
paragraph_elements=elements,
|
722
|
+
)
|
723
|
+
for text, format, hyperlink in elements:
|
724
|
+
doc.add_list_item(
|
725
|
+
marker=enum_marker,
|
726
|
+
enumerated=is_numbered,
|
727
|
+
parent=new_parent,
|
728
|
+
text=text,
|
729
|
+
formatting=format,
|
730
|
+
hyperlink=hyperlink,
|
731
|
+
)
|
555
732
|
elif (
|
556
|
-
self.
|
733
|
+
self._prev_numid() == numid
|
557
734
|
and self.level_at_new_list is not None
|
558
735
|
and prev_indent is not None
|
559
736
|
and ilevel < prev_indent
|
@@ -567,29 +744,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
567
744
|
if is_numbered:
|
568
745
|
enum_marker = str(self.listIter) + "."
|
569
746
|
is_numbered = True
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
text=text,
|
747
|
+
new_parent = self._create_or_reuse_parent(
|
748
|
+
doc=doc,
|
749
|
+
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
750
|
+
paragraph_elements=elements,
|
575
751
|
)
|
752
|
+
for text, format, hyperlink in elements:
|
753
|
+
doc.add_list_item(
|
754
|
+
marker=enum_marker,
|
755
|
+
enumerated=is_numbered,
|
756
|
+
parent=new_parent,
|
757
|
+
text=text,
|
758
|
+
formatting=format,
|
759
|
+
hyperlink=hyperlink,
|
760
|
+
)
|
576
761
|
self.listIter = 0
|
577
762
|
|
578
|
-
elif self.
|
763
|
+
elif self._prev_numid() == numid or prev_indent == ilevel:
|
579
764
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
580
765
|
self.listIter += 1
|
581
766
|
if is_numbered:
|
582
767
|
enum_marker = str(self.listIter) + "."
|
583
768
|
is_numbered = True
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
text=text,
|
769
|
+
new_parent = self._create_or_reuse_parent(
|
770
|
+
doc=doc,
|
771
|
+
prev_parent=self.parents[level - 1],
|
772
|
+
paragraph_elements=elements,
|
589
773
|
)
|
774
|
+
for text, format, hyperlink in elements:
|
775
|
+
# Add the list item to the parent group
|
776
|
+
doc.add_list_item(
|
777
|
+
marker=enum_marker,
|
778
|
+
enumerated=is_numbered,
|
779
|
+
parent=new_parent,
|
780
|
+
text=text,
|
781
|
+
formatting=format,
|
782
|
+
hyperlink=hyperlink,
|
783
|
+
)
|
590
784
|
return
|
591
785
|
|
592
|
-
def
|
786
|
+
def _handle_tables(
|
593
787
|
self,
|
594
788
|
element: BaseOxmlElement,
|
595
789
|
docx_obj: DocxDocument,
|
@@ -604,7 +798,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
604
798
|
cell_element = table.rows[0].cells[0]
|
605
799
|
# In case we have a table of only 1 cell, we consider it furniture
|
606
800
|
# And proceed processing the content of the cell as though it's in the document body
|
607
|
-
self.
|
801
|
+
self._walk_linear(cell_element._element, docx_obj, doc)
|
608
802
|
return
|
609
803
|
|
610
804
|
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
@@ -649,11 +843,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
649
843
|
data.table_cells.append(table_cell)
|
650
844
|
col_idx += cell.grid_span
|
651
845
|
|
652
|
-
level = self.
|
846
|
+
level = self._get_level()
|
653
847
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
654
848
|
return
|
655
849
|
|
656
|
-
def
|
850
|
+
def _handle_pictures(
|
657
851
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
658
852
|
) -> None:
|
659
853
|
def get_docx_image(drawing_blip):
|
@@ -666,7 +860,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
666
860
|
image_data = image_part.blob # Get the binary image data
|
667
861
|
return image_data
|
668
862
|
|
669
|
-
level = self.
|
863
|
+
level = self._get_level()
|
670
864
|
# Open the BytesIO object with PIL to create an Image
|
671
865
|
try:
|
672
866
|
image_data = get_docx_image(drawing_blip)
|
docling/cli/main.py
CHANGED
@@ -60,6 +60,44 @@ err_console = Console(stderr=True)
|
|
60
60
|
ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
|
61
61
|
ocr_engines_enum_internal = ocr_factory_internal.get_enum()
|
62
62
|
|
63
|
+
DOCLING_ASCII_ART = r"""
|
64
|
+
████ ██████
|
65
|
+
███░░██░░░░░██████
|
66
|
+
████████░░░░░░░░████████████
|
67
|
+
████████░░░░░░░░░░░░░░░░░░████████
|
68
|
+
██████░░░░░░░░░░░░░░░░░░░░░░░░░░██████
|
69
|
+
██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
|
70
|
+
██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
|
71
|
+
██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
|
72
|
+
██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
|
73
|
+
██████░░░░░░░ ░░░░░░░░░░░░░░░░░░░░░░ ░░░░░░░██████
|
74
|
+
██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
|
75
|
+
██████░░░░░░ ░░░░░░░░░░░░░░░ ░░░░░░██████
|
76
|
+
███▒██░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒███
|
77
|
+
███▒██░░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒████
|
78
|
+
███▒██░░░░░░ ██ ██ ░░░░░░░░░░░░ ██ ██ ░░░░░██▒▒███
|
79
|
+
███▒███░░░░░ ██ ░░░░████░░░░ ██ ░░░░░██▒▒███
|
80
|
+
████▒▒██░░░░░░ ░░░███▒▒▒▒███░░░ ░░░░░░░██▒▒████
|
81
|
+
████▒▒██░░░░░░░░░░░░░░░░░█▒▒▒▒▒▒▒▒▒▒█░░░░░░░░░░░░░░░░███▒▒████
|
82
|
+
████▒▒▒██░░░░░░░░░░░░█████ ▒▒▒▒▒▒ ██████░░░░░░░░░░░██▒▒▒████
|
83
|
+
███▒▒▒▒██░░░░░░░░███▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒███░░░░░░░░██▒▒▒▒███
|
84
|
+
███▒▒▒▒▒███░░░░░░██▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒██░░░░░░███▒▒▒▒▒███
|
85
|
+
████▒▒▒▒▒████░░░░░░██████████████████████░░░░░░████▒▒▒▒▒████
|
86
|
+
███▒▒▒▒▒▒▒▒████░░░░░░░░░░░░░░░░░░░░░░░░░░░████▒▒▒▒▒▒▒▒▒███
|
87
|
+
████▒▒▒▒▒▒▒▒███░░░░░████████████████████████▒▒▒▒▒▒▒▒▒████
|
88
|
+
████▒▒▒▒▒▒██░░░░░░█ █░░░░░██▒▒▒▒▒▒████
|
89
|
+
████▒▒▒▒█░░░░░░░█ D O C L I N G █░░░░░░░░██▒▒▒████
|
90
|
+
████▒▒██░░░░░░█ █░░░░░░░░░░█▒▒████
|
91
|
+
██████░░░░░░█ D O C L I N G █░░░░░░░░░░░██████
|
92
|
+
████░░░░░█ █░░░░░░░░░░░░████
|
93
|
+
█████░░█ D O C L I N G █░░░░░░░░░░░█████
|
94
|
+
█████ █░░░░░░░░████████
|
95
|
+
██ D O C L I N G █░░░░░░░░█████
|
96
|
+
█ █░░░████████
|
97
|
+
█████████████████████████████
|
98
|
+
"""
|
99
|
+
|
100
|
+
|
63
101
|
app = typer.Typer(
|
64
102
|
name="Docling",
|
65
103
|
no_args_is_help=True,
|
@@ -68,6 +106,12 @@ app = typer.Typer(
|
|
68
106
|
)
|
69
107
|
|
70
108
|
|
109
|
+
def logo_callback(value: bool):
|
110
|
+
if value:
|
111
|
+
print(DOCLING_ASCII_ART)
|
112
|
+
raise typer.Exit()
|
113
|
+
|
114
|
+
|
71
115
|
def version_callback(value: bool):
|
72
116
|
if value:
|
73
117
|
docling_version = importlib.metadata.version("docling")
|
@@ -356,6 +400,12 @@ def convert(
|
|
356
400
|
device: Annotated[
|
357
401
|
AcceleratorDevice, typer.Option(..., help="Accelerator device")
|
358
402
|
] = AcceleratorDevice.AUTO,
|
403
|
+
docling_logo: Annotated[
|
404
|
+
Optional[bool],
|
405
|
+
typer.Option(
|
406
|
+
"--logo", callback=logo_callback, is_eager=True, help="Docling logo"
|
407
|
+
),
|
408
|
+
] = None,
|
359
409
|
):
|
360
410
|
if verbose == 0:
|
361
411
|
logging.basicConfig(level=logging.WARNING)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.28.4
|
3
|
+
Version: 2.29.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -8,15 +8,15 @@ docling/backend/docling_parse_v2_backend.py,sha256=70kXqYhht-A8zb9z5emMe_1i0l9dy
|
|
8
8
|
docling/backend/docling_parse_v4_backend.py,sha256=IECMJQWEvYqQv043_1Ho6dLkCbuaK8cMUsqcxwqruXo,6287
|
9
9
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
docling/backend/docx/latex/latex_dict.py,sha256=
|
12
|
-
docling/backend/docx/latex/omml.py,sha256=
|
13
|
-
docling/backend/html_backend.py,sha256=
|
11
|
+
docling/backend/docx/latex/latex_dict.py,sha256=5pOMY_KyxYmgBZ40IrA4q0t5L6JvXOCx5cVwoQE1lls,6690
|
12
|
+
docling/backend/docx/latex/omml.py,sha256=5zuXYOQ10e9nSTKFURBjoU-XSQZVHsVyIiCsGYGVAk8,12127
|
13
|
+
docling/backend/html_backend.py,sha256=ghPLZfdBEPBzLIO9IWzzx0t1Os9B9r4VyGyEZtMsZVI,19468
|
14
14
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
16
16
|
docling/backend/md_backend.py,sha256=lqDiKIBHGsA0u-H1n9oVpPlrcpVT4gYRuNXXcyGlftM,17219
|
17
17
|
docling/backend/msexcel_backend.py,sha256=_ZVZFKRRijpg-Xz10xNxu2m-NpDaYvoiBqEZP6GbrgE,11095
|
18
|
-
docling/backend/mspowerpoint_backend.py,sha256=
|
19
|
-
docling/backend/msword_backend.py,sha256=
|
18
|
+
docling/backend/mspowerpoint_backend.py,sha256=X55-1anXm562wxAuYn5uwQkqKjirmgrn1KfbeaKUbXw,17273
|
19
|
+
docling/backend/msword_backend.py,sha256=1Yjs8J9vRSNDsgb9IKSKYcbvnoj1hO4Kf_mqncz3Ijs,32103
|
20
20
|
docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
|
21
21
|
docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
|
22
22
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -24,7 +24,7 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
|
|
24
24
|
docling/backend/xml/uspto_backend.py,sha256=H0jwIt2skOke_yEUk0wfXCtodrB-hrj2ygLtB3jMWaI,71056
|
25
25
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
26
26
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
docling/cli/main.py,sha256=
|
27
|
+
docling/cli/main.py,sha256=x8wmu0vb_wwpswdj8EKJyc3EnpVA1wnTJA4bjXRdi5Q,25255
|
28
28
|
docling/cli/models.py,sha256=tM_qbMM3YOPxFU7JlME96MLbtd1CX_bOAK7FS-NhJvY,3979
|
29
29
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
30
30
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -58,7 +58,7 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
|
|
58
58
|
docling/models/rapid_ocr_model.py,sha256=C_I0Ek9mAPIyTFRHuNbqtXg1c15rLNDE1tJ6_hPIi4c,5869
|
59
59
|
docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
|
60
60
|
docling/models/table_structure_model.py,sha256=pvTsqUa5QIANBUfot0XXG1UUeku-eaUi04EPE-Yh2g0,12597
|
61
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
61
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=CZ1W0QbvveIpXO0qSXmXFqz71P4PfLfJBQIqU_Wlg_E,10072
|
62
62
|
docling/models/tesseract_ocr_model.py,sha256=UpLAgKgJtBgbKtJELmKBNMcejJJKBCyFK0q-WgZN1Eg,9256
|
63
63
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
64
|
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
@@ -77,8 +77,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
|
|
77
77
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
78
78
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
79
79
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
80
|
-
docling-2.
|
81
|
-
docling-2.
|
82
|
-
docling-2.
|
83
|
-
docling-2.
|
84
|
-
docling-2.
|
80
|
+
docling-2.29.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
81
|
+
docling-2.29.0.dist-info/METADATA,sha256=PPcVfE4GnjhcLLurofnugm6QLj0EKRuaIuhlPuXYRT8,9982
|
82
|
+
docling-2.29.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
83
|
+
docling-2.29.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
|
84
|
+
docling-2.29.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|