docling 2.56.1__py3-none-any.whl → 2.57.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/msword_backend.py +87 -12
- docling/pipeline/vlm_pipeline.py +53 -33
- {docling-2.56.1.dist-info → docling-2.57.0.dist-info}/METADATA +1 -1
- {docling-2.56.1.dist-info → docling-2.57.0.dist-info}/RECORD +9 -8
- {docling-2.56.1.dist-info → docling-2.57.0.dist-info}/WHEEL +0 -0
- {docling-2.56.1.dist-info → docling-2.57.0.dist-info}/entry_points.txt +0 -0
- {docling-2.56.1.dist-info → docling-2.57.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.56.1.dist-info → docling-2.57.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from tempfile import mkdtemp
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
|
|
8
|
+
import pypdfium2
|
|
9
|
+
from docx.document import Document
|
|
10
|
+
from PIL import Image, ImageChops
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Return the LibreOffice command and optionally test it.

    Candidates are tried in this order; the first one that resolves wins:

    1. the value of the ``DOCLING_LIBREOFFICE_CMD`` environment variable
       (a command name on ``PATH`` or a path to an executable),
    2. ``libreoffice`` on ``PATH``,
    3. ``soffice`` on ``PATH``,
    4. the default macOS application bundle location.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            if nothing was found, and run ``<cmd> -h`` to check that the
            command can actually be executed.

    Returns:
        The resolved command, or ``None`` when LibreOffice was not found and
        ``raise_if_unavailable`` is false.

    Raises:
        RuntimeError: If ``raise_if_unavailable`` is true and no command was
            found.
        subprocess.CalledProcessError: If ``raise_if_unavailable`` is true and
            the ``-h`` probe exits with a non-zero status.
    """
    macos_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # The user-facing warning about missing converters advertises this
    # variable, so an explicit setting takes precedence over auto-detection.
    configured_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")

    libreoffice_cmd = (
        (shutil.which(configured_cmd) if configured_cmd else None)
        or shutil.which("libreoffice")
        or shutil.which("soffice")
        or (macos_cmd if os.path.isfile(macos_cmd) else None)
    )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [
                libreoffice_cmd,
                "-h",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """Detect the best available DOCX to PDF tool and return a conversion function.

    Returns:
        A callable accepting ``(input_path, output_path)`` that writes the PDF
        rendering of ``input_path`` to ``output_path``, or ``None`` if no tool
        is available. Paths may be ``str`` or ``os.PathLike``. The callable
        raises ``subprocess.CalledProcessError`` when the underlying tool
        exits with a non-zero status.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            """Convert ``input_path`` to a PDF stored at ``output_path``."""
            # Normalize to plain strings: callers pass pathlib.Path objects,
            # and a str never compares equal to a Path, which would defeat
            # the "already in place" check below.
            input_path = os.fspath(input_path)
            output_path = os.fspath(output_path)
            output_dir = os.path.dirname(output_path)

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    output_dir,
                    input_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # LibreOffice names the result after the input file; move it to
            # the requested location when the two differ.
            expected_output = os.path.join(
                output_dir,
                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
            )
            if expected_output != output_path:
                # os.replace overwrites an existing target on every platform,
                # whereas os.rename fails on Windows if the target exists.
                os.replace(expected_output, output_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background surrounding the content of an image.

    Args:
        image: The image to crop.
        bg_color: Colour treated as background. When ``None``, the colour of
            the top-left pixel is used.
        padding: Number of pixels of background to keep around the detected
            content, clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no pixel differs from the
        background colour.
    """
    reference_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Pixels equal to the background become zero in the difference image, so
    # its bounding box is the extent of the actual content.
    background = Image.new(image.mode, image.size, reference_color)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    crop_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(crop_box)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of a DOCX document to a cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter``, and the first PDF page is rendered at 2x scale and passed
    through ``crop_whitespace``.

    Args:
        docx: Document to render.
        converter: Callable taking ``(input_path, output_path)`` that converts
            a DOCX file to PDF, or ``None`` when no converter is available.

    Returns:
        The rendered image, or ``None`` if ``converter`` is ``None``.

    Raises:
        Any exception raised by ``converter`` or while loading/rendering the
        PDF is propagated; temporary files and PDF handles are released
        before it reaches the caller.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, including when the conversion
        # or rendering fails, so repeated failures do not accumulate files.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
from copy import deepcopy
|
|
3
4
|
from io import BytesIO
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Any, List, Optional, Union
|
|
6
|
+
from typing import Any, Callable, List, Optional, Union
|
|
6
7
|
|
|
7
8
|
from docling_core.types.doc import (
|
|
8
9
|
DocItemLabel,
|
|
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
|
|
|
33
34
|
from typing_extensions import override
|
|
34
35
|
|
|
35
36
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
37
|
+
from docling.backend.docx.drawingml.utils import (
|
|
38
|
+
get_docx_to_pdf_converter,
|
|
39
|
+
get_libreoffice_cmd,
|
|
40
|
+
get_pil_from_dml_docx,
|
|
41
|
+
)
|
|
36
42
|
from docling.backend.docx.latex.omml import oMath2Latex
|
|
37
43
|
from docling.datamodel.base_models import InputFormat
|
|
38
44
|
from docling.datamodel.document import InputDocument
|
|
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
64
70
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
|
65
71
|
# Track processed textbox elements to avoid duplication
|
|
66
72
|
self.processed_textbox_elements: List[int] = []
|
|
73
|
+
self.docx_to_pdf_converter: Optional[Callable] = None
|
|
74
|
+
self.docx_to_pdf_converter_init = False
|
|
75
|
+
self.display_drawingml_warning = True
|
|
67
76
|
|
|
68
77
|
for i in range(-1, self.max_levels):
|
|
69
78
|
self.parents[i] = None
|
|
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
80
89
|
"indents": [None],
|
|
81
90
|
}
|
|
82
91
|
|
|
83
|
-
self.docx_obj =
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
elif isinstance(self.path_or_stream, Path):
|
|
88
|
-
self.docx_obj = Document(str(self.path_or_stream))
|
|
89
|
-
|
|
92
|
+
self.docx_obj = self.load_msword_file(
|
|
93
|
+
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
|
94
|
+
)
|
|
95
|
+
if self.docx_obj:
|
|
90
96
|
self.valid = True
|
|
91
|
-
except Exception as e:
|
|
92
|
-
raise RuntimeError(
|
|
93
|
-
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
|
94
|
-
) from e
|
|
95
97
|
|
|
96
98
|
@override
|
|
97
99
|
def is_valid(self) -> bool:
|
|
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
139
141
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
|
140
142
|
)
|
|
141
143
|
|
|
144
|
+
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Load a Word document from a byte stream or a filesystem path.

    Args:
        path_or_stream: In-memory stream or path of the DOCX file.
        document_hash: Hash of the document, used only in the error message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither a
        ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: If the document cannot be loaded; the original
            exception is chained as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        if isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
        # Unsupported input type: callers treat a falsy result as "not valid".
        return None
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e
|
|
159
|
+
|
|
142
160
|
def _update_history(
|
|
143
161
|
self,
|
|
144
162
|
name: str,
|
|
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
195
213
|
}
|
|
196
214
|
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
|
197
215
|
drawing_blip = xpath_expr(element)
|
|
216
|
+
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
|
|
198
217
|
|
|
199
218
|
# Check for textbox content - check multiple textbox formats
|
|
200
219
|
# Only process if the element hasn't been processed before
|
|
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
274
293
|
):
|
|
275
294
|
te1 = self._handle_text_elements(element, docx_obj, doc)
|
|
276
295
|
added_elements.extend(te1)
|
|
296
|
+
# Check for DrawingML elements
|
|
297
|
+
elif drawingml_els:
|
|
298
|
+
if (
|
|
299
|
+
self.docx_to_pdf_converter is None
|
|
300
|
+
and self.docx_to_pdf_converter_init is False
|
|
301
|
+
):
|
|
302
|
+
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
|
303
|
+
self.docx_to_pdf_converter_init = True
|
|
304
|
+
|
|
305
|
+
if self.docx_to_pdf_converter is None:
|
|
306
|
+
if self.display_drawingml_warning:
|
|
307
|
+
if self.docx_to_pdf_converter is None:
|
|
308
|
+
_log.warning(
|
|
309
|
+
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
|
310
|
+
"If you want these exported, make sure you have "
|
|
311
|
+
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
|
|
312
|
+
)
|
|
313
|
+
self.display_drawingml_warning = False
|
|
314
|
+
else:
|
|
315
|
+
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
|
277
316
|
# Check for the sdt containers, like table of contents
|
|
278
317
|
elif tag_name in ["sdt"]:
|
|
279
318
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
|
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1381
1420
|
)
|
|
1382
1421
|
elem_ref.append(p3.get_ref())
|
|
1383
1422
|
return elem_ref
|
|
1423
|
+
|
|
1424
|
+
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any) -> None:
    """Render DrawingML elements to an image and add it to ``doc`` as a picture.

    A fresh copy of the source document is loaded and emptied, the given
    DrawingML elements are copied into a single run, and that document is
    converted DOCX -> PDF -> image. If rendering fails, a picture item without
    image data is added instead so the drawing still has a placeholder.

    Args:
        doc: Document receiving the picture item.
        drawingml_els: Iterable of DrawingML XML elements to render.
    """
    # Local import: only needed to name the exception type raised when the
    # external DOCX-to-PDF command exits with a non-zero status.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Copy so the elements stay attached to the original document tree
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError

        doc.add_picture(
            parent=self.parents[level - 1],
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
    except (UnidentifiedImageError, OSError, subprocess.CalledProcessError):
        # A failed converter run must not abort the whole document
        # conversion; fall back to a placeholder picture like the other
        # rendering failures.
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        doc.add_picture(
            parent=self.parents[level - 1],
            caption=None,
        )

    return
|
docling/pipeline/vlm_pipeline.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import List, Optional, Union, cast
|
|
|
6
6
|
|
|
7
7
|
from docling_core.types.doc import (
|
|
8
8
|
BoundingBox,
|
|
9
|
+
ContentLayer,
|
|
9
10
|
DocItem,
|
|
10
11
|
DoclingDocument,
|
|
11
12
|
ImageRef,
|
|
@@ -251,9 +252,9 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
251
252
|
# No code blocks found, return original text
|
|
252
253
|
return text
|
|
253
254
|
|
|
254
|
-
|
|
255
|
-
page_no = pg_idx + 1 # FIXME: might be incorrect
|
|
255
|
+
page_docs = []
|
|
256
256
|
|
|
257
|
+
for pg_idx, page in enumerate(conv_res.pages):
|
|
257
258
|
predicted_text = ""
|
|
258
259
|
if page.predictions.vlm_response:
|
|
259
260
|
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
|
@@ -273,6 +274,24 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
273
274
|
)
|
|
274
275
|
page_doc = backend.convert()
|
|
275
276
|
|
|
277
|
+
# Modify provenance in place for all items in the page document
|
|
278
|
+
for item, level in page_doc.iterate_items(
|
|
279
|
+
with_groups=True,
|
|
280
|
+
traverse_pictures=True,
|
|
281
|
+
included_content_layers=set(ContentLayer),
|
|
282
|
+
):
|
|
283
|
+
if isinstance(item, DocItem):
|
|
284
|
+
item.prov = [
|
|
285
|
+
ProvenanceItem(
|
|
286
|
+
page_no=pg_idx + 1,
|
|
287
|
+
bbox=BoundingBox(
|
|
288
|
+
t=0.0, b=0.0, l=0.0, r=0.0
|
|
289
|
+
), # FIXME: would be nice not to have to "fake" it
|
|
290
|
+
charspan=[0, 0],
|
|
291
|
+
)
|
|
292
|
+
]
|
|
293
|
+
|
|
294
|
+
# Add page metadata to the page document before concatenation
|
|
276
295
|
if page.image is not None:
|
|
277
296
|
pg_width = page.image.width
|
|
278
297
|
pg_height = page.image.height
|
|
@@ -280,27 +299,18 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
280
299
|
pg_width = 1
|
|
281
300
|
pg_height = 1
|
|
282
301
|
|
|
283
|
-
|
|
284
|
-
page_no=
|
|
302
|
+
page_doc.add_page(
|
|
303
|
+
page_no=pg_idx + 1,
|
|
285
304
|
size=Size(width=pg_width, height=pg_height),
|
|
286
305
|
image=ImageRef.from_pil(image=page.image, dpi=72)
|
|
287
306
|
if page.image
|
|
288
307
|
else None,
|
|
289
308
|
)
|
|
290
309
|
|
|
291
|
-
|
|
292
|
-
item.prov = [
|
|
293
|
-
ProvenanceItem(
|
|
294
|
-
page_no=pg_idx + 1,
|
|
295
|
-
bbox=BoundingBox(
|
|
296
|
-
t=0.0, b=0.0, l=0.0, r=0.0
|
|
297
|
-
), # FIXME: would be nice not to have to "fake" it
|
|
298
|
-
charspan=[0, 0],
|
|
299
|
-
)
|
|
300
|
-
]
|
|
301
|
-
conv_res.document.append_child_item(child=item)
|
|
310
|
+
page_docs.append(page_doc)
|
|
302
311
|
|
|
303
|
-
|
|
312
|
+
final_doc = DoclingDocument.concatenate(docs=page_docs)
|
|
313
|
+
return final_doc
|
|
304
314
|
|
|
305
315
|
def _turn_html_into_doc(self, conv_res):
|
|
306
316
|
def _extract_html_code(text):
|
|
@@ -328,9 +338,9 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
328
338
|
# No code blocks found, return original text
|
|
329
339
|
return text
|
|
330
340
|
|
|
331
|
-
|
|
332
|
-
page_no = pg_idx + 1 # FIXME: might be incorrect
|
|
341
|
+
page_docs = []
|
|
333
342
|
|
|
343
|
+
for pg_idx, page in enumerate(conv_res.pages):
|
|
334
344
|
predicted_text = ""
|
|
335
345
|
if page.predictions.vlm_response:
|
|
336
346
|
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
|
@@ -341,7 +351,7 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
341
351
|
out_doc = InputDocument(
|
|
342
352
|
path_or_stream=response_bytes,
|
|
343
353
|
filename=conv_res.input.file.name,
|
|
344
|
-
format=InputFormat.
|
|
354
|
+
format=InputFormat.HTML,
|
|
345
355
|
backend=HTMLDocumentBackend,
|
|
346
356
|
)
|
|
347
357
|
backend = HTMLDocumentBackend(
|
|
@@ -350,6 +360,24 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
350
360
|
)
|
|
351
361
|
page_doc = backend.convert()
|
|
352
362
|
|
|
363
|
+
# Modify provenance in place for all items in the page document
|
|
364
|
+
for item, level in page_doc.iterate_items(
|
|
365
|
+
with_groups=True,
|
|
366
|
+
traverse_pictures=True,
|
|
367
|
+
included_content_layers=set(ContentLayer),
|
|
368
|
+
):
|
|
369
|
+
if isinstance(item, DocItem):
|
|
370
|
+
item.prov = [
|
|
371
|
+
ProvenanceItem(
|
|
372
|
+
page_no=pg_idx + 1,
|
|
373
|
+
bbox=BoundingBox(
|
|
374
|
+
t=0.0, b=0.0, l=0.0, r=0.0
|
|
375
|
+
), # FIXME: would be nice not to have to "fake" it
|
|
376
|
+
charspan=[0, 0],
|
|
377
|
+
)
|
|
378
|
+
]
|
|
379
|
+
|
|
380
|
+
# Add page metadata to the page document before concatenation
|
|
353
381
|
if page.image is not None:
|
|
354
382
|
pg_width = page.image.width
|
|
355
383
|
pg_height = page.image.height
|
|
@@ -357,27 +385,19 @@ class VlmPipeline(PaginatedPipeline):
|
|
|
357
385
|
pg_width = 1
|
|
358
386
|
pg_height = 1
|
|
359
387
|
|
|
360
|
-
|
|
361
|
-
page_no=
|
|
388
|
+
page_doc.add_page(
|
|
389
|
+
page_no=pg_idx + 1,
|
|
362
390
|
size=Size(width=pg_width, height=pg_height),
|
|
363
391
|
image=ImageRef.from_pil(image=page.image, dpi=72)
|
|
364
392
|
if page.image
|
|
365
393
|
else None,
|
|
366
394
|
)
|
|
367
395
|
|
|
368
|
-
|
|
369
|
-
item.prov = [
|
|
370
|
-
ProvenanceItem(
|
|
371
|
-
page_no=pg_idx + 1,
|
|
372
|
-
bbox=BoundingBox(
|
|
373
|
-
t=0.0, b=0.0, l=0.0, r=0.0
|
|
374
|
-
), # FIXME: would be nice not to have to "fake" it
|
|
375
|
-
charspan=[0, 0],
|
|
376
|
-
)
|
|
377
|
-
]
|
|
378
|
-
conv_res.document.append_child_item(child=item)
|
|
396
|
+
page_docs.append(page_doc)
|
|
379
397
|
|
|
380
|
-
|
|
398
|
+
# Concatenate all page documents to preserve hierarchy
|
|
399
|
+
final_doc = DoclingDocument.concatenate(docs=page_docs)
|
|
400
|
+
return final_doc
|
|
381
401
|
|
|
382
402
|
@classmethod
|
|
383
403
|
def get_default_options(cls) -> VlmPipelineOptions:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.57.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -15,12 +15,13 @@ docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY
|
|
|
15
15
|
docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
|
|
16
16
|
docling/backend/msexcel_backend.py,sha256=GOuA-MlShpzFmCmJq3-Z28iquwWUg4k8v-AT4O-aAQI,19305
|
|
17
17
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
|
18
|
-
docling/backend/msword_backend.py,sha256=
|
|
18
|
+
docling/backend/msword_backend.py,sha256=L44vFoSHOtbX-S_lSb8EKW-nzwL_ptVPhNV74ydmwqE,57457
|
|
19
19
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
|
20
20
|
docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
|
|
21
21
|
docling/backend/pypdfium2_backend.py,sha256=AYhWs9S8W_TkAK0-OkRmUNf4HUZl26FP7-XYjwU5zDk,14209
|
|
22
22
|
docling/backend/webvtt_backend.py,sha256=9xPcfWVLuqhEAFrkv8aU36qHnSgjeINZAXT_C9C6XJA,19165
|
|
23
23
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
docling/backend/docx/drawingml/utils.py,sha256=E9Iq8_052eEV5L1IN3ZqFX9eBidH56DKNlh6Tk7Do0I,3640
|
|
24
25
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
26
|
docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
|
|
26
27
|
docling/backend/docx/latex/omml.py,sha256=4vh9FCbXh-Tb6KJGqNwzlMUMYEnnJgBtBI24dwy6t2U,12416
|
|
@@ -88,7 +89,7 @@ docling/pipeline/extraction_vlm_pipeline.py,sha256=veUOTe8nGdnduZKaGn1RRb-NfU1H6
|
|
|
88
89
|
docling/pipeline/simple_pipeline.py,sha256=FSL_ucDd9k0D9DjNKMUkyCULIU8a057dvWfLEPmAc2A,2287
|
|
89
90
|
docling/pipeline/standard_pdf_pipeline.py,sha256=xOge0zP5wli51n_6QLrFHQlwwvsivI7OMt00tht3my4,10479
|
|
90
91
|
docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=i67G5AOW7PIFCe5JS2sdBmPAKvAH6ScxIBhjwOGZcrI,28183
|
|
91
|
-
docling/pipeline/vlm_pipeline.py,sha256=
|
|
92
|
+
docling/pipeline/vlm_pipeline.py,sha256=HSbSoGZyy4eIK8eOL2g_NymrHg8r-DrB2buggJQAqHU,16189
|
|
92
93
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
93
94
|
docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
|
|
94
95
|
docling/utils/api_image_request.py,sha256=kQDmTvQT6M2IgXnGYeoNflI6sLUG6WTCcEft94CRwWg,5379
|
|
@@ -102,9 +103,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
|
102
103
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
|
103
104
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
|
104
105
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
|
105
|
-
docling-2.
|
|
106
|
-
docling-2.
|
|
107
|
-
docling-2.
|
|
108
|
-
docling-2.
|
|
109
|
-
docling-2.
|
|
110
|
-
docling-2.
|
|
106
|
+
docling-2.57.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
107
|
+
docling-2.57.0.dist-info/METADATA,sha256=oDfwFunLJTLSDVastMVq9JkUpIgeKOOVX1MZb6rtqcE,11364
|
|
108
|
+
docling-2.57.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
109
|
+
docling-2.57.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
|
110
|
+
docling-2.57.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
|
111
|
+
docling-2.57.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|