docling 2.8.2__tar.gz → 2.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.8.2 → docling-2.9.0}/PKG-INFO +5 -5
- {docling-2.8.2 → docling-2.9.0}/README.md +2 -2
- {docling-2.8.2 → docling-2.9.0}/docling/backend/msword_backend.py +43 -27
- docling-2.9.0/docling/chunking/__init__.py +12 -0
- {docling-2.8.2 → docling-2.9.0}/docling/cli/main.py +76 -23
- {docling-2.8.2 → docling-2.9.0}/docling/datamodel/base_models.py +3 -0
- {docling-2.8.2 → docling-2.9.0}/docling/datamodel/document.py +24 -10
- {docling-2.8.2 → docling-2.9.0}/docling/document_converter.py +103 -83
- docling-2.9.0/docling/exceptions.py +6 -0
- docling-2.9.0/docling/py.typed +1 -0
- {docling-2.8.2 → docling-2.9.0}/pyproject.toml +3 -3
- {docling-2.8.2 → docling-2.9.0}/LICENSE +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/html_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/md_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/cli/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/datamodel/settings.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/base_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/layout_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/utils/__init__.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/utils/export.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/utils/layout_utils.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/utils/profiling.py +0 -0
- {docling-2.8.2 → docling-2.9.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.9.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
29
|
-
Requires-Dist: docling-core (>=2.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.8.0,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
39
39
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
40
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
41
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
42
|
-
Requires-Dist: pydantic (>=2.0.0,<
|
42
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
43
43
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
44
44
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
45
45
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
|
|
59
59
|
</a>
|
60
60
|
</p>
|
61
61
|
|
62
|
-
#
|
62
|
+
# Docling
|
63
63
|
|
64
64
|
<p align="center">
|
65
65
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
81
81
|
|
82
82
|
## Features
|
83
83
|
|
84
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
84
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
85
85
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
86
86
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
87
87
|
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
@@ -4,7 +4,7 @@
|
|
4
4
|
</a>
|
5
5
|
</p>
|
6
6
|
|
7
|
-
#
|
7
|
+
# Docling
|
8
8
|
|
9
9
|
<p align="center">
|
10
10
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
@@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
26
26
|
|
27
27
|
## Features
|
28
28
|
|
29
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
29
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
30
30
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
31
31
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
32
32
|
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Set, Union
|
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
133
134
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
134
135
|
for element in body:
|
135
136
|
tag_name = etree.QName(element).localname
|
136
|
-
|
137
137
|
# Check for Inline Images (blip elements)
|
138
138
|
namespaces = {
|
139
139
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
153
153
|
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
154
154
|
# Check for Text
|
155
155
|
elif tag_name in ["p"]:
|
156
|
+
# "tcPr", "sectPr"
|
156
157
|
self.handle_text_elements(element, docx_obj, doc)
|
157
158
|
else:
|
158
159
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
166
167
|
except ValueError:
|
167
168
|
return default
|
168
169
|
|
170
|
+
def split_text_and_number(self, input_string):
|
171
|
+
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
172
|
+
if match:
|
173
|
+
parts = list(filter(None, match.groups()))
|
174
|
+
return parts
|
175
|
+
else:
|
176
|
+
return [input_string]
|
177
|
+
|
169
178
|
def get_numId_and_ilvl(self, paragraph):
|
170
179
|
# Access the XML element of the paragraph
|
171
180
|
numPr = paragraph._element.find(
|
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
188
197
|
def get_label_and_level(self, paragraph):
|
189
198
|
if paragraph.style is None:
|
190
199
|
return "Normal", None
|
191
|
-
label = paragraph.style.
|
200
|
+
label = paragraph.style.style_id
|
192
201
|
if label is None:
|
193
202
|
return "Normal", None
|
194
203
|
if ":" in label:
|
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
197
206
|
if len(parts) == 2:
|
198
207
|
return parts[0], int(parts[1])
|
199
208
|
|
200
|
-
parts =
|
209
|
+
parts = self.split_text_and_number(label)
|
201
210
|
|
202
211
|
if "Heading" in label and len(parts) == 2:
|
203
212
|
parts.sort()
|
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
219
228
|
if paragraph.text is None:
|
220
229
|
return
|
221
230
|
text = paragraph.text.strip()
|
222
|
-
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
223
231
|
|
224
232
|
# Common styles for bullet and numbered lists.
|
225
233
|
# "List Bullet", "List Number", "List Paragraph"
|
226
234
|
# Identify wether list is a numbered list or not
|
227
235
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
228
236
|
is_numbered = False
|
229
|
-
|
237
|
+
p_style_id, p_level = self.get_label_and_level(paragraph)
|
230
238
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
231
239
|
|
232
240
|
if numid == 0:
|
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
238
246
|
element,
|
239
247
|
docx_obj,
|
240
248
|
doc,
|
241
|
-
|
249
|
+
p_style_id,
|
242
250
|
p_level,
|
243
251
|
numid,
|
244
252
|
ilevel,
|
245
253
|
text,
|
246
254
|
is_numbered,
|
247
255
|
)
|
248
|
-
self.update_history(
|
256
|
+
self.update_history(p_style_id, p_level, numid, ilevel)
|
249
257
|
return
|
250
258
|
elif numid is None and self.prev_numid() is not None: # Close list
|
251
259
|
for key, val in self.parents.items():
|
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
253
261
|
self.parents[key] = None
|
254
262
|
self.level = self.level_at_new_list - 1
|
255
263
|
self.level_at_new_list = None
|
256
|
-
if
|
264
|
+
if p_style_id in ["Title"]:
|
257
265
|
for key, val in self.parents.items():
|
258
266
|
self.parents[key] = None
|
259
267
|
self.parents[0] = doc.add_text(
|
260
268
|
parent=None, label=DocItemLabel.TITLE, text=text
|
261
269
|
)
|
262
|
-
elif "Heading" in
|
263
|
-
self.add_header(element, docx_obj, doc,
|
270
|
+
elif "Heading" in p_style_id:
|
271
|
+
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
|
264
272
|
|
265
|
-
elif
|
273
|
+
elif p_style_id in [
|
266
274
|
"Paragraph",
|
267
275
|
"Normal",
|
268
276
|
"Subtitle",
|
269
277
|
"Author",
|
270
|
-
"
|
271
|
-
"
|
272
|
-
"
|
278
|
+
"DefaultText",
|
279
|
+
"ListParagraph",
|
280
|
+
"ListBullet",
|
273
281
|
"Quote",
|
274
282
|
]:
|
275
283
|
level = self.get_level()
|
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
285
293
|
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
286
294
|
)
|
287
295
|
|
288
|
-
self.update_history(
|
296
|
+
self.update_history(p_style_id, p_level, numid, ilevel)
|
289
297
|
return
|
290
298
|
|
291
299
|
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
292
300
|
level = self.get_level()
|
293
301
|
if isinstance(curr_level, int):
|
294
|
-
|
295
302
|
if curr_level > level:
|
296
|
-
|
297
303
|
# add invisible group
|
298
304
|
for i in range(level, curr_level):
|
299
305
|
self.parents[i] = doc.add_group(
|
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
301
307
|
label=GroupLabel.SECTION,
|
302
308
|
name=f"header-{i}",
|
303
309
|
)
|
304
|
-
|
305
310
|
elif curr_level < level:
|
306
|
-
|
307
311
|
# remove the tail
|
308
312
|
for key, val in self.parents.items():
|
309
313
|
if key >= curr_level:
|
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
314
318
|
text=text,
|
315
319
|
level=curr_level,
|
316
320
|
)
|
317
|
-
|
318
321
|
else:
|
319
322
|
self.parents[self.level] = doc.add_heading(
|
320
323
|
parent=self.parents[self.level - 1],
|
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
328
331
|
element,
|
329
332
|
docx_obj,
|
330
333
|
doc,
|
331
|
-
|
334
|
+
p_style_id,
|
332
335
|
p_level,
|
333
336
|
numid,
|
334
337
|
ilevel,
|
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
346
349
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
347
350
|
)
|
348
351
|
|
349
|
-
#
|
352
|
+
# Set marker and enumerated arguments if this is an enumeration element.
|
350
353
|
self.listIter += 1
|
351
354
|
if is_numbered:
|
352
355
|
enum_marker = str(self.listIter) + "."
|
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
365
368
|
self.level_at_new_list + self.prev_indent() + 1,
|
366
369
|
self.level_at_new_list + ilevel + 1,
|
367
370
|
):
|
368
|
-
#
|
369
|
-
#
|
371
|
+
# Determine if this is an unordered list or an ordered list.
|
372
|
+
# Set GroupLabel.ORDERED_LIST when it fits.
|
370
373
|
self.listIter = 0
|
371
374
|
if is_numbered:
|
372
375
|
self.parents[i] = doc.add_group(
|
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
467
470
|
row_span = get_rowspan(cell)
|
468
471
|
col_span = get_colspan(cell)
|
469
472
|
|
473
|
+
cell_text = cell.text
|
474
|
+
# In case cell doesn't return text via docx library:
|
475
|
+
if len(cell_text) == 0:
|
476
|
+
cell_xml = cell._element
|
477
|
+
|
478
|
+
texts = [""]
|
479
|
+
for elem in cell_xml.iter():
|
480
|
+
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
481
|
+
if elem.text:
|
482
|
+
texts.append(elem.text)
|
483
|
+
# Join the collected text
|
484
|
+
cell_text = " ".join(texts).strip()
|
485
|
+
|
470
486
|
# Find the next available column in the grid
|
471
487
|
while table_grid[row_idx][col_idx] is not None:
|
472
488
|
col_idx += 1
|
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
477
493
|
table_grid[row_idx + i][col_idx + j] = ""
|
478
494
|
|
479
495
|
cell = TableCell(
|
480
|
-
text=
|
496
|
+
text=cell_text,
|
481
497
|
row_span=row_span,
|
482
498
|
col_span=col_span,
|
483
499
|
start_row_offset_idx=row_idx,
|
484
500
|
end_row_offset_idx=row_idx + row_span,
|
485
501
|
start_col_offset_idx=col_idx,
|
486
502
|
end_col_offset_idx=col_idx + col_span,
|
487
|
-
col_header=False,
|
488
|
-
row_header=False,
|
503
|
+
col_header=False,
|
504
|
+
row_header=False,
|
489
505
|
)
|
490
506
|
|
491
507
|
data.table_cells.append(cell)
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
3
|
+
# SPDX-License-Identifier: MIT
|
4
|
+
#
|
5
|
+
|
6
|
+
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
|
7
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
8
|
+
DocChunk,
|
9
|
+
DocMeta,
|
10
|
+
HierarchicalChunker,
|
11
|
+
)
|
12
|
+
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
@@ -10,7 +10,9 @@ from pathlib import Path
|
|
10
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
11
11
|
|
12
12
|
import typer
|
13
|
+
from docling_core.types.doc import ImageRefMode
|
13
14
|
from docling_core.utils.file import resolve_source_to_path
|
15
|
+
from pydantic import TypeAdapter, ValidationError
|
14
16
|
|
15
17
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
16
18
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -86,9 +88,11 @@ def export_documents(
|
|
86
88
|
conv_results: Iterable[ConversionResult],
|
87
89
|
output_dir: Path,
|
88
90
|
export_json: bool,
|
91
|
+
export_html: bool,
|
89
92
|
export_md: bool,
|
90
93
|
export_txt: bool,
|
91
94
|
export_doctags: bool,
|
95
|
+
image_export_mode: ImageRefMode,
|
92
96
|
):
|
93
97
|
|
94
98
|
success_count = 0
|
@@ -99,33 +103,45 @@ def export_documents(
|
|
99
103
|
success_count += 1
|
100
104
|
doc_filename = conv_res.input.file.stem
|
101
105
|
|
102
|
-
# Export
|
106
|
+
# Export JSON format:
|
103
107
|
if export_json:
|
104
108
|
fname = output_dir / f"{doc_filename}.json"
|
105
|
-
|
106
|
-
|
107
|
-
|
109
|
+
_log.info(f"writing JSON output to {fname}")
|
110
|
+
conv_res.document.save_as_json(
|
111
|
+
filename=fname, image_mode=image_export_mode
|
112
|
+
)
|
113
|
+
|
114
|
+
# Export HTML format:
|
115
|
+
if export_html:
|
116
|
+
fname = output_dir / f"{doc_filename}.html"
|
117
|
+
_log.info(f"writing HTML output to {fname}")
|
118
|
+
conv_res.document.save_as_html(
|
119
|
+
filename=fname, image_mode=image_export_mode
|
120
|
+
)
|
108
121
|
|
109
122
|
# Export Text format:
|
110
123
|
if export_txt:
|
111
124
|
fname = output_dir / f"{doc_filename}.txt"
|
112
|
-
|
113
|
-
|
114
|
-
|
125
|
+
_log.info(f"writing TXT output to {fname}")
|
126
|
+
conv_res.document.save_as_markdown(
|
127
|
+
filename=fname,
|
128
|
+
strict_text=True,
|
129
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
130
|
+
)
|
115
131
|
|
116
132
|
# Export Markdown format:
|
117
133
|
if export_md:
|
118
134
|
fname = output_dir / f"{doc_filename}.md"
|
119
|
-
|
120
|
-
|
121
|
-
|
135
|
+
_log.info(f"writing Markdown output to {fname}")
|
136
|
+
conv_res.document.save_as_markdown(
|
137
|
+
filename=fname, image_mode=image_export_mode
|
138
|
+
)
|
122
139
|
|
123
140
|
# Export Document Tags format:
|
124
141
|
if export_doctags:
|
125
142
|
fname = output_dir / f"{doc_filename}.doctags"
|
126
|
-
|
127
|
-
|
128
|
-
fp.write(conv_res.document.export_to_document_tokens())
|
143
|
+
_log.info(f"writing Doc Tags output to {fname}")
|
144
|
+
conv_res.document.save_as_document_tokens(filename=fname)
|
129
145
|
|
130
146
|
else:
|
131
147
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
@@ -160,6 +176,13 @@ def convert(
|
|
160
176
|
to_formats: List[OutputFormat] = typer.Option(
|
161
177
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
162
178
|
),
|
179
|
+
image_export_mode: Annotated[
|
180
|
+
ImageRefMode,
|
181
|
+
typer.Option(
|
182
|
+
...,
|
183
|
+
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
184
|
+
),
|
185
|
+
] = ImageRefMode.EMBEDDED,
|
163
186
|
ocr: Annotated[
|
164
187
|
bool,
|
165
188
|
typer.Option(
|
@@ -260,24 +283,45 @@ def convert(
|
|
260
283
|
with tempfile.TemporaryDirectory() as tempdir:
|
261
284
|
input_doc_paths: List[Path] = []
|
262
285
|
for src in input_sources:
|
263
|
-
|
264
|
-
|
286
|
+
try:
|
287
|
+
# check if we can fetch some remote url
|
288
|
+
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
289
|
+
input_doc_paths.append(source)
|
290
|
+
except FileNotFoundError:
|
265
291
|
err_console.print(
|
266
|
-
f"[red]Error: The input file {
|
292
|
+
f"[red]Error: The input file {src} does not exist.[/red]"
|
267
293
|
)
|
268
294
|
raise typer.Abort()
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
295
|
+
except IsADirectoryError:
|
296
|
+
# if the input matches to a file or a folder
|
297
|
+
try:
|
298
|
+
local_path = TypeAdapter(Path).validate_python(src)
|
299
|
+
if local_path.exists() and local_path.is_dir():
|
300
|
+
for fmt in from_formats:
|
301
|
+
for ext in FormatToExtensions[fmt]:
|
302
|
+
input_doc_paths.extend(
|
303
|
+
list(local_path.glob(f"**/*.{ext}"))
|
304
|
+
)
|
305
|
+
input_doc_paths.extend(
|
306
|
+
list(local_path.glob(f"**/*.{ext.upper()}"))
|
307
|
+
)
|
308
|
+
elif local_path.exists():
|
309
|
+
input_doc_paths.append(local_path)
|
310
|
+
else:
|
311
|
+
err_console.print(
|
312
|
+
f"[red]Error: The input file {src} does not exist.[/red]"
|
313
|
+
)
|
314
|
+
raise typer.Abort()
|
315
|
+
except Exception as err:
|
316
|
+
err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
|
317
|
+
_log.info(err) # will print more details if verbose is activated
|
318
|
+
raise typer.Abort()
|
276
319
|
|
277
320
|
if to_formats is None:
|
278
321
|
to_formats = [OutputFormat.MARKDOWN]
|
279
322
|
|
280
323
|
export_json = OutputFormat.JSON in to_formats
|
324
|
+
export_html = OutputFormat.HTML in to_formats
|
281
325
|
export_md = OutputFormat.MARKDOWN in to_formats
|
282
326
|
export_txt = OutputFormat.TEXT in to_formats
|
283
327
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
@@ -309,6 +353,13 @@ def convert(
|
|
309
353
|
)
|
310
354
|
pipeline_options.table_structure_options.mode = table_mode
|
311
355
|
|
356
|
+
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
357
|
+
pipeline_options.generate_page_images = True
|
358
|
+
pipeline_options.generate_picture_images = (
|
359
|
+
True # FIXME: to be deprecated in verson 3
|
360
|
+
)
|
361
|
+
pipeline_options.images_scale = 2
|
362
|
+
|
312
363
|
if artifacts_path is not None:
|
313
364
|
pipeline_options.artifacts_path = artifacts_path
|
314
365
|
|
@@ -343,9 +394,11 @@ def convert(
|
|
343
394
|
conv_results,
|
344
395
|
output_dir=output,
|
345
396
|
export_json=export_json,
|
397
|
+
export_html=export_html,
|
346
398
|
export_md=export_md,
|
347
399
|
export_txt=export_txt,
|
348
400
|
export_doctags=export_doctags,
|
401
|
+
image_export_mode=image_export_mode,
|
349
402
|
)
|
350
403
|
|
351
404
|
end_time = time.time() - start_time
|
@@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
|
|
24
24
|
FAILURE = auto()
|
25
25
|
SUCCESS = auto()
|
26
26
|
PARTIAL_SUCCESS = auto()
|
27
|
+
SKIPPED = auto()
|
27
28
|
|
28
29
|
|
29
30
|
class InputFormat(str, Enum):
|
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
|
|
40
41
|
class OutputFormat(str, Enum):
|
41
42
|
MARKDOWN = "md"
|
42
43
|
JSON = "json"
|
44
|
+
HTML = "html"
|
43
45
|
TEXT = "text"
|
44
46
|
DOCTAGS = "doctags"
|
45
47
|
|
@@ -95,6 +97,7 @@ class DoclingComponentType(str, Enum):
|
|
95
97
|
DOCUMENT_BACKEND = auto()
|
96
98
|
MODEL = auto()
|
97
99
|
DOC_ASSEMBLER = auto()
|
100
|
+
USER_INPUT = auto()
|
98
101
|
|
99
102
|
|
100
103
|
class ErrorItem(BaseModel):
|
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
9
|
from docling_core.types.doc import (
|
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
|
164
164
|
backend: Type[AbstractDocumentBackend],
|
165
165
|
path_or_stream: Union[BytesIO, Path],
|
166
166
|
) -> None:
|
167
|
-
if backend is None:
|
168
|
-
raise RuntimeError(
|
169
|
-
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
170
|
-
f"Please check your format configuration on DocumentConverter."
|
171
|
-
)
|
172
|
-
|
173
167
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
174
168
|
if not self._backend.is_valid():
|
175
169
|
self.valid = False
|
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
|
450
444
|
return ds_doc
|
451
445
|
|
452
446
|
|
447
|
+
class _DummyBackend(AbstractDocumentBackend):
|
448
|
+
def __init__(self, *args, **kwargs):
|
449
|
+
super().__init__(*args, **kwargs)
|
450
|
+
|
451
|
+
def is_valid(self) -> bool:
|
452
|
+
return False
|
453
|
+
|
454
|
+
@classmethod
|
455
|
+
def supported_formats(cls) -> Set[InputFormat]:
|
456
|
+
return set()
|
457
|
+
|
458
|
+
@classmethod
|
459
|
+
def supports_pagination(cls) -> bool:
|
460
|
+
return False
|
461
|
+
|
462
|
+
def unload(self):
|
463
|
+
return super().unload()
|
464
|
+
|
465
|
+
|
453
466
|
class _DocumentConversionInput(BaseModel):
|
454
467
|
|
455
468
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
@@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
|
|
461
474
|
for item in self.path_or_stream_iterator:
|
462
475
|
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
463
476
|
format = self._guess_format(obj)
|
477
|
+
backend: Type[AbstractDocumentBackend]
|
464
478
|
if format not in format_options.keys():
|
465
|
-
_log.
|
466
|
-
f"
|
479
|
+
_log.error(
|
480
|
+
f"Input document {obj.name} does not match any allowed format."
|
467
481
|
)
|
468
|
-
|
482
|
+
backend = _DummyBackend
|
469
483
|
else:
|
470
484
|
backend = format_options[format].backend
|
471
485
|
|
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
-
from docling.datamodel.base_models import
|
18
|
+
from docling.datamodel.base_models import (
|
19
|
+
ConversionStatus,
|
20
|
+
DoclingComponentType,
|
21
|
+
DocumentStream,
|
22
|
+
ErrorItem,
|
23
|
+
InputFormat,
|
24
|
+
)
|
19
25
|
from docling.datamodel.document import (
|
20
26
|
ConversionResult,
|
21
27
|
InputDocument,
|
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
|
23
29
|
)
|
24
30
|
from docling.datamodel.pipeline_options import PipelineOptions
|
25
31
|
from docling.datamodel.settings import DocumentLimits, settings
|
32
|
+
from docling.exceptions import ConversionError
|
26
33
|
from docling.pipeline.base_pipeline import BasePipeline
|
27
34
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
28
35
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
|
|
85
92
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
86
93
|
|
87
94
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
95
|
+
def _get_default_option(format: InputFormat) -> FormatOption:
|
96
|
+
format_to_default_options = {
|
97
|
+
InputFormat.XLSX: FormatOption(
|
98
|
+
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
99
|
+
),
|
100
|
+
InputFormat.DOCX: FormatOption(
|
101
|
+
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
102
|
+
),
|
103
|
+
InputFormat.PPTX: FormatOption(
|
104
|
+
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
105
|
+
),
|
106
|
+
InputFormat.MD: FormatOption(
|
107
|
+
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
108
|
+
),
|
109
|
+
InputFormat.ASCIIDOC: FormatOption(
|
110
|
+
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
111
|
+
),
|
112
|
+
InputFormat.HTML: FormatOption(
|
113
|
+
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
114
|
+
),
|
115
|
+
InputFormat.IMAGE: FormatOption(
|
116
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
117
|
+
),
|
118
|
+
InputFormat.PDF: FormatOption(
|
119
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
120
|
+
),
|
121
|
+
}
|
122
|
+
if (options := format_to_default_options.get(format)) is not None:
|
123
|
+
return options
|
124
|
+
else:
|
125
|
+
raise RuntimeError(f"No default options configured for {format}")
|
114
126
|
|
115
127
|
|
116
128
|
class DocumentConverter:
|
@@ -121,36 +133,26 @@ class DocumentConverter:
|
|
121
133
|
allowed_formats: Optional[List[InputFormat]] = None,
|
122
134
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
123
135
|
):
|
124
|
-
self.allowed_formats =
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
else:
|
136
|
-
for f in self.allowed_formats:
|
137
|
-
if f not in self.format_to_options.keys():
|
138
|
-
_log.debug(f"Requested format {f} will use default options.")
|
139
|
-
self.format_to_options[f] = _format_to_default_options[f]
|
140
|
-
|
141
|
-
remove_keys = []
|
142
|
-
for f in self.format_to_options.keys():
|
143
|
-
if f not in self.allowed_formats:
|
144
|
-
remove_keys.append(f)
|
145
|
-
|
146
|
-
for f in remove_keys:
|
147
|
-
self.format_to_options.pop(f)
|
148
|
-
|
136
|
+
self.allowed_formats = (
|
137
|
+
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
138
|
+
)
|
139
|
+
self.format_to_options = {
|
140
|
+
format: (
|
141
|
+
_get_default_option(format=format)
|
142
|
+
if (custom_option := (format_options or {}).get(format)) is None
|
143
|
+
else custom_option
|
144
|
+
)
|
145
|
+
for format in self.allowed_formats
|
146
|
+
}
|
149
147
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
150
148
|
|
151
149
|
def initialize_pipeline(self, format: InputFormat):
|
152
150
|
"""Initialize the conversion pipeline for the selected format."""
|
153
|
-
self._get_pipeline(doc_format=format)
|
151
|
+
pipeline = self._get_pipeline(doc_format=format)
|
152
|
+
if pipeline is None:
|
153
|
+
raise ConversionError(
|
154
|
+
f"No pipeline could be initialized for format {format}"
|
155
|
+
)
|
154
156
|
|
155
157
|
@validate_call(config=ConfigDict(strict=True))
|
156
158
|
def convert(
|
@@ -186,22 +188,28 @@ class DocumentConverter:
|
|
186
188
|
limits=limits,
|
187
189
|
)
|
188
190
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
191
|
+
|
192
|
+
had_result = False
|
189
193
|
for conv_res in conv_res_iter:
|
194
|
+
had_result = True
|
190
195
|
if raises_on_error and conv_res.status not in {
|
191
196
|
ConversionStatus.SUCCESS,
|
192
197
|
ConversionStatus.PARTIAL_SUCCESS,
|
193
198
|
}:
|
194
|
-
raise
|
199
|
+
raise ConversionError(
|
195
200
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
196
201
|
)
|
197
202
|
else:
|
198
203
|
yield conv_res
|
199
204
|
|
205
|
+
if not had_result and raises_on_error:
|
206
|
+
raise ConversionError(
|
207
|
+
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
208
|
+
)
|
209
|
+
|
200
210
|
def _convert(
|
201
211
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
202
212
|
) -> Iterator[ConversionResult]:
|
203
|
-
assert self.format_to_options is not None
|
204
|
-
|
205
213
|
start_time = time.monotonic()
|
206
214
|
|
207
215
|
for input_batch in chunkify(
|
@@ -223,27 +231,22 @@ class DocumentConverter:
|
|
223
231
|
):
|
224
232
|
elapsed = time.monotonic() - start_time
|
225
233
|
start_time = time.monotonic()
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
)
|
231
|
-
yield item
|
232
|
-
else:
|
233
|
-
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
234
|
+
_log.info(
|
235
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
236
|
+
)
|
237
|
+
yield item
|
234
238
|
|
235
239
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
236
|
-
assert self.format_to_options is not None
|
237
|
-
|
238
240
|
fopt = self.format_to_options.get(doc_format)
|
239
241
|
|
240
242
|
if fopt is None:
|
241
|
-
|
243
|
+
return None
|
242
244
|
else:
|
243
245
|
pipeline_class = fopt.pipeline_cls
|
244
246
|
pipeline_options = fopt.pipeline_options
|
245
247
|
|
246
|
-
|
248
|
+
if pipeline_options is None:
|
249
|
+
return None
|
247
250
|
# TODO this will ignore if different options have been defined for the same pipeline class.
|
248
251
|
if (
|
249
252
|
pipeline_class not in self.initialized_pipelines
|
@@ -257,11 +260,26 @@ class DocumentConverter:
|
|
257
260
|
|
258
261
|
def _process_document(
|
259
262
|
self, in_doc: InputDocument, raises_on_error: bool
|
260
|
-
) ->
|
261
|
-
assert self.allowed_formats is not None
|
262
|
-
assert in_doc.format in self.allowed_formats
|
263
|
+
) -> ConversionResult:
|
263
264
|
|
264
|
-
|
265
|
+
valid = (
|
266
|
+
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
267
|
+
)
|
268
|
+
if valid:
|
269
|
+
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
270
|
+
else:
|
271
|
+
error_message = f"File format not allowed: {in_doc.file}"
|
272
|
+
if raises_on_error:
|
273
|
+
raise ConversionError(error_message)
|
274
|
+
else:
|
275
|
+
error_item = ErrorItem(
|
276
|
+
component_type=DoclingComponentType.USER_INPUT,
|
277
|
+
module_name="",
|
278
|
+
error_message=error_message,
|
279
|
+
)
|
280
|
+
conv_res = ConversionResult(
|
281
|
+
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
282
|
+
)
|
265
283
|
|
266
284
|
return conv_res
|
267
285
|
|
@@ -270,26 +288,28 @@ class DocumentConverter:
|
|
270
288
|
) -> ConversionResult:
|
271
289
|
if in_doc.valid:
|
272
290
|
pipeline = self._get_pipeline(in_doc.format)
|
273
|
-
if pipeline is None:
|
291
|
+
if pipeline is not None:
|
292
|
+
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
293
|
+
else:
|
274
294
|
if raises_on_error:
|
275
|
-
raise
|
295
|
+
raise ConversionError(
|
276
296
|
f"No pipeline could be initialized for {in_doc.file}."
|
277
297
|
)
|
278
298
|
else:
|
279
|
-
conv_res = ConversionResult(
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
284
|
-
|
299
|
+
conv_res = ConversionResult(
|
300
|
+
input=in_doc,
|
301
|
+
status=ConversionStatus.FAILURE,
|
302
|
+
)
|
285
303
|
else:
|
286
304
|
if raises_on_error:
|
287
|
-
raise
|
305
|
+
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
288
306
|
|
289
307
|
else:
|
290
308
|
# invalid doc or not of desired format
|
291
|
-
conv_res = ConversionResult(
|
292
|
-
|
309
|
+
conv_res = ConversionResult(
|
310
|
+
input=in_doc,
|
311
|
+
status=ConversionStatus.FAILURE,
|
312
|
+
)
|
293
313
|
# TODO add error log why it failed.
|
294
314
|
|
295
315
|
return conv_res
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.
|
3
|
+
version = "2.9.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -25,8 +25,8 @@ packages = [{include = "docling"}]
|
|
25
25
|
# actual dependencies:
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
|
-
|
29
|
-
|
28
|
+
docling-core = { version = "^2.8.0", extras = ["chunking"] }
|
29
|
+
pydantic = "^2.0.0"
|
30
30
|
docling-ibm-models = "^2.0.6"
|
31
31
|
deepsearch-glm = "^0.26.1"
|
32
32
|
filetype = "^1.2.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|