docling 2.8.3__tar.gz → 2.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.8.3 → docling-2.9.0}/PKG-INFO +5 -5
- {docling-2.8.3 → docling-2.9.0}/README.md +2 -2
- {docling-2.8.3 → docling-2.9.0}/docling/backend/msword_backend.py +43 -27
- docling-2.9.0/docling/chunking/__init__.py +12 -0
- {docling-2.8.3 → docling-2.9.0}/docling/cli/main.py +76 -23
- {docling-2.8.3 → docling-2.9.0}/docling/datamodel/base_models.py +1 -0
- docling-2.9.0/docling/py.typed +1 -0
- {docling-2.8.3 → docling-2.9.0}/pyproject.toml +3 -3
- {docling-2.8.3 → docling-2.9.0}/LICENSE +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/html_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/md_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/cli/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/datamodel/document.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/datamodel/settings.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/document_converter.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/exceptions.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/base_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/layout_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/utils/__init__.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/utils/export.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/utils/layout_utils.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/utils/profiling.py +0 -0
- {docling-2.8.3 → docling-2.9.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.8.3
|
3
|
+
Version: 2.9.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
29
|
-
Requires-Dist: docling-core (>=2.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.8.0,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
39
39
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
40
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
41
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
42
|
-
Requires-Dist: pydantic (>=2.0.0,<
|
42
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
43
43
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
44
44
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
45
45
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
|
|
59
59
|
</a>
|
60
60
|
</p>
|
61
61
|
|
62
|
-
#
|
62
|
+
# Docling
|
63
63
|
|
64
64
|
<p align="center">
|
65
65
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
81
81
|
|
82
82
|
## Features
|
83
83
|
|
84
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
84
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
85
85
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
86
86
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
87
87
|
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
@@ -4,7 +4,7 @@
|
|
4
4
|
</a>
|
5
5
|
</p>
|
6
6
|
|
7
|
-
#
|
7
|
+
# Docling
|
8
8
|
|
9
9
|
<p align="center">
|
10
10
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
@@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
26
26
|
|
27
27
|
## Features
|
28
28
|
|
29
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
29
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
30
30
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
31
31
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
32
32
|
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Set, Union
|
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
133
134
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
134
135
|
for element in body:
|
135
136
|
tag_name = etree.QName(element).localname
|
136
|
-
|
137
137
|
# Check for Inline Images (blip elements)
|
138
138
|
namespaces = {
|
139
139
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
153
153
|
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
154
154
|
# Check for Text
|
155
155
|
elif tag_name in ["p"]:
|
156
|
+
# "tcPr", "sectPr"
|
156
157
|
self.handle_text_elements(element, docx_obj, doc)
|
157
158
|
else:
|
158
159
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
166
167
|
except ValueError:
|
167
168
|
return default
|
168
169
|
|
170
|
+
def split_text_and_number(self, input_string):
|
171
|
+
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
172
|
+
if match:
|
173
|
+
parts = list(filter(None, match.groups()))
|
174
|
+
return parts
|
175
|
+
else:
|
176
|
+
return [input_string]
|
177
|
+
|
169
178
|
def get_numId_and_ilvl(self, paragraph):
|
170
179
|
# Access the XML element of the paragraph
|
171
180
|
numPr = paragraph._element.find(
|
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
188
197
|
def get_label_and_level(self, paragraph):
|
189
198
|
if paragraph.style is None:
|
190
199
|
return "Normal", None
|
191
|
-
label = paragraph.style.
|
200
|
+
label = paragraph.style.style_id
|
192
201
|
if label is None:
|
193
202
|
return "Normal", None
|
194
203
|
if ":" in label:
|
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
197
206
|
if len(parts) == 2:
|
198
207
|
return parts[0], int(parts[1])
|
199
208
|
|
200
|
-
parts =
|
209
|
+
parts = self.split_text_and_number(label)
|
201
210
|
|
202
211
|
if "Heading" in label and len(parts) == 2:
|
203
212
|
parts.sort()
|
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
219
228
|
if paragraph.text is None:
|
220
229
|
return
|
221
230
|
text = paragraph.text.strip()
|
222
|
-
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
223
231
|
|
224
232
|
# Common styles for bullet and numbered lists.
|
225
233
|
# "List Bullet", "List Number", "List Paragraph"
|
226
234
|
# Identify whether list is a numbered list or not
|
227
235
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
228
236
|
is_numbered = False
|
229
|
-
|
237
|
+
p_style_id, p_level = self.get_label_and_level(paragraph)
|
230
238
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
231
239
|
|
232
240
|
if numid == 0:
|
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
238
246
|
element,
|
239
247
|
docx_obj,
|
240
248
|
doc,
|
241
|
-
|
249
|
+
p_style_id,
|
242
250
|
p_level,
|
243
251
|
numid,
|
244
252
|
ilevel,
|
245
253
|
text,
|
246
254
|
is_numbered,
|
247
255
|
)
|
248
|
-
self.update_history(
|
256
|
+
self.update_history(p_style_id, p_level, numid, ilevel)
|
249
257
|
return
|
250
258
|
elif numid is None and self.prev_numid() is not None: # Close list
|
251
259
|
for key, val in self.parents.items():
|
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
253
261
|
self.parents[key] = None
|
254
262
|
self.level = self.level_at_new_list - 1
|
255
263
|
self.level_at_new_list = None
|
256
|
-
if
|
264
|
+
if p_style_id in ["Title"]:
|
257
265
|
for key, val in self.parents.items():
|
258
266
|
self.parents[key] = None
|
259
267
|
self.parents[0] = doc.add_text(
|
260
268
|
parent=None, label=DocItemLabel.TITLE, text=text
|
261
269
|
)
|
262
|
-
elif "Heading" in
|
263
|
-
self.add_header(element, docx_obj, doc,
|
270
|
+
elif "Heading" in p_style_id:
|
271
|
+
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
|
264
272
|
|
265
|
-
elif
|
273
|
+
elif p_style_id in [
|
266
274
|
"Paragraph",
|
267
275
|
"Normal",
|
268
276
|
"Subtitle",
|
269
277
|
"Author",
|
270
|
-
"
|
271
|
-
"
|
272
|
-
"
|
278
|
+
"DefaultText",
|
279
|
+
"ListParagraph",
|
280
|
+
"ListBullet",
|
273
281
|
"Quote",
|
274
282
|
]:
|
275
283
|
level = self.get_level()
|
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
285
293
|
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
286
294
|
)
|
287
295
|
|
288
|
-
self.update_history(
|
296
|
+
self.update_history(p_style_id, p_level, numid, ilevel)
|
289
297
|
return
|
290
298
|
|
291
299
|
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
292
300
|
level = self.get_level()
|
293
301
|
if isinstance(curr_level, int):
|
294
|
-
|
295
302
|
if curr_level > level:
|
296
|
-
|
297
303
|
# add invisible group
|
298
304
|
for i in range(level, curr_level):
|
299
305
|
self.parents[i] = doc.add_group(
|
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
301
307
|
label=GroupLabel.SECTION,
|
302
308
|
name=f"header-{i}",
|
303
309
|
)
|
304
|
-
|
305
310
|
elif curr_level < level:
|
306
|
-
|
307
311
|
# remove the tail
|
308
312
|
for key, val in self.parents.items():
|
309
313
|
if key >= curr_level:
|
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
314
318
|
text=text,
|
315
319
|
level=curr_level,
|
316
320
|
)
|
317
|
-
|
318
321
|
else:
|
319
322
|
self.parents[self.level] = doc.add_heading(
|
320
323
|
parent=self.parents[self.level - 1],
|
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
328
331
|
element,
|
329
332
|
docx_obj,
|
330
333
|
doc,
|
331
|
-
|
334
|
+
p_style_id,
|
332
335
|
p_level,
|
333
336
|
numid,
|
334
337
|
ilevel,
|
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
346
349
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
347
350
|
)
|
348
351
|
|
349
|
-
#
|
352
|
+
# Set marker and enumerated arguments if this is an enumeration element.
|
350
353
|
self.listIter += 1
|
351
354
|
if is_numbered:
|
352
355
|
enum_marker = str(self.listIter) + "."
|
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
365
368
|
self.level_at_new_list + self.prev_indent() + 1,
|
366
369
|
self.level_at_new_list + ilevel + 1,
|
367
370
|
):
|
368
|
-
#
|
369
|
-
#
|
371
|
+
# Determine if this is an unordered list or an ordered list.
|
372
|
+
# Set GroupLabel.ORDERED_LIST when it fits.
|
370
373
|
self.listIter = 0
|
371
374
|
if is_numbered:
|
372
375
|
self.parents[i] = doc.add_group(
|
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
467
470
|
row_span = get_rowspan(cell)
|
468
471
|
col_span = get_colspan(cell)
|
469
472
|
|
473
|
+
cell_text = cell.text
|
474
|
+
# In case cell doesn't return text via docx library:
|
475
|
+
if len(cell_text) == 0:
|
476
|
+
cell_xml = cell._element
|
477
|
+
|
478
|
+
texts = [""]
|
479
|
+
for elem in cell_xml.iter():
|
480
|
+
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
481
|
+
if elem.text:
|
482
|
+
texts.append(elem.text)
|
483
|
+
# Join the collected text
|
484
|
+
cell_text = " ".join(texts).strip()
|
485
|
+
|
470
486
|
# Find the next available column in the grid
|
471
487
|
while table_grid[row_idx][col_idx] is not None:
|
472
488
|
col_idx += 1
|
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
477
493
|
table_grid[row_idx + i][col_idx + j] = ""
|
478
494
|
|
479
495
|
cell = TableCell(
|
480
|
-
text=
|
496
|
+
text=cell_text,
|
481
497
|
row_span=row_span,
|
482
498
|
col_span=col_span,
|
483
499
|
start_row_offset_idx=row_idx,
|
484
500
|
end_row_offset_idx=row_idx + row_span,
|
485
501
|
start_col_offset_idx=col_idx,
|
486
502
|
end_col_offset_idx=col_idx + col_span,
|
487
|
-
col_header=False,
|
488
|
-
row_header=False,
|
503
|
+
col_header=False,
|
504
|
+
row_header=False,
|
489
505
|
)
|
490
506
|
|
491
507
|
data.table_cells.append(cell)
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
3
|
+
# SPDX-License-Identifier: MIT
|
4
|
+
#
|
5
|
+
|
6
|
+
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
|
7
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
8
|
+
DocChunk,
|
9
|
+
DocMeta,
|
10
|
+
HierarchicalChunker,
|
11
|
+
)
|
12
|
+
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
@@ -10,7 +10,9 @@ from pathlib import Path
|
|
10
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
11
11
|
|
12
12
|
import typer
|
13
|
+
from docling_core.types.doc import ImageRefMode
|
13
14
|
from docling_core.utils.file import resolve_source_to_path
|
15
|
+
from pydantic import TypeAdapter, ValidationError
|
14
16
|
|
15
17
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
16
18
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -86,9 +88,11 @@ def export_documents(
|
|
86
88
|
conv_results: Iterable[ConversionResult],
|
87
89
|
output_dir: Path,
|
88
90
|
export_json: bool,
|
91
|
+
export_html: bool,
|
89
92
|
export_md: bool,
|
90
93
|
export_txt: bool,
|
91
94
|
export_doctags: bool,
|
95
|
+
image_export_mode: ImageRefMode,
|
92
96
|
):
|
93
97
|
|
94
98
|
success_count = 0
|
@@ -99,33 +103,45 @@ def export_documents(
|
|
99
103
|
success_count += 1
|
100
104
|
doc_filename = conv_res.input.file.stem
|
101
105
|
|
102
|
-
# Export
|
106
|
+
# Export JSON format:
|
103
107
|
if export_json:
|
104
108
|
fname = output_dir / f"{doc_filename}.json"
|
105
|
-
|
106
|
-
|
107
|
-
|
109
|
+
_log.info(f"writing JSON output to {fname}")
|
110
|
+
conv_res.document.save_as_json(
|
111
|
+
filename=fname, image_mode=image_export_mode
|
112
|
+
)
|
113
|
+
|
114
|
+
# Export HTML format:
|
115
|
+
if export_html:
|
116
|
+
fname = output_dir / f"{doc_filename}.html"
|
117
|
+
_log.info(f"writing HTML output to {fname}")
|
118
|
+
conv_res.document.save_as_html(
|
119
|
+
filename=fname, image_mode=image_export_mode
|
120
|
+
)
|
108
121
|
|
109
122
|
# Export Text format:
|
110
123
|
if export_txt:
|
111
124
|
fname = output_dir / f"{doc_filename}.txt"
|
112
|
-
|
113
|
-
|
114
|
-
|
125
|
+
_log.info(f"writing TXT output to {fname}")
|
126
|
+
conv_res.document.save_as_markdown(
|
127
|
+
filename=fname,
|
128
|
+
strict_text=True,
|
129
|
+
image_mode=ImageRefMode.PLACEHOLDER,
|
130
|
+
)
|
115
131
|
|
116
132
|
# Export Markdown format:
|
117
133
|
if export_md:
|
118
134
|
fname = output_dir / f"{doc_filename}.md"
|
119
|
-
|
120
|
-
|
121
|
-
|
135
|
+
_log.info(f"writing Markdown output to {fname}")
|
136
|
+
conv_res.document.save_as_markdown(
|
137
|
+
filename=fname, image_mode=image_export_mode
|
138
|
+
)
|
122
139
|
|
123
140
|
# Export Document Tags format:
|
124
141
|
if export_doctags:
|
125
142
|
fname = output_dir / f"{doc_filename}.doctags"
|
126
|
-
|
127
|
-
|
128
|
-
fp.write(conv_res.document.export_to_document_tokens())
|
143
|
+
_log.info(f"writing Doc Tags output to {fname}")
|
144
|
+
conv_res.document.save_as_document_tokens(filename=fname)
|
129
145
|
|
130
146
|
else:
|
131
147
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
@@ -160,6 +176,13 @@ def convert(
|
|
160
176
|
to_formats: List[OutputFormat] = typer.Option(
|
161
177
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
162
178
|
),
|
179
|
+
image_export_mode: Annotated[
|
180
|
+
ImageRefMode,
|
181
|
+
typer.Option(
|
182
|
+
...,
|
183
|
+
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
184
|
+
),
|
185
|
+
] = ImageRefMode.EMBEDDED,
|
163
186
|
ocr: Annotated[
|
164
187
|
bool,
|
165
188
|
typer.Option(
|
@@ -260,24 +283,45 @@ def convert(
|
|
260
283
|
with tempfile.TemporaryDirectory() as tempdir:
|
261
284
|
input_doc_paths: List[Path] = []
|
262
285
|
for src in input_sources:
|
263
|
-
|
264
|
-
|
286
|
+
try:
|
287
|
+
# check if we can fetch some remote url
|
288
|
+
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
289
|
+
input_doc_paths.append(source)
|
290
|
+
except FileNotFoundError:
|
265
291
|
err_console.print(
|
266
|
-
f"[red]Error: The input file {
|
292
|
+
f"[red]Error: The input file {src} does not exist.[/red]"
|
267
293
|
)
|
268
294
|
raise typer.Abort()
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
295
|
+
except IsADirectoryError:
|
296
|
+
# if the input matches to a file or a folder
|
297
|
+
try:
|
298
|
+
local_path = TypeAdapter(Path).validate_python(src)
|
299
|
+
if local_path.exists() and local_path.is_dir():
|
300
|
+
for fmt in from_formats:
|
301
|
+
for ext in FormatToExtensions[fmt]:
|
302
|
+
input_doc_paths.extend(
|
303
|
+
list(local_path.glob(f"**/*.{ext}"))
|
304
|
+
)
|
305
|
+
input_doc_paths.extend(
|
306
|
+
list(local_path.glob(f"**/*.{ext.upper()}"))
|
307
|
+
)
|
308
|
+
elif local_path.exists():
|
309
|
+
input_doc_paths.append(local_path)
|
310
|
+
else:
|
311
|
+
err_console.print(
|
312
|
+
f"[red]Error: The input file {src} does not exist.[/red]"
|
313
|
+
)
|
314
|
+
raise typer.Abort()
|
315
|
+
except Exception as err:
|
316
|
+
err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
|
317
|
+
_log.info(err) # will print more details if verbose is activated
|
318
|
+
raise typer.Abort()
|
276
319
|
|
277
320
|
if to_formats is None:
|
278
321
|
to_formats = [OutputFormat.MARKDOWN]
|
279
322
|
|
280
323
|
export_json = OutputFormat.JSON in to_formats
|
324
|
+
export_html = OutputFormat.HTML in to_formats
|
281
325
|
export_md = OutputFormat.MARKDOWN in to_formats
|
282
326
|
export_txt = OutputFormat.TEXT in to_formats
|
283
327
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
@@ -309,6 +353,13 @@ def convert(
|
|
309
353
|
)
|
310
354
|
pipeline_options.table_structure_options.mode = table_mode
|
311
355
|
|
356
|
+
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
357
|
+
pipeline_options.generate_page_images = True
|
358
|
+
pipeline_options.generate_picture_images = (
|
359
|
+
True # FIXME: to be deprecated in version 3
|
360
|
+
)
|
361
|
+
pipeline_options.images_scale = 2
|
362
|
+
|
312
363
|
if artifacts_path is not None:
|
313
364
|
pipeline_options.artifacts_path = artifacts_path
|
314
365
|
|
@@ -343,9 +394,11 @@ def convert(
|
|
343
394
|
conv_results,
|
344
395
|
output_dir=output,
|
345
396
|
export_json=export_json,
|
397
|
+
export_html=export_html,
|
346
398
|
export_md=export_md,
|
347
399
|
export_txt=export_txt,
|
348
400
|
export_doctags=export_doctags,
|
401
|
+
image_export_mode=image_export_mode,
|
349
402
|
)
|
350
403
|
|
351
404
|
end_time = time.time() - start_time
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.8.3"
|
3
|
+
version = "2.9.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -25,8 +25,8 @@ packages = [{include = "docling"}]
|
|
25
25
|
# actual dependencies:
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
|
-
|
29
|
-
|
28
|
+
docling-core = { version = "^2.8.0", extras = ["chunking"] }
|
29
|
+
pydantic = "^2.0.0"
|
30
30
|
docling-ibm-models = "^2.0.6"
|
31
31
|
deepsearch-glm = "^0.26.1"
|
32
32
|
filetype = "^1.2.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|