docling 2.8.0__tar.gz → 2.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.8.0 → docling-2.8.2}/PKG-INFO +22 -7
- {docling-2.8.0 → docling-2.8.2}/README.md +20 -4
- {docling-2.8.0 → docling-2.8.2}/docling/cli/main.py +112 -84
- {docling-2.8.0 → docling-2.8.2}/docling/datamodel/base_models.py +3 -8
- {docling-2.8.0 → docling-2.8.2}/docling/datamodel/document.py +2 -2
- {docling-2.8.0 → docling-2.8.2}/docling/datamodel/pipeline_options.py +20 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/tesseract_ocr_cli_model.py +12 -7
- {docling-2.8.0 → docling-2.8.2}/pyproject.toml +9 -5
- {docling-2.8.0 → docling-2.8.2}/LICENSE +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/abstract_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/html_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/md_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/msword_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/pdf_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/cli/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/datamodel/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/datamodel/settings.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/document_converter.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/base_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/base_ocr_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/ds_glm_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/easyocr_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/layout_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/page_assemble_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/table_structure_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/pipeline/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/utils/__init__.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/utils/export.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/utils/layout_utils.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/utils/profiling.py +0 -0
- {docling-2.8.0 → docling-2.8.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.8.0
|
3
|
+
Version: 2.8.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
29
|
-
Requires-Dist: docling-core (>=2.
|
29
|
+
Requires-Dist: docling-core (>=2.6.1,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -39,7 +39,6 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
39
39
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
40
40
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
41
41
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
42
|
-
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
43
42
|
Requires-Dist: pydantic (>=2.0.0,<2.10)
|
44
43
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
45
44
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
@@ -60,7 +59,7 @@ Description-Content-Type: text/markdown
|
|
60
59
|
</a>
|
61
60
|
</p>
|
62
61
|
|
63
|
-
# Docling
|
62
|
+
# 🦆 Docling
|
64
63
|
|
65
64
|
<p align="center">
|
66
65
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
@@ -85,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
85
84
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
86
85
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
87
86
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
88
|
-
* 🤖 Easy integration with LlamaIndex
|
87
|
+
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
89
88
|
* 🔍 OCR support for scanned PDFs
|
90
89
|
* 💻 Simple and convenient CLI
|
91
90
|
|
@@ -121,8 +120,24 @@ result = converter.convert(source)
|
|
121
120
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
122
121
|
```
|
123
122
|
|
124
|
-
|
125
|
-
|
123
|
+
More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
|
124
|
+
the docs.
|
125
|
+
|
126
|
+
## Documentation
|
127
|
+
|
128
|
+
Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
|
129
|
+
installation, usage, concepts, recipes, extensions, and more.
|
130
|
+
|
131
|
+
## Examples
|
132
|
+
|
133
|
+
Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
|
134
|
+
demonstrating how to address different application use cases with Docling.
|
135
|
+
|
136
|
+
## Integrations
|
137
|
+
|
138
|
+
To further accelerate your AI application development, check out Docling's native
|
139
|
+
[integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
|
140
|
+
and tools.
|
126
141
|
|
127
142
|
## Get help and support
|
128
143
|
|
@@ -4,7 +4,7 @@
|
|
4
4
|
</a>
|
5
5
|
</p>
|
6
6
|
|
7
|
-
# Docling
|
7
|
+
# 🦆 Docling
|
8
8
|
|
9
9
|
<p align="center">
|
10
10
|
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
29
29
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
30
30
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
31
31
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
32
|
-
* 🤖 Easy integration with LlamaIndex
|
32
|
+
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
33
33
|
* 🔍 OCR support for scanned PDFs
|
34
34
|
* 💻 Simple and convenient CLI
|
35
35
|
|
@@ -65,8 +65,24 @@ result = converter.convert(source)
|
|
65
65
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
66
66
|
```
|
67
67
|
|
68
|
-
|
69
|
-
|
68
|
+
More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
|
69
|
+
the docs.
|
70
|
+
|
71
|
+
## Documentation
|
72
|
+
|
73
|
+
Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
|
74
|
+
installation, usage, concepts, recipes, extensions, and more.
|
75
|
+
|
76
|
+
## Examples
|
77
|
+
|
78
|
+
Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
|
79
|
+
demonstrating how to address different application use cases with Docling.
|
80
|
+
|
81
|
+
## Integrations
|
82
|
+
|
83
|
+
To further accelerate your AI application development, check out Docling's native
|
84
|
+
[integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
|
85
|
+
and tools.
|
70
86
|
|
71
87
|
## Get help and support
|
72
88
|
|
@@ -2,6 +2,7 @@ import importlib
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import re
|
5
|
+
import tempfile
|
5
6
|
import time
|
6
7
|
import warnings
|
7
8
|
from enum import Enum
|
@@ -9,7 +10,7 @@ from pathlib import Path
|
|
9
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
10
11
|
|
11
12
|
import typer
|
12
|
-
from docling_core.utils.file import
|
13
|
+
from docling_core.utils.file import resolve_source_to_path
|
13
14
|
|
14
15
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
15
16
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -32,6 +33,7 @@ from docling.datamodel.pipeline_options import (
|
|
32
33
|
TesseractCliOcrOptions,
|
33
34
|
TesseractOcrOptions,
|
34
35
|
)
|
36
|
+
from docling.datamodel.settings import settings
|
35
37
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
36
38
|
|
37
39
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
@@ -212,6 +214,24 @@ def convert(
|
|
212
214
|
help="Set the verbosity level. -v for info logging, -vv for debug logging.",
|
213
215
|
),
|
214
216
|
] = 0,
|
217
|
+
debug_visualize_cells: Annotated[
|
218
|
+
bool,
|
219
|
+
typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
|
220
|
+
] = False,
|
221
|
+
debug_visualize_ocr: Annotated[
|
222
|
+
bool,
|
223
|
+
typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
|
224
|
+
] = False,
|
225
|
+
debug_visualize_layout: Annotated[
|
226
|
+
bool,
|
227
|
+
typer.Option(
|
228
|
+
..., help="Enable debug output which visualizes the layour clusters"
|
229
|
+
),
|
230
|
+
] = False,
|
231
|
+
debug_visualize_tables: Annotated[
|
232
|
+
bool,
|
233
|
+
typer.Option(..., help="Enable debug output which visualizes the table cells"),
|
234
|
+
] = False,
|
215
235
|
version: Annotated[
|
216
236
|
Optional[bool],
|
217
237
|
typer.Option(
|
@@ -229,98 +249,106 @@ def convert(
|
|
229
249
|
elif verbose == 2:
|
230
250
|
logging.basicConfig(level=logging.DEBUG)
|
231
251
|
|
252
|
+
settings.debug.visualize_cells = debug_visualize_cells
|
253
|
+
settings.debug.visualize_layout = debug_visualize_layout
|
254
|
+
settings.debug.visualize_tables = debug_visualize_tables
|
255
|
+
settings.debug.visualize_ocr = debug_visualize_ocr
|
256
|
+
|
232
257
|
if from_formats is None:
|
233
258
|
from_formats = [e for e in InputFormat]
|
234
259
|
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
for
|
246
|
-
|
247
|
-
|
260
|
+
with tempfile.TemporaryDirectory() as tempdir:
|
261
|
+
input_doc_paths: List[Path] = []
|
262
|
+
for src in input_sources:
|
263
|
+
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
264
|
+
if not source.exists():
|
265
|
+
err_console.print(
|
266
|
+
f"[red]Error: The input file {source} does not exist.[/red]"
|
267
|
+
)
|
268
|
+
raise typer.Abort()
|
269
|
+
elif source.is_dir():
|
270
|
+
for fmt in from_formats:
|
271
|
+
for ext in FormatToExtensions[fmt]:
|
272
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
273
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
274
|
+
else:
|
275
|
+
input_doc_paths.append(source)
|
276
|
+
|
277
|
+
if to_formats is None:
|
278
|
+
to_formats = [OutputFormat.MARKDOWN]
|
279
|
+
|
280
|
+
export_json = OutputFormat.JSON in to_formats
|
281
|
+
export_md = OutputFormat.MARKDOWN in to_formats
|
282
|
+
export_txt = OutputFormat.TEXT in to_formats
|
283
|
+
export_doctags = OutputFormat.DOCTAGS in to_formats
|
284
|
+
|
285
|
+
if ocr_engine == OcrEngine.EASYOCR:
|
286
|
+
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
287
|
+
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
288
|
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
289
|
+
elif ocr_engine == OcrEngine.TESSERACT:
|
290
|
+
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
291
|
+
elif ocr_engine == OcrEngine.OCRMAC:
|
292
|
+
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
293
|
+
elif ocr_engine == OcrEngine.RAPIDOCR:
|
294
|
+
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
248
295
|
else:
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
if ocr_engine == OcrEngine.EASYOCR:
|
260
|
-
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
261
|
-
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
262
|
-
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
263
|
-
elif ocr_engine == OcrEngine.TESSERACT:
|
264
|
-
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
265
|
-
elif ocr_engine == OcrEngine.OCRMAC:
|
266
|
-
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
267
|
-
elif ocr_engine == OcrEngine.RAPIDOCR:
|
268
|
-
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
269
|
-
else:
|
270
|
-
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
271
|
-
|
272
|
-
ocr_lang_list = _split_list(ocr_lang)
|
273
|
-
if ocr_lang_list is not None:
|
274
|
-
ocr_options.lang = ocr_lang_list
|
275
|
-
|
276
|
-
pipeline_options = PdfPipelineOptions(
|
277
|
-
do_ocr=ocr,
|
278
|
-
ocr_options=ocr_options,
|
279
|
-
do_table_structure=True,
|
280
|
-
)
|
281
|
-
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
282
|
-
pipeline_options.table_structure_options.mode = table_mode
|
283
|
-
|
284
|
-
if artifacts_path is not None:
|
285
|
-
pipeline_options.artifacts_path = artifacts_path
|
286
|
-
|
287
|
-
if pdf_backend == PdfBackend.DLPARSE_V1:
|
288
|
-
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
289
|
-
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
290
|
-
backend = DoclingParseV2DocumentBackend
|
291
|
-
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
292
|
-
backend = PyPdfiumDocumentBackend
|
293
|
-
else:
|
294
|
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
295
|
-
|
296
|
-
format_options: Dict[InputFormat, FormatOption] = {
|
297
|
-
InputFormat.PDF: PdfFormatOption(
|
298
|
-
pipeline_options=pipeline_options,
|
299
|
-
backend=backend, # pdf_backend
|
296
|
+
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
297
|
+
|
298
|
+
ocr_lang_list = _split_list(ocr_lang)
|
299
|
+
if ocr_lang_list is not None:
|
300
|
+
ocr_options.lang = ocr_lang_list
|
301
|
+
|
302
|
+
pipeline_options = PdfPipelineOptions(
|
303
|
+
do_ocr=ocr,
|
304
|
+
ocr_options=ocr_options,
|
305
|
+
do_table_structure=True,
|
300
306
|
)
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
)
|
307
|
+
pipeline_options.table_structure_options.do_cell_matching = (
|
308
|
+
True # do_cell_matching
|
309
|
+
)
|
310
|
+
pipeline_options.table_structure_options.mode = table_mode
|
306
311
|
|
307
|
-
|
312
|
+
if artifacts_path is not None:
|
313
|
+
pipeline_options.artifacts_path = artifacts_path
|
308
314
|
|
309
|
-
|
310
|
-
|
311
|
-
|
315
|
+
if pdf_backend == PdfBackend.DLPARSE_V1:
|
316
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
317
|
+
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
318
|
+
backend = DoclingParseV2DocumentBackend
|
319
|
+
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
320
|
+
backend = PyPdfiumDocumentBackend
|
321
|
+
else:
|
322
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
312
323
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
324
|
+
format_options: Dict[InputFormat, FormatOption] = {
|
325
|
+
InputFormat.PDF: PdfFormatOption(
|
326
|
+
pipeline_options=pipeline_options,
|
327
|
+
backend=backend, # pdf_backend
|
328
|
+
)
|
329
|
+
}
|
330
|
+
doc_converter = DocumentConverter(
|
331
|
+
allowed_formats=from_formats,
|
332
|
+
format_options=format_options,
|
333
|
+
)
|
334
|
+
|
335
|
+
start_time = time.time()
|
336
|
+
|
337
|
+
conv_results = doc_converter.convert_all(
|
338
|
+
input_doc_paths, raises_on_error=abort_on_error
|
339
|
+
)
|
340
|
+
|
341
|
+
output.mkdir(parents=True, exist_ok=True)
|
342
|
+
export_documents(
|
343
|
+
conv_results,
|
344
|
+
output_dir=output,
|
345
|
+
export_json=export_json,
|
346
|
+
export_md=export_md,
|
347
|
+
export_txt=export_txt,
|
348
|
+
export_doctags=export_doctags,
|
349
|
+
)
|
322
350
|
|
323
|
-
|
351
|
+
end_time = time.time() - start_time
|
324
352
|
|
325
353
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
326
354
|
|
@@ -1,5 +1,4 @@
|
|
1
1
|
from enum import Enum, auto
|
2
|
-
from io import BytesIO
|
3
2
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
3
|
|
5
4
|
from docling_core.types.doc import (
|
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
|
9
8
|
Size,
|
10
9
|
TableCell,
|
11
10
|
)
|
11
|
+
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
12
|
+
DocumentStream,
|
13
|
+
)
|
12
14
|
from PIL.Image import Image
|
13
15
|
from pydantic import BaseModel, ConfigDict
|
14
16
|
|
@@ -207,10 +209,3 @@ class Page(BaseModel):
|
|
207
209
|
@property
|
208
210
|
def image(self) -> Optional[Image]:
|
209
211
|
return self.get_image(scale=self._default_image_scale)
|
210
|
-
|
211
|
-
|
212
|
-
class DocumentStream(BaseModel):
|
213
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
214
|
-
|
215
|
-
name: str
|
216
|
-
stream: BytesIO
|
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
|
32
32
|
)
|
33
33
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
34
34
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
35
|
-
from docling_core.utils.file import
|
35
|
+
from docling_core.utils.file import resolve_source_to_stream
|
36
36
|
from pydantic import BaseModel
|
37
37
|
from typing_extensions import deprecated
|
38
38
|
|
@@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
|
|
459
459
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
460
460
|
) -> Iterable[InputDocument]:
|
461
461
|
for item in self.path_or_stream_iterator:
|
462
|
-
obj =
|
462
|
+
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
463
463
|
format = self._guess_format(obj)
|
464
464
|
if format not in format_options.keys():
|
465
465
|
_log.info(
|
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
6
6
|
|
7
7
|
|
8
8
|
class TableFormerMode(str, Enum):
|
9
|
+
"""Modes for the TableFormer model."""
|
10
|
+
|
9
11
|
FAST = "fast"
|
10
12
|
ACCURATE = "accurate"
|
11
13
|
|
12
14
|
|
13
15
|
class TableStructureOptions(BaseModel):
|
16
|
+
"""Options for the table structure."""
|
17
|
+
|
14
18
|
do_cell_matching: bool = (
|
15
19
|
True
|
16
20
|
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
|
|
21
25
|
|
22
26
|
|
23
27
|
class OcrOptions(BaseModel):
|
28
|
+
"""OCR options."""
|
29
|
+
|
24
30
|
kind: str
|
25
31
|
lang: List[str]
|
26
32
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
|
|
30
36
|
|
31
37
|
|
32
38
|
class RapidOcrOptions(OcrOptions):
|
39
|
+
"""Options for the RapidOCR engine."""
|
40
|
+
|
33
41
|
kind: Literal["rapidocr"] = "rapidocr"
|
34
42
|
|
35
43
|
# English and chinese are the most commly used models and have been tested with RapidOCR.
|
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
|
|
66
74
|
|
67
75
|
|
68
76
|
class EasyOcrOptions(OcrOptions):
|
77
|
+
"""Options for the EasyOCR engine."""
|
78
|
+
|
69
79
|
kind: Literal["easyocr"] = "easyocr"
|
70
80
|
lang: List[str] = ["fr", "de", "es", "en"]
|
71
81
|
use_gpu: bool = True # same default as easyocr.Reader
|
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
|
|
79
89
|
|
80
90
|
|
81
91
|
class TesseractCliOcrOptions(OcrOptions):
|
92
|
+
"""Options for the TesseractCli engine."""
|
93
|
+
|
82
94
|
kind: Literal["tesseract"] = "tesseract"
|
83
95
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
84
96
|
tesseract_cmd: str = "tesseract"
|
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
90
102
|
|
91
103
|
|
92
104
|
class TesseractOcrOptions(OcrOptions):
|
105
|
+
"""Options for the Tesseract engine."""
|
106
|
+
|
93
107
|
kind: Literal["tesserocr"] = "tesserocr"
|
94
108
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
95
109
|
path: Optional[str] = None
|
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
|
|
100
114
|
|
101
115
|
|
102
116
|
class OcrMacOptions(OcrOptions):
|
117
|
+
"""Options for the Mac OCR engine."""
|
118
|
+
|
103
119
|
kind: Literal["ocrmac"] = "ocrmac"
|
104
120
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
105
121
|
recognition: str = "accurate"
|
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
|
|
111
127
|
|
112
128
|
|
113
129
|
class PipelineOptions(BaseModel):
|
130
|
+
"""Base pipeline options."""
|
131
|
+
|
114
132
|
create_legacy_output: bool = (
|
115
133
|
True # This defautl will be set to False on a future version of docling
|
116
134
|
)
|
117
135
|
|
118
136
|
|
119
137
|
class PdfPipelineOptions(PipelineOptions):
|
138
|
+
"""Options for the PDF pipeline."""
|
139
|
+
|
120
140
|
artifacts_path: Optional[Union[Path, str]] = None
|
121
141
|
do_table_structure: bool = True # True: perform table structure extraction
|
122
142
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
@@ -1,5 +1,7 @@
|
|
1
|
+
import csv
|
1
2
|
import io
|
2
3
|
import logging
|
4
|
+
import os
|
3
5
|
import tempfile
|
4
6
|
from subprocess import DEVNULL, PIPE, Popen
|
5
7
|
from typing import Iterable, Optional, Tuple
|
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
95
97
|
# _log.info(decoded_data)
|
96
98
|
|
97
99
|
# Read the TSV file generated by Tesseract
|
98
|
-
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
100
|
+
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
99
101
|
|
100
102
|
# Display the dataframe (optional)
|
101
103
|
# _log.info("df: ", df.head())
|
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
130
132
|
high_res_image = page._backend.get_page_image(
|
131
133
|
scale=self.scale, cropbox=ocr_rect
|
132
134
|
)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
135
|
+
try:
|
136
|
+
with tempfile.NamedTemporaryFile(
|
137
|
+
suffix=".png", mode="w+b", delete=False
|
138
|
+
) as image_file:
|
139
|
+
fname = image_file.name
|
140
|
+
high_res_image.save(image_file)
|
139
141
|
|
140
142
|
df = self._run_tesseract(fname)
|
143
|
+
finally:
|
144
|
+
if os.path.exists(fname):
|
145
|
+
os.remove(fname)
|
141
146
|
|
142
147
|
# _log.info(df)
|
143
148
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.8.0"
|
3
|
+
version = "2.8.2" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
28
|
pydantic = ">=2.0.0,<2.10"
|
29
|
-
docling-core = "^2.
|
29
|
+
docling-core = "^2.6.1"
|
30
30
|
docling-ibm-models = "^2.0.6"
|
31
31
|
deepsearch-glm = "^0.26.1"
|
32
32
|
filetype = "^1.2.0"
|
@@ -40,7 +40,6 @@ docling-parse = "^2.0.5"
|
|
40
40
|
certifi = ">=2024.7.4"
|
41
41
|
rtree = "^1.3.0"
|
42
42
|
scipy = "^1.6.0"
|
43
|
-
pyarrow = "^16.1.0"
|
44
43
|
typer = "^0.12.5"
|
45
44
|
python-docx = "^1.1.2"
|
46
45
|
python-pptx = "^1.0.2"
|
@@ -81,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
|
|
81
80
|
mkdocs-material = "^9.5.40"
|
82
81
|
mkdocs-jupyter = "^0.25.0"
|
83
82
|
mkdocs-click = "^0.8.1"
|
83
|
+
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
|
84
|
+
griffe-pydantic = "^1.1.0"
|
84
85
|
|
85
86
|
[tool.poetry.group.examples.dependencies]
|
86
87
|
datasets = "^2.21.0"
|
@@ -89,10 +90,13 @@ langchain-huggingface = "^0.0.3"
|
|
89
90
|
langchain-milvus = "^0.1.4"
|
90
91
|
langchain-text-splitters = "^0.2.4"
|
91
92
|
|
93
|
+
[tool.poetry.group.constraints]
|
94
|
+
optional = true
|
95
|
+
|
92
96
|
[tool.poetry.group.constraints.dependencies]
|
93
97
|
numpy = [
|
94
|
-
{ version = "
|
95
|
-
{ version = "
|
98
|
+
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
|
99
|
+
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
|
96
100
|
]
|
97
101
|
|
98
102
|
[tool.poetry.group.mac_intel]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|