docling 2.4.0__tar.gz → 2.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.4.0 → docling-2.4.2}/PKG-INFO +16 -9
- {docling-2.4.0 → docling-2.4.2}/README.md +15 -8
- {docling-2.4.0 → docling-2.4.2}/docling/backend/docling_parse_backend.py +1 -1
- {docling-2.4.0 → docling-2.4.2}/docling/backend/docling_parse_v2_backend.py +1 -1
- {docling-2.4.0 → docling-2.4.2}/docling/backend/pypdfium2_backend.py +1 -1
- {docling-2.4.0 → docling-2.4.2}/docling/models/easyocr_model.py +5 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/tesseract_ocr_model.py +19 -7
- {docling-2.4.0 → docling-2.4.2}/pyproject.toml +1 -1
- {docling-2.4.0 → docling-2.4.2}/LICENSE +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/abstract_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/html_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/md_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/msword_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/backend/pdf_backend.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/cli/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/cli/main.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/datamodel/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/datamodel/base_models.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/datamodel/document.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/datamodel/settings.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/document_converter.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/base_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/base_ocr_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/ds_glm_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/layout_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/page_assemble_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/table_structure_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/pipeline/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/utils/__init__.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/utils/export.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/utils/layout_utils.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/utils/profiling.py +0 -0
- {docling-2.4.0 → docling-2.4.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.4.0
|
3
|
+
Version: 2.4.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -53,6 +53,10 @@ Description-Content-Type: text/markdown
|
|
53
53
|
|
54
54
|
# Docling
|
55
55
|
|
56
|
+
<p align="center">
|
57
|
+
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
58
|
+
</p>
|
59
|
+
|
56
60
|
[](https://arxiv.org/abs/2408.09869)
|
57
61
|
[](https://ds4sd.github.io/docling/)
|
58
62
|
[](https://pypi.org/project/docling/)
|
@@ -66,19 +70,22 @@ Description-Content-Type: text/markdown
|
|
66
70
|
|
67
71
|
Docling parses documents and exports them to the desired format with ease and speed.
|
68
72
|
|
69
|
-
|
70
73
|
## Features
|
71
74
|
|
72
75
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
|
73
76
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
74
77
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
75
|
-
*
|
76
|
-
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
78
|
+
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
|
77
79
|
* 🔍 OCR support for scanned PDFs
|
78
80
|
* 💻 Simple and convenient CLI
|
79
81
|
|
80
82
|
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
|
81
83
|
|
84
|
+
### Coming soon
|
85
|
+
|
86
|
+
* ♾️ Equation & code extraction
|
87
|
+
* 📝 Metadata extraction, including title, authors, references & language
|
88
|
+
* 🦜🔗 Native LangChain extension
|
82
89
|
|
83
90
|
## Installation
|
84
91
|
|
@@ -104,16 +111,13 @@ result = converter.convert(source)
|
|
104
111
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
105
112
|
```
|
106
113
|
|
107
|
-
|
108
114
|
Check out [Getting started](https://ds4sd.github.io/docling/).
|
109
115
|
You will find lots of tuning options to leverage all the advanced capabilities.
|
110
116
|
|
111
|
-
|
112
117
|
## Get help and support
|
113
118
|
|
114
119
|
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
115
120
|
|
116
|
-
|
117
121
|
## Technical report
|
118
122
|
|
119
123
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -122,7 +126,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
|
|
122
126
|
|
123
127
|
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
124
128
|
|
125
|
-
|
126
129
|
## References
|
127
130
|
|
128
131
|
If you use Docling in your projects, please consider citing the following:
|
@@ -142,6 +145,10 @@ If you use Docling in your projects, please consider citing the following:
|
|
142
145
|
|
143
146
|
## License
|
144
147
|
|
145
|
-
The Docling codebase is under MIT license.
|
148
|
+
The Docling codebase is under MIT license.
|
146
149
|
For individual model usage, please refer to the model licenses found in the original packages.
|
147
150
|
|
151
|
+
## IBM ❤️ Open Source AI
|
152
|
+
|
153
|
+
Docling has been brought to you by IBM.
|
154
|
+
|
@@ -6,6 +6,10 @@
|
|
6
6
|
|
7
7
|
# Docling
|
8
8
|
|
9
|
+
<p align="center">
|
10
|
+
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
11
|
+
</p>
|
12
|
+
|
9
13
|
[](https://arxiv.org/abs/2408.09869)
|
10
14
|
[](https://ds4sd.github.io/docling/)
|
11
15
|
[](https://pypi.org/project/docling/)
|
@@ -19,19 +23,22 @@
|
|
19
23
|
|
20
24
|
Docling parses documents and exports them to the desired format with ease and speed.
|
21
25
|
|
22
|
-
|
23
26
|
## Features
|
24
27
|
|
25
28
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
|
26
29
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
27
30
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
28
|
-
*
|
29
|
-
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
31
|
+
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
|
30
32
|
* 🔍 OCR support for scanned PDFs
|
31
33
|
* 💻 Simple and convenient CLI
|
32
34
|
|
33
35
|
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
|
34
36
|
|
37
|
+
### Coming soon
|
38
|
+
|
39
|
+
* ♾️ Equation & code extraction
|
40
|
+
* 📝 Metadata extraction, including title, authors, references & language
|
41
|
+
* 🦜🔗 Native LangChain extension
|
35
42
|
|
36
43
|
## Installation
|
37
44
|
|
@@ -57,16 +64,13 @@ result = converter.convert(source)
|
|
57
64
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
58
65
|
```
|
59
66
|
|
60
|
-
|
61
67
|
Check out [Getting started](https://ds4sd.github.io/docling/).
|
62
68
|
You will find lots of tuning options to leverage all the advanced capabilities.
|
63
69
|
|
64
|
-
|
65
70
|
## Get help and support
|
66
71
|
|
67
72
|
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
68
73
|
|
69
|
-
|
70
74
|
## Technical report
|
71
75
|
|
72
76
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -75,7 +79,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
|
|
75
79
|
|
76
80
|
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
77
81
|
|
78
|
-
|
79
82
|
## References
|
80
83
|
|
81
84
|
If you use Docling in your projects, please consider citing the following:
|
@@ -95,5 +98,9 @@ If you use Docling in your projects, please consider citing the following:
|
|
95
98
|
|
96
99
|
## License
|
97
100
|
|
98
|
-
The Docling codebase is under MIT license.
|
101
|
+
The Docling codebase is under MIT license.
|
99
102
|
For individual model usage, please refer to the model licenses found in the original packages.
|
103
|
+
|
104
|
+
## IBM ❤️ Open Source AI
|
105
|
+
|
106
|
+
Docling has been brought to you by IBM.
|
@@ -29,7 +29,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
29
29
|
self._dpage = parsed_page["pages"][0]
|
30
30
|
else:
|
31
31
|
_log.info(
|
32
|
-
f"An error
|
32
|
+
f"An error occurred when loading page {page_no} of document {document_hash}."
|
33
33
|
)
|
34
34
|
|
35
35
|
def is_valid(self) -> bool:
|
@@ -31,7 +31,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
31
31
|
self._dpage = parsed_page["pages"][0]
|
32
32
|
else:
|
33
33
|
_log.info(
|
34
|
-
f"An error
|
34
|
+
f"An error occurred when loading page {page_no} of document {document_hash}."
|
35
35
|
)
|
36
36
|
|
37
37
|
def is_valid(self) -> bool:
|
@@ -29,7 +29,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
29
29
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
30
30
|
except PdfiumError as e:
|
31
31
|
_log.info(
|
32
|
-
f"An exception
|
32
|
+
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
33
33
|
exc_info=True,
|
34
34
|
)
|
35
35
|
self.valid = False
|
@@ -2,6 +2,7 @@ import logging
|
|
2
2
|
from typing import Iterable
|
3
3
|
|
4
4
|
import numpy
|
5
|
+
import torch
|
5
6
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
7
|
|
7
8
|
from docling.datamodel.base_models import OcrCell, Page
|
@@ -30,8 +31,12 @@ class EasyOcrModel(BaseOcrModel):
|
|
30
31
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
31
32
|
)
|
32
33
|
|
34
|
+
use_gpu = (
|
35
|
+
False if torch.backends.mps.is_available() else self.options.use_gpu
|
36
|
+
)
|
33
37
|
self.reader = easyocr.Reader(
|
34
38
|
lang_list=self.options.lang,
|
39
|
+
gpu=use_gpu,
|
35
40
|
model_storage_directory=self.options.model_storage_directory,
|
36
41
|
download_enabled=self.options.download_enabled,
|
37
42
|
)
|
@@ -22,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
|
|
22
22
|
self.reader = None
|
23
23
|
|
24
24
|
if self.enabled:
|
25
|
-
|
25
|
+
install_errmsg = (
|
26
26
|
"tesserocr is not correctly installed. "
|
27
27
|
"Please install it via `pip install tesserocr` to use this OCR engine. "
|
28
|
-
"Note that tesserocr might have to be manually compiled for working with"
|
28
|
+
"Note that tesserocr might have to be manually compiled for working with "
|
29
29
|
"your Tesseract installation. The Docling documentation provides examples for it. "
|
30
|
-
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
30
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation: "
|
31
|
+
"https://ds4sd.github.io/docling/installation/"
|
31
32
|
)
|
33
|
+
missing_langs_errmsg = (
|
34
|
+
"tesserocr is not correctly configured. No language models have been detected. "
|
35
|
+
"Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
|
36
|
+
"You can find more information how to setup other OCR engines in Docling "
|
37
|
+
"documentation: "
|
38
|
+
"https://ds4sd.github.io/docling/installation/"
|
39
|
+
)
|
40
|
+
|
32
41
|
try:
|
33
42
|
import tesserocr
|
34
43
|
except ImportError:
|
35
|
-
raise ImportError(
|
36
|
-
|
44
|
+
raise ImportError(install_errmsg)
|
37
45
|
try:
|
38
46
|
tesseract_version = tesserocr.tesseract_version()
|
39
|
-
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
40
47
|
except:
|
41
|
-
raise ImportError(
|
48
|
+
raise ImportError(install_errmsg)
|
49
|
+
|
50
|
+
_, tesserocr_languages = tesserocr.get_languages()
|
51
|
+
if not tesserocr_languages:
|
52
|
+
raise ImportError(missing_langs_errmsg)
|
42
53
|
|
43
54
|
# Initialize the tesseractAPI
|
55
|
+
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
44
56
|
lang = "+".join(self.options.lang)
|
45
57
|
if self.options.path is not None:
|
46
58
|
self.reader = tesserocr.PyTessBaseAPI(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.4.0"
|
3
|
+
version = "2.4.2" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|