docling-1.13.1-py3-none-any.whl → docling-1.14.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +24 -56
- {docling-1.13.1.dist-info → docling-1.14.0.dist-info}/METADATA +69 -27
- {docling-1.13.1.dist-info → docling-1.14.0.dist-info}/RECORD +6 -6
- {docling-1.13.1.dist-info → docling-1.14.0.dist-info}/LICENSE +0 -0
- {docling-1.13.1.dist-info → docling-1.14.0.dist-info}/WHEEL +0 -0
- {docling-1.13.1.dist-info → docling-1.14.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
|
|
8
8
|
from typing import Annotated, Iterable, List, Optional
|
9
9
|
|
10
10
|
import typer
|
11
|
-
from
|
11
|
+
from docling_core.utils.file import resolve_file_source
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
14
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
@@ -109,11 +109,11 @@ def export_documents(
|
|
109
109
|
@app.command(no_args_is_help=True)
|
110
110
|
def convert(
|
111
111
|
input_sources: Annotated[
|
112
|
-
List[
|
112
|
+
List[str],
|
113
113
|
typer.Argument(
|
114
114
|
...,
|
115
115
|
metavar="source",
|
116
|
-
help="PDF files to convert.
|
116
|
+
help="PDF files to convert. Can be local file / directory paths or URL.",
|
117
117
|
),
|
118
118
|
],
|
119
119
|
export_json: Annotated[
|
@@ -167,7 +167,8 @@ def convert(
|
|
167
167
|
logging.basicConfig(level=logging.INFO)
|
168
168
|
|
169
169
|
input_doc_paths: List[Path] = []
|
170
|
-
for
|
170
|
+
for src in input_sources:
|
171
|
+
source = resolve_file_source(source=src)
|
171
172
|
if not source.exists():
|
172
173
|
err_console.print(
|
173
174
|
f"[red]Error: The input file {source} does not exist.[/red]"
|
@@ -179,58 +180,25 @@ def convert(
|
|
179
180
|
else:
|
180
181
|
input_doc_paths.append(source)
|
181
182
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
pipeline_options = PipelineOptions()
|
202
|
-
pipeline_options.do_ocr = False
|
203
|
-
pipeline_options.do_table_structure = True
|
204
|
-
pipeline_options.table_structure_options.do_cell_matching = True
|
205
|
-
|
206
|
-
doc_converter = DocumentConverter(
|
207
|
-
pipeline_options=pipeline_options,
|
208
|
-
pdf_backend=PyPdfiumDocumentBackend,
|
209
|
-
)
|
210
|
-
|
211
|
-
elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
|
212
|
-
pipeline_options = PipelineOptions()
|
213
|
-
pipeline_options.do_ocr = False
|
214
|
-
pipeline_options.do_table_structure = True
|
215
|
-
pipeline_options.table_structure_options.do_cell_matching = True
|
216
|
-
|
217
|
-
doc_converter = DocumentConverter(
|
218
|
-
pipeline_options=pipeline_options,
|
219
|
-
pdf_backend=DoclingParseDocumentBackend,
|
220
|
-
)
|
221
|
-
|
222
|
-
elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
|
223
|
-
pipeline_options = PipelineOptions()
|
224
|
-
pipeline_options.do_ocr = True
|
225
|
-
pipeline_options.do_table_structure = True
|
226
|
-
pipeline_options.table_structure_options.do_cell_matching = True
|
227
|
-
|
228
|
-
doc_converter = DocumentConverter(
|
229
|
-
pipeline_options=pipeline_options,
|
230
|
-
pdf_backend=DoclingParseDocumentBackend,
|
231
|
-
)
|
232
|
-
|
233
|
-
###########################################################################
|
183
|
+
match backend:
|
184
|
+
case Backend.PYPDFIUM2:
|
185
|
+
do_cell_matching = ocr # only do cell matching when OCR enabled
|
186
|
+
pdf_backend = PyPdfiumDocumentBackend
|
187
|
+
case Backend.DOCLING:
|
188
|
+
do_cell_matching = True
|
189
|
+
pdf_backend = DoclingParseDocumentBackend
|
190
|
+
case _:
|
191
|
+
raise RuntimeError(f"Unexpected backend type {backend}")
|
192
|
+
|
193
|
+
pipeline_options = PipelineOptions(
|
194
|
+
do_ocr=ocr,
|
195
|
+
do_table_structure=True,
|
196
|
+
)
|
197
|
+
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
198
|
+
doc_converter = DocumentConverter(
|
199
|
+
pipeline_options=pipeline_options,
|
200
|
+
pdf_backend=pdf_backend,
|
201
|
+
)
|
234
202
|
|
235
203
|
# Define input files
|
236
204
|
input = DocumentConversionInput.from_paths(input_doc_paths)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.13.1
|
3
|
+
Version: 1.14.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -74,8 +74,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
74
74
|
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
75
75
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
76
76
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
77
|
-
* 🔍
|
77
|
+
* 🔍 Includes OCR support for scanned PDFs
|
78
78
|
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
|
79
|
+
* 💻 Provides a simple and convenient CLI
|
79
80
|
|
80
81
|
## Installation
|
81
82
|
|
@@ -87,31 +88,33 @@ pip install docling
|
|
87
88
|
> [!NOTE]
|
88
89
|
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
89
90
|
|
91
|
+
<details>
|
92
|
+
<summary><b>Alternative PyTorch distributions</b></summary>
|
90
93
|
|
91
|
-
|
94
|
+
The Docling models depend on the [PyTorch](https://pytorch.org/) library.
|
95
|
+
Depending on your architecture, you might want to use a different distribution of `torch`.
|
96
|
+
For example, you might want support for different accelerator or for a cpu-only version.
|
97
|
+
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
|
92
98
|
|
93
|
-
|
94
|
-
|
95
|
-
For example, you might want support for different accelerator or for a cpu-only version.
|
96
|
-
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
|
99
|
+
One common situation is the installation on Linux systems with cpu-only support.
|
100
|
+
In this case, we suggest the installation of Docling with the following options
|
97
101
|
|
98
|
-
|
99
|
-
|
102
|
+
```bash
|
103
|
+
# Example for installing on the Linux cpu-only version
|
104
|
+
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
|
105
|
+
```
|
106
|
+
</details>
|
100
107
|
|
101
|
-
|
102
|
-
|
103
|
-
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
|
104
|
-
```
|
108
|
+
<details>
|
109
|
+
<summary><b>Docling development setup</b></summary>
|
105
110
|
|
111
|
+
To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
|
112
|
+
```bash
|
113
|
+
poetry install --all-extras
|
114
|
+
```
|
115
|
+
</details>
|
106
116
|
|
107
|
-
|
108
|
-
|
109
|
-
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
110
|
-
```bash
|
111
|
-
poetry install --all-extras
|
112
|
-
```
|
113
|
-
|
114
|
-
## Usage
|
117
|
+
## Getting started
|
115
118
|
|
116
119
|
### Convert a single document
|
117
120
|
|
@@ -122,7 +125,6 @@ from docling.document_converter import DocumentConverter
|
|
122
125
|
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
123
126
|
converter = DocumentConverter()
|
124
127
|
result = converter.convert_single(source)
|
125
|
-
|
126
128
|
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
127
129
|
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
|
128
130
|
```
|
@@ -138,6 +140,51 @@ python examples/batch_convert.py
|
|
138
140
|
```
|
139
141
|
The output of the above command will be written to `./scratch`.
|
140
142
|
|
143
|
+
### CLI
|
144
|
+
|
145
|
+
You can also use Docling directly from your command line to convert individual files —be it local or by URL— or whole directories.
|
146
|
+
|
147
|
+
A simple example would look like this:
|
148
|
+
```console
|
149
|
+
docling https://arxiv.org/pdf/2206.01062
|
150
|
+
```
|
151
|
+
|
152
|
+
To see all available options (export formats etc.) run `docling --help`.
|
153
|
+
|
154
|
+
<details>
|
155
|
+
<summary><b>CLI reference</b></summary>
|
156
|
+
|
157
|
+
Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
|
158
|
+
|
159
|
+
```console
|
160
|
+
$ docling --help
|
161
|
+
|
162
|
+
Usage: docling [OPTIONS] source
|
163
|
+
|
164
|
+
╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
165
|
+
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
|
166
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
167
|
+
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
168
|
+
│ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
|
169
|
+
│ --md --no-md If enabled the document is exported as Markdown. [default: md] │
|
170
|
+
│ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
|
171
|
+
│ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
|
172
|
+
│ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
|
173
|
+
│ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
|
174
|
+
│ --output PATH Output directory where results are saved. [default: .] │
|
175
|
+
│ --version Show version information. │
|
176
|
+
│ --help Show this message and exit. │
|
177
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
178
|
+
```
|
179
|
+
</details>
|
180
|
+
|
181
|
+
### RAG
|
182
|
+
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
183
|
+
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
184
|
+
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
185
|
+
|
186
|
+
## Advanced features
|
187
|
+
|
141
188
|
### Adjust pipeline features
|
142
189
|
|
143
190
|
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
@@ -196,11 +243,6 @@ results = doc_converter.convert(conv_input)
|
|
196
243
|
|
197
244
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
198
245
|
|
199
|
-
### RAG
|
200
|
-
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
201
|
-
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
202
|
-
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
203
|
-
|
204
246
|
## Technical report
|
205
247
|
|
206
248
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -4,7 +4,7 @@ docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdf
|
|
4
4
|
docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
|
5
5
|
docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
|
6
6
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/cli/main.py,sha256=
|
7
|
+
docling/cli/main.py,sha256=gJBxgZIGza0UBUAPP8pVFp_Ma3rzB9CCw-w3Bs5wieE,7121
|
8
8
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
|
10
10
|
docling/datamodel/document.py,sha256=7HnPXTin5r_XvIxbqPe7uV6keIr90RhXGGo22uHbTeA,16064
|
@@ -24,8 +24,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
|
25
25
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
26
26
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
27
|
-
docling-1.13.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
28
|
-
docling-1.
|
29
|
-
docling-1.13.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
30
|
-
docling-1.13.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
31
|
-
docling-1.13.1.dist-info/RECORD,,
|
27
|
+
docling-1.14.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
28
|
+
docling-1.14.0.dist-info/METADATA,sha256=w4Awxrivz6rQfeoHVsH3KYZKQ-Gfovj6OC-cqSb0Kb8,13208
|
29
|
+
docling-1.14.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
30
|
+
docling-1.14.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
31
|
+
docling-1.14.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|