docling 1.13.0__tar.gz → 1.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.13.0 → docling-1.14.0}/PKG-INFO +71 -27
- {docling-1.13.0 → docling-1.14.0}/README.md +69 -25
- {docling-1.13.0 → docling-1.14.0}/docling/cli/main.py +24 -56
- {docling-1.13.0 → docling-1.14.0}/docling/datamodel/document.py +18 -8
- {docling-1.13.0 → docling-1.14.0}/docling/utils/export.py +1 -1
- {docling-1.13.0 → docling-1.14.0}/pyproject.toml +2 -2
- {docling-1.13.0 → docling-1.14.0}/LICENSE +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/backend/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/backend/abstract_backend.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/cli/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/datamodel/base_models.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/datamodel/settings.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/document_converter.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/base_ocr_model.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/ds_glm_model.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/easyocr_model.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/layout_model.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/models/table_structure_model.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/utils/__init__.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.13.0 → docling-1.14.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.13.0
|
3
|
+
Version: 1.14.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -22,7 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Provides-Extra: examples
|
23
23
|
Requires-Dist: certifi (>=2024.7.4)
|
24
24
|
Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
|
25
|
-
Requires-Dist: docling-core (>=1.
|
25
|
+
Requires-Dist: docling-core (>=1.5.0,<2.0.0)
|
26
26
|
Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
|
27
27
|
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
28
28
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -74,8 +74,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
74
74
|
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
75
75
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
76
76
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
77
|
-
* 🔍
|
77
|
+
* 🔍 Includes OCR support for scanned PDFs
|
78
78
|
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
|
79
|
+
* 💻 Provides a simple and convenient CLI
|
79
80
|
|
80
81
|
## Installation
|
81
82
|
|
@@ -87,31 +88,33 @@ pip install docling
|
|
87
88
|
> [!NOTE]
|
88
89
|
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
89
90
|
|
91
|
+
<details>
|
92
|
+
<summary><b>Alternative PyTorch distributions</b></summary>
|
90
93
|
|
91
|
-
|
94
|
+
The Docling models depend on the [PyTorch](https://pytorch.org/) library.
|
95
|
+
Depending on your architecture, you might want to use a different distribution of `torch`.
|
96
|
+
For example, you might want support for a different accelerator or for a cpu-only version.
|
97
|
+
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
|
92
98
|
|
93
|
-
|
94
|
-
|
95
|
-
For example, you might want support for a different accelerator or for a cpu-only version.
|
96
|
-
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
|
99
|
+
One common situation is the installation on Linux systems with cpu-only support.
|
100
|
+
In this case, we suggest the installation of Docling with the following options:
|
97
101
|
|
98
|
-
|
99
|
-
|
102
|
+
```bash
|
103
|
+
# Example for installing on the Linux cpu-only version
|
104
|
+
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
|
105
|
+
```
|
106
|
+
</details>
|
100
107
|
|
101
|
-
|
102
|
-
|
103
|
-
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
|
104
|
-
```
|
108
|
+
<details>
|
109
|
+
<summary><b>Docling development setup</b></summary>
|
105
110
|
|
111
|
+
To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
|
112
|
+
```bash
|
113
|
+
poetry install --all-extras
|
114
|
+
```
|
115
|
+
</details>
|
106
116
|
|
107
|
-
|
108
|
-
|
109
|
-
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
110
|
-
```bash
|
111
|
-
poetry install --all-extras
|
112
|
-
```
|
113
|
-
|
114
|
-
## Usage
|
117
|
+
## Getting started
|
115
118
|
|
116
119
|
### Convert a single document
|
117
120
|
|
@@ -123,6 +126,7 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
|
123
126
|
converter = DocumentConverter()
|
124
127
|
result = converter.convert_single(source)
|
125
128
|
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
129
|
+
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
|
126
130
|
```
|
127
131
|
|
128
132
|
### Convert a batch of documents
|
@@ -136,6 +140,51 @@ python examples/batch_convert.py
|
|
136
140
|
```
|
137
141
|
The output of the above command will be written to `./scratch`.
|
138
142
|
|
143
|
+
### CLI
|
144
|
+
|
145
|
+
You can also use Docling directly from your command line to convert individual files —be it local or by URL— or whole directories.
|
146
|
+
|
147
|
+
A simple example would look like this:
|
148
|
+
```console
|
149
|
+
docling https://arxiv.org/pdf/2206.01062
|
150
|
+
```
|
151
|
+
|
152
|
+
To see all available options (export formats etc.) run `docling --help`.
|
153
|
+
|
154
|
+
<details>
|
155
|
+
<summary><b>CLI reference</b></summary>
|
156
|
+
|
157
|
+
Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
|
158
|
+
|
159
|
+
```console
|
160
|
+
$ docling --help
|
161
|
+
|
162
|
+
Usage: docling [OPTIONS] source
|
163
|
+
|
164
|
+
╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
165
|
+
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
|
166
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
167
|
+
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
168
|
+
│ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
|
169
|
+
│ --md --no-md If enabled the document is exported as Markdown. [default: md] │
|
170
|
+
│ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
|
171
|
+
│ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
|
172
|
+
│ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
|
173
|
+
│ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
|
174
|
+
│ --output PATH Output directory where results are saved. [default: .] │
|
175
|
+
│ --version Show version information. │
|
176
|
+
│ --help Show this message and exit. │
|
177
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
178
|
+
```
|
179
|
+
</details>
|
180
|
+
|
181
|
+
### RAG
|
182
|
+
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
183
|
+
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
184
|
+
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
185
|
+
|
186
|
+
## Advanced features
|
187
|
+
|
139
188
|
### Adjust pipeline features
|
140
189
|
|
141
190
|
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
@@ -194,11 +243,6 @@ results = doc_converter.convert(conv_input)
|
|
194
243
|
|
195
244
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
196
245
|
|
197
|
-
### RAG
|
198
|
-
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
199
|
-
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
200
|
-
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
201
|
-
|
202
246
|
## Technical report
|
203
247
|
|
204
248
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -22,8 +22,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
22
22
|
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
23
23
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
24
24
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
25
|
-
* 🔍
|
25
|
+
* 🔍 Includes OCR support for scanned PDFs
|
26
26
|
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
|
27
|
+
* 💻 Provides a simple and convenient CLI
|
27
28
|
|
28
29
|
## Installation
|
29
30
|
|
@@ -35,31 +36,33 @@ pip install docling
|
|
35
36
|
> [!NOTE]
|
36
37
|
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
37
38
|
|
39
|
+
<details>
|
40
|
+
<summary><b>Alternative PyTorch distributions</b></summary>
|
38
41
|
|
39
|
-
|
42
|
+
The Docling models depend on the [PyTorch](https://pytorch.org/) library.
|
43
|
+
Depending on your architecture, you might want to use a different distribution of `torch`.
|
44
|
+
For example, you might want support for a different accelerator or for a cpu-only version.
|
45
|
+
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
|
40
46
|
|
41
|
-
|
42
|
-
|
43
|
-
For example, you might want support for a different accelerator or for a cpu-only version.
|
44
|
-
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
|
47
|
+
One common situation is the installation on Linux systems with cpu-only support.
|
48
|
+
In this case, we suggest the installation of Docling with the following options:
|
45
49
|
|
46
|
-
|
47
|
-
|
50
|
+
```bash
|
51
|
+
# Example for installing on the Linux cpu-only version
|
52
|
+
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
|
53
|
+
```
|
54
|
+
</details>
|
48
55
|
|
49
|
-
|
50
|
-
|
51
|
-
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
|
52
|
-
```
|
56
|
+
<details>
|
57
|
+
<summary><b>Docling development setup</b></summary>
|
53
58
|
|
59
|
+
To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
|
60
|
+
```bash
|
61
|
+
poetry install --all-extras
|
62
|
+
```
|
63
|
+
</details>
|
54
64
|
|
55
|
-
|
56
|
-
|
57
|
-
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
58
|
-
```bash
|
59
|
-
poetry install --all-extras
|
60
|
-
```
|
61
|
-
|
62
|
-
## Usage
|
65
|
+
## Getting started
|
63
66
|
|
64
67
|
### Convert a single document
|
65
68
|
|
@@ -71,6 +74,7 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
|
71
74
|
converter = DocumentConverter()
|
72
75
|
result = converter.convert_single(source)
|
73
76
|
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
77
|
+
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
|
74
78
|
```
|
75
79
|
|
76
80
|
### Convert a batch of documents
|
@@ -84,6 +88,51 @@ python examples/batch_convert.py
|
|
84
88
|
```
|
85
89
|
The output of the above command will be written to `./scratch`.
|
86
90
|
|
91
|
+
### CLI
|
92
|
+
|
93
|
+
You can also use Docling directly from your command line to convert individual files —be it local or by URL— or whole directories.
|
94
|
+
|
95
|
+
A simple example would look like this:
|
96
|
+
```console
|
97
|
+
docling https://arxiv.org/pdf/2206.01062
|
98
|
+
```
|
99
|
+
|
100
|
+
To see all available options (export formats etc.) run `docling --help`.
|
101
|
+
|
102
|
+
<details>
|
103
|
+
<summary><b>CLI reference</b></summary>
|
104
|
+
|
105
|
+
Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
|
106
|
+
|
107
|
+
```console
|
108
|
+
$ docling --help
|
109
|
+
|
110
|
+
Usage: docling [OPTIONS] source
|
111
|
+
|
112
|
+
╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
113
|
+
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
|
114
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
115
|
+
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
116
|
+
│ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
|
117
|
+
│ --md --no-md If enabled the document is exported as Markdown. [default: md] │
|
118
|
+
│ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
|
119
|
+
│ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
|
120
|
+
│ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
|
121
|
+
│ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
|
122
|
+
│ --output PATH Output directory where results are saved. [default: .] │
|
123
|
+
│ --version Show version information. │
|
124
|
+
│ --help Show this message and exit. │
|
125
|
+
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
126
|
+
```
|
127
|
+
</details>
|
128
|
+
|
129
|
+
### RAG
|
130
|
+
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
131
|
+
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
132
|
+
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
133
|
+
|
134
|
+
## Advanced features
|
135
|
+
|
87
136
|
### Adjust pipeline features
|
88
137
|
|
89
138
|
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
@@ -142,11 +191,6 @@ results = doc_converter.convert(conv_input)
|
|
142
191
|
|
143
192
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
144
193
|
|
145
|
-
### RAG
|
146
|
-
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
|
147
|
-
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
|
148
|
-
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
|
149
|
-
|
150
194
|
## Technical report
|
151
195
|
|
152
196
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -8,7 +8,7 @@ from pathlib import Path
|
|
8
8
|
from typing import Annotated, Iterable, List, Optional
|
9
9
|
|
10
10
|
import typer
|
11
|
-
from
|
11
|
+
from docling_core.utils.file import resolve_file_source
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
14
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
@@ -109,11 +109,11 @@ def export_documents(
|
|
109
109
|
@app.command(no_args_is_help=True)
|
110
110
|
def convert(
|
111
111
|
input_sources: Annotated[
|
112
|
-
List[
|
112
|
+
List[str],
|
113
113
|
typer.Argument(
|
114
114
|
...,
|
115
115
|
metavar="source",
|
116
|
-
help="PDF files to convert.
|
116
|
+
help="PDF files to convert. Can be local file / directory paths or URL.",
|
117
117
|
),
|
118
118
|
],
|
119
119
|
export_json: Annotated[
|
@@ -167,7 +167,8 @@ def convert(
|
|
167
167
|
logging.basicConfig(level=logging.INFO)
|
168
168
|
|
169
169
|
input_doc_paths: List[Path] = []
|
170
|
-
for
|
170
|
+
for src in input_sources:
|
171
|
+
source = resolve_file_source(source=src)
|
171
172
|
if not source.exists():
|
172
173
|
err_console.print(
|
173
174
|
f"[red]Error: The input file {source} does not exist.[/red]"
|
@@ -179,58 +180,25 @@ def convert(
|
|
179
180
|
else:
|
180
181
|
input_doc_paths.append(source)
|
181
182
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
pipeline_options = PipelineOptions()
|
202
|
-
pipeline_options.do_ocr = False
|
203
|
-
pipeline_options.do_table_structure = True
|
204
|
-
pipeline_options.table_structure_options.do_cell_matching = True
|
205
|
-
|
206
|
-
doc_converter = DocumentConverter(
|
207
|
-
pipeline_options=pipeline_options,
|
208
|
-
pdf_backend=PyPdfiumDocumentBackend,
|
209
|
-
)
|
210
|
-
|
211
|
-
elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
|
212
|
-
pipeline_options = PipelineOptions()
|
213
|
-
pipeline_options.do_ocr = False
|
214
|
-
pipeline_options.do_table_structure = True
|
215
|
-
pipeline_options.table_structure_options.do_cell_matching = True
|
216
|
-
|
217
|
-
doc_converter = DocumentConverter(
|
218
|
-
pipeline_options=pipeline_options,
|
219
|
-
pdf_backend=DoclingParseDocumentBackend,
|
220
|
-
)
|
221
|
-
|
222
|
-
elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
|
223
|
-
pipeline_options = PipelineOptions()
|
224
|
-
pipeline_options.do_ocr = True
|
225
|
-
pipeline_options.do_table_structure = True
|
226
|
-
pipeline_options.table_structure_options.do_cell_matching = True
|
227
|
-
|
228
|
-
doc_converter = DocumentConverter(
|
229
|
-
pipeline_options=pipeline_options,
|
230
|
-
pdf_backend=DoclingParseDocumentBackend,
|
231
|
-
)
|
232
|
-
|
233
|
-
###########################################################################
|
183
|
+
match backend:
|
184
|
+
case Backend.PYPDFIUM2:
|
185
|
+
do_cell_matching = ocr # only do cell matching when OCR enabled
|
186
|
+
pdf_backend = PyPdfiumDocumentBackend
|
187
|
+
case Backend.DOCLING:
|
188
|
+
do_cell_matching = True
|
189
|
+
pdf_backend = DoclingParseDocumentBackend
|
190
|
+
case _:
|
191
|
+
raise RuntimeError(f"Unexpected backend type {backend}")
|
192
|
+
|
193
|
+
pipeline_options = PipelineOptions(
|
194
|
+
do_ocr=ocr,
|
195
|
+
do_table_structure=True,
|
196
|
+
)
|
197
|
+
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
198
|
+
doc_converter = DocumentConverter(
|
199
|
+
pipeline_options=pipeline_options,
|
200
|
+
pdf_backend=pdf_backend,
|
201
|
+
)
|
234
202
|
|
235
203
|
# Define input files
|
236
204
|
input = DocumentConversionInput.from_paths(input_doc_paths)
|
@@ -368,20 +368,30 @@ class ConvertedDocument(BaseModel):
|
|
368
368
|
"table",
|
369
369
|
"figure",
|
370
370
|
],
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
371
|
+
xsize: int = 100,
|
372
|
+
ysize: int = 100,
|
373
|
+
add_location: bool = True,
|
374
|
+
add_content: bool = True,
|
375
|
+
add_page_index: bool = True,
|
376
|
+
# table specific flags
|
377
|
+
add_table_cell_location: bool = False,
|
378
|
+
add_table_cell_label: bool = True,
|
379
|
+
add_table_cell_text: bool = True,
|
375
380
|
) -> str:
|
376
381
|
return self.output.export_to_document_tokens(
|
377
382
|
delim=delim,
|
378
383
|
main_text_start=main_text_start,
|
379
384
|
main_text_stop=main_text_stop,
|
380
385
|
main_text_labels=main_text_labels,
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
386
|
+
xsize=xsize,
|
387
|
+
ysize=ysize,
|
388
|
+
add_location=add_location,
|
389
|
+
add_content=add_content,
|
390
|
+
add_page_index=add_page_index,
|
391
|
+
# table specific flags
|
392
|
+
add_table_cell_location=add_table_cell_location,
|
393
|
+
add_table_cell_label=add_table_cell_label,
|
394
|
+
add_table_cell_text=add_table_cell_text,
|
385
395
|
)
|
386
396
|
|
387
397
|
def render_element_images(
|
@@ -111,7 +111,7 @@ def generate_multimodal_pages(
|
|
111
111
|
)
|
112
112
|
# No page-tagging since we only do 1 page at the time
|
113
113
|
content_dt = doc.export_to_document_tokens(
|
114
|
-
main_text_start=start_ix, main_text_stop=end_ix,
|
114
|
+
main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
|
115
115
|
)
|
116
116
|
|
117
117
|
return content_text, content_md, content_dt, page_cells, page_segments, page
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.13.0"
|
3
|
+
version = "1.14.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
|
|
23
23
|
[tool.poetry.dependencies]
|
24
24
|
python = "^3.10"
|
25
25
|
pydantic = "^2.0.0"
|
26
|
-
docling-core = "^1.
|
26
|
+
docling-core = "^1.5.0"
|
27
27
|
docling-ibm-models = "^1.2.0"
|
28
28
|
deepsearch-glm = "^0.21.1"
|
29
29
|
filetype = "^1.2.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|