docling 1.13.1__tar.gz → 1.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {docling-1.13.1 → docling-1.14.0}/PKG-INFO +69 -27
  2. {docling-1.13.1 → docling-1.14.0}/README.md +68 -26
  3. {docling-1.13.1 → docling-1.14.0}/docling/cli/main.py +24 -56
  4. {docling-1.13.1 → docling-1.14.0}/pyproject.toml +1 -1
  5. {docling-1.13.1 → docling-1.14.0}/LICENSE +0 -0
  6. {docling-1.13.1 → docling-1.14.0}/docling/__init__.py +0 -0
  7. {docling-1.13.1 → docling-1.14.0}/docling/backend/__init__.py +0 -0
  8. {docling-1.13.1 → docling-1.14.0}/docling/backend/abstract_backend.py +0 -0
  9. {docling-1.13.1 → docling-1.14.0}/docling/backend/docling_parse_backend.py +0 -0
  10. {docling-1.13.1 → docling-1.14.0}/docling/backend/pypdfium2_backend.py +0 -0
  11. {docling-1.13.1 → docling-1.14.0}/docling/cli/__init__.py +0 -0
  12. {docling-1.13.1 → docling-1.14.0}/docling/datamodel/__init__.py +0 -0
  13. {docling-1.13.1 → docling-1.14.0}/docling/datamodel/base_models.py +0 -0
  14. {docling-1.13.1 → docling-1.14.0}/docling/datamodel/document.py +0 -0
  15. {docling-1.13.1 → docling-1.14.0}/docling/datamodel/settings.py +0 -0
  16. {docling-1.13.1 → docling-1.14.0}/docling/document_converter.py +0 -0
  17. {docling-1.13.1 → docling-1.14.0}/docling/models/__init__.py +0 -0
  18. {docling-1.13.1 → docling-1.14.0}/docling/models/base_ocr_model.py +0 -0
  19. {docling-1.13.1 → docling-1.14.0}/docling/models/ds_glm_model.py +0 -0
  20. {docling-1.13.1 → docling-1.14.0}/docling/models/easyocr_model.py +0 -0
  21. {docling-1.13.1 → docling-1.14.0}/docling/models/layout_model.py +0 -0
  22. {docling-1.13.1 → docling-1.14.0}/docling/models/page_assemble_model.py +0 -0
  23. {docling-1.13.1 → docling-1.14.0}/docling/models/table_structure_model.py +0 -0
  24. {docling-1.13.1 → docling-1.14.0}/docling/pipeline/__init__.py +0 -0
  25. {docling-1.13.1 → docling-1.14.0}/docling/pipeline/base_model_pipeline.py +0 -0
  26. {docling-1.13.1 → docling-1.14.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  27. {docling-1.13.1 → docling-1.14.0}/docling/utils/__init__.py +0 -0
  28. {docling-1.13.1 → docling-1.14.0}/docling/utils/export.py +0 -0
  29. {docling-1.13.1 → docling-1.14.0}/docling/utils/layout_utils.py +0 -0
  30. {docling-1.13.1 → docling-1.14.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.13.1
3
+ Version: 1.14.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -74,8 +74,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
74
74
  * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
75
75
  * 📑 Understands detailed page layout, reading order and recovers table structures
76
76
  * 📝 Extracts metadata from the document, such as title, authors, references and language
77
- * 🔍 Optionally applies OCR (use with scanned PDFs)
77
+ * 🔍 Includes OCR support for scanned PDFs
78
78
  * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
79
+ * 💻 Provides a simple and convenient CLI
79
80
 
80
81
  ## Installation
81
82
 
@@ -87,31 +88,33 @@ pip install docling
87
88
  > [!NOTE]
88
89
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
89
90
 
91
+ <details>
92
+ <summary><b>Alternative PyTorch distributions</b></summary>
90
93
 
91
- ### Use alternative PyTorch distributions
94
+ The Docling models depend on the [PyTorch](https://pytorch.org/) library.
95
+ Depending on your architecture, you might want to use a different distribution of `torch`.
96
+ For example, you might want support for a different accelerator or for a cpu-only version.
97
+ All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
92
98
 
93
- The Docling models depend on the [PyTorch](https://pytorch.org/) library.
94
- Depending on your architecture, you might want to use a different distribution of `torch`.
95
- For example, you might want support for different accelerator or for a cpu-only version.
96
- All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
99
+ One common situation is the installation on Linux systems with cpu-only support.
100
+ In this case, we suggest the installation of Docling with the following options
97
101
 
98
- One common situation is the installation on Linux systems with cpu-only support.
99
- In this case, we suggest the installation of Docling with the following options
102
+ ```bash
103
+ # Example for installing on the Linux cpu-only version
104
+ pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
105
+ ```
106
+ </details>
100
107
 
101
- ```bash
102
- # Example for installing on the Linux cpu-only version
103
- pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
104
- ```
108
+ <details>
109
+ <summary><b>Docling development setup</b></summary>
105
110
 
111
+ To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
112
+ ```bash
113
+ poetry install --all-extras
114
+ ```
115
+ </details>
106
116
 
107
- ### Development setup
108
-
109
- To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
110
- ```bash
111
- poetry install --all-extras
112
- ```
113
-
114
- ## Usage
117
+ ## Getting started
115
118
 
116
119
  ### Convert a single document
117
120
 
@@ -122,7 +125,6 @@ from docling.document_converter import DocumentConverter
122
125
  source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
123
126
  converter = DocumentConverter()
124
127
  result = converter.convert_single(source)
125
-
126
128
  print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
127
129
  print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
128
130
  ```
@@ -138,6 +140,51 @@ python examples/batch_convert.py
138
140
  ```
139
141
  The output of the above command will be written to `./scratch`.
140
142
 
143
+ ### CLI
144
+
145
+ You can also use Docling directly from your command line to convert individual files (local or by URL) or whole directories.
146
+
147
+ A simple example would look like this:
148
+ ```console
149
+ docling https://arxiv.org/pdf/2206.01062
150
+ ```
151
+
152
+ To see all available options (export formats etc.) run `docling --help`.
153
+
154
+ <details>
155
+ <summary><b>CLI reference</b></summary>
156
+
157
+ Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
158
+
159
+ ```console
160
+ $ docling --help
161
+
162
+ Usage: docling [OPTIONS] source
163
+
164
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
165
+ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
166
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
167
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
168
+ │ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
169
+ │ --md --no-md If enabled the document is exported as Markdown. [default: md] │
170
+ │ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
171
+ │ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
172
+ │ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
173
+ │ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
174
+ │ --output PATH Output directory where results are saved. [default: .] │
175
+ │ --version Show version information. │
176
+ │ --help Show this message and exit. │
177
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
178
+ ```
179
+ </details>
180
+
181
+ ### RAG
182
+ Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
183
+ - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
184
+ - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
185
+
186
+ ## Advanced features
187
+
141
188
  ### Adjust pipeline features
142
189
 
143
190
  The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -196,11 +243,6 @@ results = doc_converter.convert(conv_input)
196
243
 
197
244
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
198
245
 
199
- ### RAG
200
- Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
201
- - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
202
- - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
203
-
204
246
  ## Technical report
205
247
 
206
248
  For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -22,8 +22,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
22
22
  * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
23
23
  * 📑 Understands detailed page layout, reading order and recovers table structures
24
24
  * 📝 Extracts metadata from the document, such as title, authors, references and language
25
- * 🔍 Optionally applies OCR (use with scanned PDFs)
25
+ * 🔍 Includes OCR support for scanned PDFs
26
26
  * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
27
+ * 💻 Provides a simple and convenient CLI
27
28
 
28
29
  ## Installation
29
30
 
@@ -35,31 +36,33 @@ pip install docling
35
36
  > [!NOTE]
36
37
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
37
38
 
39
+ <details>
40
+ <summary><b>Alternative PyTorch distributions</b></summary>
38
41
 
39
- ### Use alternative PyTorch distributions
42
+ The Docling models depend on the [PyTorch](https://pytorch.org/) library.
43
+ Depending on your architecture, you might want to use a different distribution of `torch`.
44
+ For example, you might want support for a different accelerator or for a cpu-only version.
45
+ All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
40
46
 
41
- The Docling models depend on the [PyTorch](https://pytorch.org/) library.
42
- Depending on your architecture, you might want to use a different distribution of `torch`.
43
- For example, you might want support for different accelerator or for a cpu-only version.
44
- All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
47
+ One common situation is the installation on Linux systems with cpu-only support.
48
+ In this case, we suggest the installation of Docling with the following options
45
49
 
46
- One common situation is the installation on Linux systems with cpu-only support.
47
- In this case, we suggest the installation of Docling with the following options
50
+ ```bash
51
+ # Example for installing on the Linux cpu-only version
52
+ pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
53
+ ```
54
+ </details>
48
55
 
49
- ```bash
50
- # Example for installing on the Linux cpu-only version
51
- pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
52
- ```
56
+ <details>
57
+ <summary><b>Docling development setup</b></summary>
53
58
 
59
+ To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
60
+ ```bash
61
+ poetry install --all-extras
62
+ ```
63
+ </details>
54
64
 
55
- ### Development setup
56
-
57
- To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
58
- ```bash
59
- poetry install --all-extras
60
- ```
61
-
62
- ## Usage
65
+ ## Getting started
63
66
 
64
67
  ### Convert a single document
65
68
 
@@ -70,7 +73,6 @@ from docling.document_converter import DocumentConverter
70
73
  source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
71
74
  converter = DocumentConverter()
72
75
  result = converter.convert_single(source)
73
-
74
76
  print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
75
77
  print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
76
78
  ```
@@ -86,6 +88,51 @@ python examples/batch_convert.py
86
88
  ```
87
89
  The output of the above command will be written to `./scratch`.
88
90
 
91
+ ### CLI
92
+
93
+ You can also use Docling directly from your command line to convert individual files (local or by URL) or whole directories.
94
+
95
+ A simple example would look like this:
96
+ ```console
97
+ docling https://arxiv.org/pdf/2206.01062
98
+ ```
99
+
100
+ To see all available options (export formats etc.) run `docling --help`.
101
+
102
+ <details>
103
+ <summary><b>CLI reference</b></summary>
104
+
105
+ Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
106
+
107
+ ```console
108
+ $ docling --help
109
+
110
+ Usage: docling [OPTIONS] source
111
+
112
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
113
+ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
114
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
115
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
116
+ │ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
117
+ │ --md --no-md If enabled the document is exported as Markdown. [default: md] │
118
+ │ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
119
+ │ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
120
+ │ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
121
+ │ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
122
+ │ --output PATH Output directory where results are saved. [default: .] │
123
+ │ --version Show version information. │
124
+ │ --help Show this message and exit. │
125
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
126
+ ```
127
+ </details>
128
+
129
+ ### RAG
130
+ Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
131
+ - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
132
+ - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
133
+
134
+ ## Advanced features
135
+
89
136
  ### Adjust pipeline features
90
137
 
91
138
  The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -144,11 +191,6 @@ results = doc_converter.convert(conv_input)
144
191
 
145
192
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
146
193
 
147
- ### RAG
148
- Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
149
- - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
150
- - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
151
-
152
194
  ## Technical report
153
195
 
154
196
  For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
  from typing import Annotated, Iterable, List, Optional
9
9
 
10
10
  import typer
11
- from pydantic import AnyUrl
11
+ from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
14
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -109,11 +109,11 @@ def export_documents(
109
109
  @app.command(no_args_is_help=True)
110
110
  def convert(
111
111
  input_sources: Annotated[
112
- List[Path],
112
+ List[str],
113
113
  typer.Argument(
114
114
  ...,
115
115
  metavar="source",
116
- help="PDF files to convert. Directories are also accepted.",
116
+ help="PDF files to convert. Can be local file / directory paths or URL.",
117
117
  ),
118
118
  ],
119
119
  export_json: Annotated[
@@ -167,7 +167,8 @@ def convert(
167
167
  logging.basicConfig(level=logging.INFO)
168
168
 
169
169
  input_doc_paths: List[Path] = []
170
- for source in input_sources:
170
+ for src in input_sources:
171
+ source = resolve_file_source(source=src)
171
172
  if not source.exists():
172
173
  err_console.print(
173
174
  f"[red]Error: The input file {source} does not exist.[/red]"
@@ -179,58 +180,25 @@ def convert(
179
180
  else:
180
181
  input_doc_paths.append(source)
181
182
 
182
- ###########################################################################
183
-
184
- # The following sections contain a combination of PipelineOptions
185
- # and PDF Backends for various configurations.
186
- # Uncomment one section at the time to see the differences in the output.
187
-
188
- doc_converter = None
189
- if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
190
- pipeline_options = PipelineOptions()
191
- pipeline_options.do_ocr = False
192
- pipeline_options.do_table_structure = True
193
- pipeline_options.table_structure_options.do_cell_matching = False
194
-
195
- doc_converter = DocumentConverter(
196
- pipeline_options=pipeline_options,
197
- pdf_backend=PyPdfiumDocumentBackend,
198
- )
199
-
200
- elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
201
- pipeline_options = PipelineOptions()
202
- pipeline_options.do_ocr = False
203
- pipeline_options.do_table_structure = True
204
- pipeline_options.table_structure_options.do_cell_matching = True
205
-
206
- doc_converter = DocumentConverter(
207
- pipeline_options=pipeline_options,
208
- pdf_backend=PyPdfiumDocumentBackend,
209
- )
210
-
211
- elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
212
- pipeline_options = PipelineOptions()
213
- pipeline_options.do_ocr = False
214
- pipeline_options.do_table_structure = True
215
- pipeline_options.table_structure_options.do_cell_matching = True
216
-
217
- doc_converter = DocumentConverter(
218
- pipeline_options=pipeline_options,
219
- pdf_backend=DoclingParseDocumentBackend,
220
- )
221
-
222
- elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
223
- pipeline_options = PipelineOptions()
224
- pipeline_options.do_ocr = True
225
- pipeline_options.do_table_structure = True
226
- pipeline_options.table_structure_options.do_cell_matching = True
227
-
228
- doc_converter = DocumentConverter(
229
- pipeline_options=pipeline_options,
230
- pdf_backend=DoclingParseDocumentBackend,
231
- )
232
-
233
- ###########################################################################
183
+ match backend:
184
+ case Backend.PYPDFIUM2:
185
+ do_cell_matching = ocr # only do cell matching when OCR enabled
186
+ pdf_backend = PyPdfiumDocumentBackend
187
+ case Backend.DOCLING:
188
+ do_cell_matching = True
189
+ pdf_backend = DoclingParseDocumentBackend
190
+ case _:
191
+ raise RuntimeError(f"Unexpected backend type {backend}")
192
+
193
+ pipeline_options = PipelineOptions(
194
+ do_ocr=ocr,
195
+ do_table_structure=True,
196
+ )
197
+ pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
198
+ doc_converter = DocumentConverter(
199
+ pipeline_options=pipeline_options,
200
+ pdf_backend=pdf_backend,
201
+ )
234
202
 
235
203
  # Define input files
236
204
  input = DocumentConversionInput.from_paths(input_doc_paths)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.13.1" # DO NOT EDIT, updated automatically
3
+ version = "1.14.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes