docling 1.13.1__py3-none-any.whl → 1.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
  from typing import Annotated, Iterable, List, Optional
9
9
 
10
10
  import typer
11
- from pydantic import AnyUrl
11
+ from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
14
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -109,11 +109,11 @@ def export_documents(
109
109
  @app.command(no_args_is_help=True)
110
110
  def convert(
111
111
  input_sources: Annotated[
112
- List[Path],
112
+ List[str],
113
113
  typer.Argument(
114
114
  ...,
115
115
  metavar="source",
116
- help="PDF files to convert. Directories are also accepted.",
116
+ help="PDF files to convert. Can be local file / directory paths or URL.",
117
117
  ),
118
118
  ],
119
119
  export_json: Annotated[
@@ -167,7 +167,8 @@ def convert(
167
167
  logging.basicConfig(level=logging.INFO)
168
168
 
169
169
  input_doc_paths: List[Path] = []
170
- for source in input_sources:
170
+ for src in input_sources:
171
+ source = resolve_file_source(source=src)
171
172
  if not source.exists():
172
173
  err_console.print(
173
174
  f"[red]Error: The input file {source} does not exist.[/red]"
@@ -179,58 +180,25 @@ def convert(
179
180
  else:
180
181
  input_doc_paths.append(source)
181
182
 
182
- ###########################################################################
183
-
184
- # The following sections contain a combination of PipelineOptions
185
- # and PDF Backends for various configurations.
186
- # Uncomment one section at the time to see the differences in the output.
187
-
188
- doc_converter = None
189
- if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
190
- pipeline_options = PipelineOptions()
191
- pipeline_options.do_ocr = False
192
- pipeline_options.do_table_structure = True
193
- pipeline_options.table_structure_options.do_cell_matching = False
194
-
195
- doc_converter = DocumentConverter(
196
- pipeline_options=pipeline_options,
197
- pdf_backend=PyPdfiumDocumentBackend,
198
- )
199
-
200
- elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
201
- pipeline_options = PipelineOptions()
202
- pipeline_options.do_ocr = False
203
- pipeline_options.do_table_structure = True
204
- pipeline_options.table_structure_options.do_cell_matching = True
205
-
206
- doc_converter = DocumentConverter(
207
- pipeline_options=pipeline_options,
208
- pdf_backend=PyPdfiumDocumentBackend,
209
- )
210
-
211
- elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
212
- pipeline_options = PipelineOptions()
213
- pipeline_options.do_ocr = False
214
- pipeline_options.do_table_structure = True
215
- pipeline_options.table_structure_options.do_cell_matching = True
216
-
217
- doc_converter = DocumentConverter(
218
- pipeline_options=pipeline_options,
219
- pdf_backend=DoclingParseDocumentBackend,
220
- )
221
-
222
- elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
223
- pipeline_options = PipelineOptions()
224
- pipeline_options.do_ocr = True
225
- pipeline_options.do_table_structure = True
226
- pipeline_options.table_structure_options.do_cell_matching = True
227
-
228
- doc_converter = DocumentConverter(
229
- pipeline_options=pipeline_options,
230
- pdf_backend=DoclingParseDocumentBackend,
231
- )
232
-
233
- ###########################################################################
183
+ match backend:
184
+ case Backend.PYPDFIUM2:
185
+ do_cell_matching = ocr # only do cell matching when OCR enabled
186
+ pdf_backend = PyPdfiumDocumentBackend
187
+ case Backend.DOCLING:
188
+ do_cell_matching = True
189
+ pdf_backend = DoclingParseDocumentBackend
190
+ case _:
191
+ raise RuntimeError(f"Unexpected backend type {backend}")
192
+
193
+ pipeline_options = PipelineOptions(
194
+ do_ocr=ocr,
195
+ do_table_structure=True,
196
+ )
197
+ pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
198
+ doc_converter = DocumentConverter(
199
+ pipeline_options=pipeline_options,
200
+ pdf_backend=pdf_backend,
201
+ )
234
202
 
235
203
  # Define input files
236
204
  input = DocumentConversionInput.from_paths(input_doc_paths)
@@ -324,8 +324,10 @@ class ConvertedDocument(BaseModel):
324
324
  "paragraph",
325
325
  "caption",
326
326
  "table",
327
+ "figure",
327
328
  ],
328
329
  strict_text: bool = False,
330
+ image_placeholder: str = "<!-- image -->",
329
331
  ):
330
332
  return self.output.export_to_markdown(
331
333
  delim=delim,
@@ -333,6 +335,7 @@ class ConvertedDocument(BaseModel):
333
335
  main_text_stop=main_text_stop,
334
336
  main_text_labels=main_text_labels,
335
337
  strict_text=strict_text,
338
+ image_placeholder=image_placeholder,
336
339
  )
337
340
 
338
341
  def render_as_text(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.13.1
3
+ Version: 1.15.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -22,7 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: examples
23
23
  Requires-Dist: certifi (>=2024.7.4)
24
24
  Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
25
- Requires-Dist: docling-core (>=1.5.0,<2.0.0)
25
+ Requires-Dist: docling-core (>=1.6.2,<2.0.0)
26
26
  Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
27
27
  Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
28
28
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -74,8 +74,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
74
74
  * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
75
75
  * 📑 Understands detailed page layout, reading order and recovers table structures
76
76
  * 📝 Extracts metadata from the document, such as title, authors, references and language
77
- * 🔍 Optionally applies OCR (use with scanned PDFs)
77
+ * 🔍 Includes OCR support for scanned PDFs
78
78
  * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
79
+ * 💻 Provides a simple and convenient CLI
79
80
 
80
81
  ## Installation
81
82
 
@@ -87,31 +88,33 @@ pip install docling
87
88
  > [!NOTE]
88
89
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
89
90
 
91
+ <details>
92
+ <summary><b>Alternative PyTorch distributions</b></summary>
90
93
 
91
- ### Use alternative PyTorch distributions
94
+ The Docling models depend on the [PyTorch](https://pytorch.org/) library.
95
+ Depending on your architecture, you might want to use a different distribution of `torch`.
96
+ For example, you might want support for a different accelerator or a CPU-only version.
97
+ All the different ways of installing `torch` are listed on the PyTorch website <https://pytorch.org/>.
92
98
 
93
- The Docling models depend on the [PyTorch](https://pytorch.org/) library.
94
- Depending on your architecture, you might want to use a different distribution of `torch`.
95
- For example, you might want support for different accelerator or for a cpu-only version.
96
- All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
99
+ One common situation is the installation on Linux systems with cpu-only support.
100
+ In this case, we suggest installing Docling with the following options:
97
101
 
98
- One common situation is the installation on Linux systems with cpu-only support.
99
- In this case, we suggest the installation of Docling with the following options
102
+ ```bash
103
+ # Example for installing on the Linux cpu-only version
104
+ pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
105
+ ```
106
+ </details>
100
107
 
101
- ```bash
102
- # Example for installing on the Linux cpu-only version
103
- pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
104
- ```
108
+ <details>
109
+ <summary><b>Docling development setup</b></summary>
105
110
 
111
+ To develop for Docling (features, bug fixes, etc.), install as follows from your local clone's root dir:
112
+ ```bash
113
+ poetry install --all-extras
114
+ ```
115
+ </details>
106
116
 
107
- ### Development setup
108
-
109
- To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
110
- ```bash
111
- poetry install --all-extras
112
- ```
113
-
114
- ## Usage
117
+ ## Getting started
115
118
 
116
119
  ### Convert a single document
117
120
 
@@ -122,7 +125,6 @@ from docling.document_converter import DocumentConverter
122
125
  source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
123
126
  converter = DocumentConverter()
124
127
  result = converter.convert_single(source)
125
-
126
128
  print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
127
129
  print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
128
130
  ```
@@ -138,6 +140,51 @@ python examples/batch_convert.py
138
140
  ```
139
141
  The output of the above command will be written to `./scratch`.
140
142
 
143
+ ### CLI
144
+
145
+ You can also use Docling directly from your command line to convert individual files (given as local paths or URLs) or whole directories.
146
+
147
+ A simple example would look like this:
148
+ ```console
149
+ docling https://arxiv.org/pdf/2206.01062
150
+ ```
151
+
152
+ To see all available options (export formats etc.) run `docling --help`.
153
+
154
+ <details>
155
+ <summary><b>CLI reference</b></summary>
156
+
157
+ Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
158
+
159
+ ```console
160
+ $ docling --help
161
+
162
+ Usage: docling [OPTIONS] source
163
+
164
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
165
+ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
166
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
167
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
168
+ │ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
169
+ │ --md --no-md If enabled the document is exported as Markdown. [default: md] │
170
+ │ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
171
+ │ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
172
+ │ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
173
+ │ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
174
+ │ --output PATH Output directory where results are saved. [default: .] │
175
+ │ --version Show version information. │
176
+ │ --help Show this message and exit. │
177
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
178
+ ```
179
+ </details>
180
+
181
+ ### RAG
182
+ Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
183
+ - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
184
+ - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
185
+
186
+ ## Advanced features
187
+
141
188
  ### Adjust pipeline features
142
189
 
143
190
  The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -196,11 +243,6 @@ results = doc_converter.convert(conv_input)
196
243
 
197
244
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. By default, Docling uses 4 CPU threads.
198
245
 
199
- ### RAG
200
- Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
201
- - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
202
- - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
203
-
204
246
  ## Technical report
205
247
 
206
248
  For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -4,10 +4,10 @@ docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdf
4
4
  docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
5
5
  docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
6
6
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/cli/main.py,sha256=VUzm4vOijPo2F2Ht20zTnMI5alJLixfC5WK2NJCbyng,8492
7
+ docling/cli/main.py,sha256=gJBxgZIGza0UBUAPP8pVFp_Ma3rzB9CCw-w3Bs5wieE,7121
8
8
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
10
- docling/datamodel/document.py,sha256=7HnPXTin5r_XvIxbqPe7uV6keIr90RhXGGo22uHbTeA,16064
10
+ docling/datamodel/document.py,sha256=hzWObTCtPPU7tvMr5FRKAT-7JGK4lGoOJuAHyULYuxc,16186
11
11
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
12
12
  docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
13
13
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,8 +24,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  docling/utils/export.py,sha256=bKLdbeUcR-rQsGPV1IqJkCHKMCv7X2QOHyxmjNuH3HE,4655
25
25
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
26
26
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
27
- docling-1.13.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
28
- docling-1.13.1.dist-info/METADATA,sha256=YbOdVls3nn2uE7XZPZeeE_irTAYcOqshA9eqmdom8pM,9629
29
- docling-1.13.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
30
- docling-1.13.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
31
- docling-1.13.1.dist-info/RECORD,,
27
+ docling-1.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
28
+ docling-1.15.0.dist-info/METADATA,sha256=rPvnvD2kQvVibj_Iwf3U6LGjxDaX1Bm8p9dXBuNWPcY,13208
29
+ docling-1.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
30
+ docling-1.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
31
+ docling-1.15.0.dist-info/RECORD,,