docling 1.0.1__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {docling-1.0.1 → docling-1.1.0}/PKG-INFO +24 -6
  2. {docling-1.0.1 → docling-1.1.0}/README.md +21 -5
  3. {docling-1.0.1 → docling-1.1.0}/docling/backend/pypdfium2_backend.py +1 -7
  4. {docling-1.0.1 → docling-1.1.0}/docling/document_converter.py +56 -0
  5. {docling-1.0.1 → docling-1.1.0}/pyproject.toml +3 -4
  6. {docling-1.0.1 → docling-1.1.0}/LICENSE +0 -0
  7. {docling-1.0.1 → docling-1.1.0}/docling/__init__.py +0 -0
  8. {docling-1.0.1 → docling-1.1.0}/docling/backend/__init__.py +0 -0
  9. {docling-1.0.1 → docling-1.1.0}/docling/backend/abstract_backend.py +0 -0
  10. {docling-1.0.1 → docling-1.1.0}/docling/datamodel/__init__.py +0 -0
  11. {docling-1.0.1 → docling-1.1.0}/docling/datamodel/base_models.py +0 -0
  12. {docling-1.0.1 → docling-1.1.0}/docling/datamodel/document.py +0 -0
  13. {docling-1.0.1 → docling-1.1.0}/docling/datamodel/settings.py +0 -0
  14. {docling-1.0.1 → docling-1.1.0}/docling/models/__init__.py +0 -0
  15. {docling-1.0.1 → docling-1.1.0}/docling/models/ds_glm_model.py +0 -0
  16. {docling-1.0.1 → docling-1.1.0}/docling/models/easyocr_model.py +0 -0
  17. {docling-1.0.1 → docling-1.1.0}/docling/models/layout_model.py +0 -0
  18. {docling-1.0.1 → docling-1.1.0}/docling/models/page_assemble_model.py +0 -0
  19. {docling-1.0.1 → docling-1.1.0}/docling/models/table_structure_model.py +0 -0
  20. {docling-1.0.1 → docling-1.1.0}/docling/pipeline/__init__.py +0 -0
  21. {docling-1.0.1 → docling-1.1.0}/docling/pipeline/base_model_pipeline.py +0 -0
  22. {docling-1.0.1 → docling-1.1.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  23. {docling-1.0.1 → docling-1.1.0}/docling/utils/__init__.py +0 -0
  24. {docling-1.0.1 → docling-1.1.0}/docling/utils/layout_utils.py +0 -0
  25. {docling-1.0.1 → docling-1.1.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.0.1
3
+ Version: 1.1.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,11 +24,13 @@ Provides-Extra: ocr
24
24
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
25
  Requires-Dist: docling-core (>=1.1.0,<2.0.0)
26
26
  Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
27
+ Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
27
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
28
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
29
30
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
30
31
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
31
32
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
33
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
32
34
  Project-URL: Repository, https://github.com/DS4SD/docling
33
35
  Description-Content-Type: text/markdown
34
36
 
@@ -64,19 +66,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
64
66
  pip install docling
65
67
  ```
66
68
 
67
- > [!NOTE]
69
+ > [!NOTE]
68
70
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
69
71
 
70
72
  ### Development setup
71
73
 
72
74
  To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
73
75
  ```bash
74
- poetry install
76
+ poetry install --all-extras
75
77
  ```
76
78
 
77
79
  ## Usage
78
80
 
79
- For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
81
+ ### Convert a single document
82
+
83
+ To convert individual PDF documents, use `convert_single()`, for example:
84
+ ```python
85
+ from docling.document_converter import DocumentConverter
86
+
87
+ source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
88
+ converter = DocumentConverter()
89
+ doc = converter.convert_single(source)
90
+ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
91
+ ```
92
+
93
+ ### Convert a batch of documents
94
+
95
+ For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
96
+
97
+ From a local repo clone, you can run it with:
80
98
 
81
99
  ```
82
100
  python examples/convert.py
@@ -92,7 +110,7 @@ You can control if table structure recognition or OCR should be performed by arg
92
110
  doc_converter = DocumentConverter(
93
111
  artifacts_path=artifacts_path,
94
112
  pipeline_options=PipelineOptions(
95
- do_table_structure=False, # controls if table structure is recovered
113
+ do_table_structure=False, # controls if table structure is recovered
96
114
  do_ocr=True, # controls if OCR is applied (ignores programmatic content)
97
115
  ),
98
116
  )
@@ -124,7 +142,7 @@ conv_input = DocumentConversionInput.from_paths(
124
142
  )
125
143
  ```
126
144
 
127
- ### Convert from binary PDF streams
145
+ ### Convert from binary PDF streams
128
146
 
129
147
  You can convert PDFs from a binary stream instead of from the filesystem as follows:
130
148
  ```python
@@ -30,19 +30,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
30
30
  pip install docling
31
31
  ```
32
32
 
33
- > [!NOTE]
33
+ > [!NOTE]
34
34
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
35
35
 
36
36
  ### Development setup
37
37
 
38
38
  To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
39
39
  ```bash
40
- poetry install
40
+ poetry install --all-extras
41
41
  ```
42
42
 
43
43
  ## Usage
44
44
 
45
- For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
45
+ ### Convert a single document
46
+
47
+ To convert individual PDF documents, use `convert_single()`, for example:
48
+ ```python
49
+ from docling.document_converter import DocumentConverter
50
+
51
+ source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
52
+ converter = DocumentConverter()
53
+ doc = converter.convert_single(source)
54
+ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
55
+ ```
56
+
57
+ ### Convert a batch of documents
58
+
59
+ For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
60
+
61
+ From a local repo clone, you can run it with:
46
62
 
47
63
  ```
48
64
  python examples/convert.py
@@ -58,7 +74,7 @@ You can control if table structure recognition or OCR should be performed by arg
58
74
  doc_converter = DocumentConverter(
59
75
  artifacts_path=artifacts_path,
60
76
  pipeline_options=PipelineOptions(
61
- do_table_structure=False, # controls if table structure is recovered
77
+ do_table_structure=False, # controls if table structure is recovered
62
78
  do_ocr=True, # controls if OCR is applied (ignores programmatic content)
63
79
  ),
64
80
  )
@@ -90,7 +106,7 @@ conv_input = DocumentConversionInput.from_paths(
90
106
  )
91
107
  ```
92
108
 
93
- ### Convert from binary PDF streams
109
+ ### Convert from binary PDF streams
94
110
 
95
111
  You can convert PDFs from a binary stream instead of from the filesystem as follows:
96
112
  ```python
@@ -201,13 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
201
201
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
202
  def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
203
203
  super().__init__(path_or_stream)
204
-
205
- if isinstance(path_or_stream, Path):
206
- self._pdoc = pdfium.PdfDocument(path_or_stream)
207
- elif isinstance(path_or_stream, BytesIO):
208
- self._pdoc = pdfium.PdfDocument(
209
- path_or_stream
210
- ) # TODO Fix me, won't accept bytes.
204
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
211
205
 
212
206
  def page_count(self) -> int:
213
207
  return len(self._pdoc)
@@ -1,11 +1,15 @@
1
1
  import functools
2
2
  import logging
3
+ import tempfile
3
4
  import time
4
5
  import traceback
5
6
  from pathlib import Path
6
7
  from typing import Iterable, Optional, Type, Union
7
8
 
9
+ import requests
10
+ from docling_core.types import Document
8
11
  from PIL import ImageDraw
12
+ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
9
13
 
10
14
  from docling.backend.abstract_backend import PdfDocumentBackend
11
15
  from docling.datamodel.base_models import (
@@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
32
36
  class DocumentConverter:
33
37
  _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
34
38
  _table_model_path = "model_artifacts/tableformer"
39
+ _default_download_filename = "file.pdf"
35
40
 
36
41
  def __init__(
37
42
  self,
@@ -80,6 +85,57 @@ class DocumentConverter:
80
85
  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
81
86
  yield from map(self.process_document, input_batch)
82
87
 
88
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
89
+ """Convert a single document.
90
+
91
+ Args:
92
+ source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
93
+
94
+ Raises:
95
+ ValueError: If source is of unexpected type.
96
+ RuntimeError: If conversion fails.
97
+
98
+ Returns:
99
+ Document: The converted document object.
100
+ """
101
+ with tempfile.TemporaryDirectory() as temp_dir:
102
+ try:
103
+ http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
104
+ res = requests.get(http_url, stream=True)
105
+ res.raise_for_status()
106
+ fname = None
107
+ # try to get filename from response header
108
+ if cont_disp := res.headers.get("Content-Disposition"):
109
+ for par in cont_disp.strip().split(";"):
110
+ # currently only handling directive "filename" (not "*filename")
111
+ if (split := par.split("=")) and split[0].strip() == "filename":
112
+ fname = "=".join(split[1:]).strip().strip("'\"") or None
113
+ break
114
+ # otherwise, use name from URL:
115
+ if fname is None:
116
+ fname = Path(http_url.path).name or self._default_download_filename
117
+ local_path = Path(temp_dir) / fname
118
+ with open(local_path, "wb") as f:
119
+ for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
120
+ f.write(chunk)
121
+ except ValidationError:
122
+ try:
123
+ local_path = TypeAdapter(Path).validate_python(source)
124
+ except ValidationError:
125
+ raise ValueError(
126
+ f"Unexpected file path type encountered: {type(source)}"
127
+ )
128
+ conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
129
+ converted_docs_iter = self.convert(conv_inp)
130
+ converted_doc: ConvertedDocument = next(converted_docs_iter)
131
+ if converted_doc.status not in {
132
+ ConversionStatus.SUCCESS,
133
+ ConversionStatus.SUCCESS_WITH_ERRORS,
134
+ }:
135
+ raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
136
+ doc = converted_doc.to_ds_document()
137
+ return doc
138
+
83
139
  def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
84
140
  start_doc_time = time.time()
85
141
  converted_doc = ConvertedDocument(input=in_doc)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.0.1" # DO NOT EDIT, updated automatically
3
+ version = "1.1.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -30,9 +30,8 @@ filetype = "^1.2.0"
30
30
  pypdfium2 = "^4.30.0"
31
31
  pydantic-settings = "^2.3.0"
32
32
  huggingface_hub = ">=0.23,<1"
33
-
34
- [tool.poetry.group.ocr.dependencies]
35
- easyocr = "^1.7"
33
+ requests = "^2.32.3"
34
+ easyocr = { version = "^1.7", optional = true }
36
35
 
37
36
  [tool.poetry.group.dev.dependencies]
38
37
  black = {extras = ["jupyter"], version = "^24.4.2"}
File without changes
File without changes
File without changes