docling 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -201,13 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
201
201
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
202
  def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
203
203
  super().__init__(path_or_stream)
204
-
205
- if isinstance(path_or_stream, Path):
206
- self._pdoc = pdfium.PdfDocument(path_or_stream)
207
- elif isinstance(path_or_stream, BytesIO):
208
- self._pdoc = pdfium.PdfDocument(
209
- path_or_stream
210
- ) # TODO Fix me, won't accept bytes.
204
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
211
205
 
212
206
  def page_count(self) -> int:
213
207
  return len(self._pdoc)
@@ -1,11 +1,15 @@
1
1
  import functools
2
2
  import logging
3
+ import tempfile
3
4
  import time
4
5
  import traceback
5
6
  from pathlib import Path
6
7
  from typing import Iterable, Optional, Type, Union
7
8
 
9
+ import requests
10
+ from docling_core.types import Document
8
11
  from PIL import ImageDraw
12
+ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
9
13
 
10
14
  from docling.backend.abstract_backend import PdfDocumentBackend
11
15
  from docling.datamodel.base_models import (
@@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
32
36
  class DocumentConverter:
33
37
  _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
34
38
  _table_model_path = "model_artifacts/tableformer"
39
+ _default_download_filename = "file.pdf"
35
40
 
36
41
  def __init__(
37
42
  self,
@@ -80,6 +85,57 @@ class DocumentConverter:
80
85
  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
81
86
  yield from map(self.process_document, input_batch)
82
87
 
88
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
89
+ """Convert a single document.
90
+
91
+ Args:
92
+ source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
93
+
94
+ Raises:
95
+ ValueError: If source is of unexpected type.
96
+ RuntimeError: If conversion fails.
97
+
98
+ Returns:
99
+ Document: The converted document object.
100
+ """
101
+ with tempfile.TemporaryDirectory() as temp_dir:
102
+ try:
103
+ http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
104
+ res = requests.get(http_url, stream=True)
105
+ res.raise_for_status()
106
+ fname = None
107
+ # try to get filename from response header
108
+ if cont_disp := res.headers.get("Content-Disposition"):
109
+ for par in cont_disp.strip().split(";"):
110
+ # currently only handling directive "filename" (not "*filename")
111
+ if (split := par.split("=")) and split[0].strip() == "filename":
112
+ fname = "=".join(split[1:]).strip().strip("'\"") or None
113
+ break
114
+ # otherwise, use name from URL:
115
+ if fname is None:
116
+ fname = Path(http_url.path).name or self._default_download_filename
117
+ local_path = Path(temp_dir) / fname
118
+ with open(local_path, "wb") as f:
119
+ for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
120
+ f.write(chunk)
121
+ except ValidationError:
122
+ try:
123
+ local_path = TypeAdapter(Path).validate_python(source)
124
+ except ValidationError:
125
+ raise ValueError(
126
+ f"Unexpected file path type encountered: {type(source)}"
127
+ )
128
+ conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
129
+ converted_docs_iter = self.convert(conv_inp)
130
+ converted_doc: ConvertedDocument = next(converted_docs_iter)
131
+ if converted_doc.status not in {
132
+ ConversionStatus.SUCCESS,
133
+ ConversionStatus.SUCCESS_WITH_ERRORS,
134
+ }:
135
+ raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
136
+ doc = converted_doc.to_ds_document()
137
+ return doc
138
+
83
139
  def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
84
140
  start_doc_time = time.time()
85
141
  converted_doc = ConvertedDocument(input=in_doc)
@@ -114,12 +114,15 @@ class TableStructureModel:
114
114
  for element in table_out["tf_responses"]:
115
115
 
116
116
  if not self.do_cell_matching:
117
- the_bbox = BoundingBox.model_validate(element["bbox"])
117
+ the_bbox = BoundingBox.model_validate(
118
+ element["bbox"]
119
+ ).scaled(1 / self.scale)
118
120
  text_piece = page._backend.get_text_in_rect(the_bbox)
119
121
  element["bbox"]["token"] = text_piece
120
122
 
121
123
  tc = TableCell.model_validate(element)
122
- tc.bbox = tc.bbox.scaled(1 / self.scale)
124
+ if self.do_cell_matching:
125
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
123
126
  table_cells.append(tc)
124
127
 
125
128
  # Retrieving cols/rows, after post processing:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.0.2
3
+ Version: 1.1.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -30,6 +30,7 @@ Requires-Dist: huggingface_hub (>=0.23,<1)
30
30
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
31
31
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
32
32
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
33
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
33
34
  Project-URL: Repository, https://github.com/DS4SD/docling
34
35
  Description-Content-Type: text/markdown
35
36
 
@@ -65,19 +66,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
65
66
  pip install docling
66
67
  ```
67
68
 
68
- > [!NOTE]
69
+ > [!NOTE]
69
70
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
70
71
 
71
72
  ### Development setup
72
73
 
73
74
  To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
74
75
  ```bash
75
- poetry install
76
+ poetry install --all-extras
76
77
  ```
77
78
 
78
79
  ## Usage
79
80
 
80
- For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
81
+ ### Convert a single document
82
+
83
+ To convert individual PDF documents, use `convert_single()`, for example:
84
+ ```python
85
+ from docling.document_converter import DocumentConverter
86
+
87
+ source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
88
+ converter = DocumentConverter()
89
+ doc = converter.convert_single(source)
90
+ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
91
+ ```
92
+
93
+ ### Convert a batch of documents
94
+
95
+ For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
96
+
97
+ From a local repo clone, you can run it with:
81
98
 
82
99
  ```
83
100
  python examples/convert.py
@@ -93,7 +110,7 @@ You can control if table structure recognition or OCR should be performed by arg
93
110
  doc_converter = DocumentConverter(
94
111
  artifacts_path=artifacts_path,
95
112
  pipeline_options=PipelineOptions(
96
- do_table_structure=False, # controls if table structure is recovered
113
+ do_table_structure=False, # controls if table structure is recovered
97
114
  do_ocr=True, # controls if OCR is applied (ignores programmatic content)
98
115
  ),
99
116
  )
@@ -125,7 +142,7 @@ conv_input = DocumentConversionInput.from_paths(
125
142
  )
126
143
  ```
127
144
 
128
- ### Convert from binary PDF streams
145
+ ### Convert from binary PDF streams
129
146
 
130
147
  You can convert PDFs from a binary stream instead of from the filesystem as follows:
131
148
  ```python
@@ -1,25 +1,25 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
4
- docling/backend/pypdfium2_backend.py,sha256=sJMoActFyc3qdKB6RFly3auHXuXM4noQAG0ypUlj26o,7647
4
+ docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
5
5
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
7
7
  docling/datamodel/document.py,sha256=7caefzaii6itMQgtXfA4SJhB1TAF32v1c8zRwbiU03s,12497
8
8
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
9
- docling/document_converter.py,sha256=MZw23oPlRmRi1ggzoD1PukUnqo-6boO3RZB06dZ5Xt0,7305
9
+ docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
10
10
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
12
12
  docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
13
13
  docling/models/layout_model.py,sha256=4AfPFiu6pXc8wIQ1sQlEZnHRt7SnBmfzDdctiRveOWw,10944
14
14
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
15
- docling/models/table_structure_model.py,sha256=ryZrmkNkCbw5SCpgdQabkmcRAEi_4VqOMv2VGdpvGZo,5499
15
+ docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
16
16
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
18
18
  docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
19
19
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
21
21
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
22
- docling-1.0.2.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
- docling-1.0.2.dist-info/METADATA,sha256=U_gVZvm1gsFXII9a6INpjbH64zH00tiHbch3EaJFvgc,6188
24
- docling-1.0.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
- docling-1.0.2.dist-info/RECORD,,
22
+ docling-1.1.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
+ docling-1.1.1.dist-info/METADATA,sha256=hnIPHm49bjWcFKBSCJ-aPsqim6aqHkWZiMdhkQli9Lk,6759
24
+ docling-1.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
+ docling-1.1.1.dist-info/RECORD,,