docling 1.8.0__tar.gz → 1.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {docling-1.8.0 → docling-1.8.2}/PKG-INFO +5 -5
  2. {docling-1.8.0 → docling-1.8.2}/README.md +4 -4
  3. {docling-1.8.0 → docling-1.8.2}/docling/datamodel/base_models.py +3 -3
  4. {docling-1.8.0 → docling-1.8.2}/docling/datamodel/document.py +20 -11
  5. {docling-1.8.0 → docling-1.8.2}/docling/document_converter.py +34 -38
  6. {docling-1.8.0 → docling-1.8.2}/docling/models/ds_glm_model.py +5 -5
  7. {docling-1.8.0 → docling-1.8.2}/pyproject.toml +1 -1
  8. {docling-1.8.0 → docling-1.8.2}/LICENSE +0 -0
  9. {docling-1.8.0 → docling-1.8.2}/docling/__init__.py +0 -0
  10. {docling-1.8.0 → docling-1.8.2}/docling/backend/__init__.py +0 -0
  11. {docling-1.8.0 → docling-1.8.2}/docling/backend/abstract_backend.py +0 -0
  12. {docling-1.8.0 → docling-1.8.2}/docling/backend/docling_parse_backend.py +0 -0
  13. {docling-1.8.0 → docling-1.8.2}/docling/backend/pypdfium2_backend.py +0 -0
  14. {docling-1.8.0 → docling-1.8.2}/docling/datamodel/__init__.py +0 -0
  15. {docling-1.8.0 → docling-1.8.2}/docling/datamodel/settings.py +0 -0
  16. {docling-1.8.0 → docling-1.8.2}/docling/models/__init__.py +0 -0
  17. {docling-1.8.0 → docling-1.8.2}/docling/models/base_ocr_model.py +0 -0
  18. {docling-1.8.0 → docling-1.8.2}/docling/models/easyocr_model.py +0 -0
  19. {docling-1.8.0 → docling-1.8.2}/docling/models/layout_model.py +0 -0
  20. {docling-1.8.0 → docling-1.8.2}/docling/models/page_assemble_model.py +0 -0
  21. {docling-1.8.0 → docling-1.8.2}/docling/models/table_structure_model.py +0 -0
  22. {docling-1.8.0 → docling-1.8.2}/docling/pipeline/__init__.py +0 -0
  23. {docling-1.8.0 → docling-1.8.2}/docling/pipeline/base_model_pipeline.py +0 -0
  24. {docling-1.8.0 → docling-1.8.2}/docling/pipeline/standard_model_pipeline.py +0 -0
  25. {docling-1.8.0 → docling-1.8.2}/docling/utils/__init__.py +0 -0
  26. {docling-1.8.0 → docling-1.8.2}/docling/utils/layout_utils.py +0 -0
  27. {docling-1.8.0 → docling-1.8.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.8.0
3
+ Version: 1.8.2
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -87,10 +87,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
87
87
  ```python
88
88
  from docling.document_converter import DocumentConverter
89
89
 
90
- source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
90
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
91
91
  converter = DocumentConverter()
92
- doc = converter.convert_single(source)
93
- print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
92
+ result = converter.convert_single(source)
93
+ print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
94
94
  ```
95
95
 
96
96
  ### Convert a batch of documents
@@ -156,7 +156,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
156
156
  buf = BytesIO(your_binary_stream)
157
157
  docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
158
158
  conv_input = DocumentConversionInput.from_streams(docs)
159
- converted_docs = doc_converter.convert(conv_input)
159
+ results = doc_converter.convert(conv_input)
160
160
  ```
161
161
  ### Limit resource usage
162
162
 
@@ -49,10 +49,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
49
49
  ```python
50
50
  from docling.document_converter import DocumentConverter
51
51
 
52
- source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
52
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
53
53
  converter = DocumentConverter()
54
- doc = converter.convert_single(source)
55
- print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
54
+ result = converter.convert_single(source)
55
+ print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
56
56
  ```
57
57
 
58
58
  ### Convert a batch of documents
@@ -118,7 +118,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
118
118
  buf = BytesIO(your_binary_stream)
119
119
  docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
120
120
  conv_input = DocumentConversionInput.from_streams(docs)
121
- converted_docs = doc_converter.convert(conv_input)
121
+ results = doc_converter.convert(conv_input)
122
122
  ```
123
123
  ### Limit resource usage
124
124
 
@@ -247,9 +247,9 @@ PageElement = Union[TextElement, TableElement, FigureElement]
247
247
 
248
248
 
249
249
  class AssembledUnit(BaseModel):
250
- elements: List[PageElement]
251
- body: List[PageElement]
252
- headers: List[PageElement]
250
+ elements: List[PageElement] = []
251
+ body: List[PageElement] = []
252
+ headers: List[PageElement] = []
253
253
 
254
254
 
255
255
  class Page(BaseModel):
@@ -12,6 +12,7 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
12
12
  from docling_core.types import Table as DsSchemaTable
13
13
  from docling_core.types import TableCell
14
14
  from pydantic import BaseModel
15
+ from typing_extensions import deprecated
15
16
 
16
17
  from docling.backend.abstract_backend import PdfDocumentBackend
17
18
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -49,6 +50,15 @@ layout_label_to_ds_type = {
49
50
  "Text": "paragraph",
50
51
  }
51
52
 
53
+ _EMPTY_DOC = DsDocument(
54
+ _name="",
55
+ description=DsDocumentDescription(logs=[]),
56
+ file_info=DsFileInfoObject(
57
+ filename="",
58
+ document_hash="",
59
+ ),
60
+ )
61
+
52
62
 
53
63
  class InputDocument(BaseModel):
54
64
  file: PurePath = None
@@ -115,6 +125,7 @@ class InputDocument(BaseModel):
115
125
  # raise
116
126
 
117
127
 
128
+ @deprecated("Use `ConversionResult` instead.")
118
129
  class ConvertedDocument(BaseModel):
119
130
  input: InputDocument
120
131
 
@@ -122,11 +133,11 @@ class ConvertedDocument(BaseModel):
122
133
  errors: List[ErrorItem] = [] # structure to keep errors
123
134
 
124
135
  pages: List[Page] = []
125
- assembled: Optional[AssembledUnit] = None
136
+ assembled: AssembledUnit = AssembledUnit()
126
137
 
127
- output: Optional[DsDocument] = None
138
+ output: DsDocument = _EMPTY_DOC
128
139
 
129
- def to_ds_document(self) -> DsDocument:
140
+ def _to_ds_document(self) -> DsDocument:
130
141
  title = ""
131
142
  desc = DsDocumentDescription(logs=[])
132
143
 
@@ -297,16 +308,10 @@ class ConvertedDocument(BaseModel):
297
308
  return ds_doc
298
309
 
299
310
  def render_as_dict(self):
300
- if self.output:
301
- return self.output.model_dump(by_alias=True, exclude_none=True)
302
- else:
303
- return {}
311
+ return self.output.model_dump(by_alias=True, exclude_none=True)
304
312
 
305
313
  def render_as_markdown(self):
306
- if self.output:
307
- return self.output.export_to_markdown()
308
- else:
309
- return ""
314
+ return self.output.export_to_markdown()
310
315
 
311
316
  def render_element_images(
312
317
  self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -323,6 +328,10 @@ class ConvertedDocument(BaseModel):
323
328
  yield element, cropped_im
324
329
 
325
330
 
331
+ class ConversionResult(ConvertedDocument):
332
+ pass
333
+
334
+
326
335
  class DocumentConversionInput(BaseModel):
327
336
 
328
337
  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
@@ -7,7 +7,6 @@ from pathlib import Path
7
7
  from typing import Iterable, Optional, Type, Union
8
8
 
9
9
  import requests
10
- from docling_core.types import Document
11
10
  from PIL import ImageDraw
12
11
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
13
12
 
@@ -22,7 +21,7 @@ from docling.datamodel.base_models import (
22
21
  PipelineOptions,
23
22
  )
24
23
  from docling.datamodel.document import (
25
- ConvertedDocument,
24
+ ConversionResult,
26
25
  DocumentConversionInput,
27
26
  InputDocument,
28
27
  )
@@ -73,7 +72,7 @@ class DocumentConverter:
73
72
 
74
73
  return Path(download_path)
75
74
 
76
- def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
75
+ def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
77
76
 
78
77
  for input_batch in chunkify(
79
78
  input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
@@ -86,9 +85,9 @@ class DocumentConverter:
86
85
  # yield from pool.map(self.process_document, input_batch)
87
86
 
88
87
  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
89
- yield from map(self.process_document, input_batch)
88
+ yield from map(self._process_document, input_batch)
90
89
 
91
- def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
90
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
92
91
  """Convert a single document.
93
92
 
94
93
  Args:
@@ -99,7 +98,7 @@ class DocumentConverter:
99
98
  RuntimeError: If conversion fails.
100
99
 
101
100
  Returns:
102
- Document: The converted document object.
101
+ ConversionResult: The conversion result object.
103
102
  """
104
103
  with tempfile.TemporaryDirectory() as temp_dir:
105
104
  try:
@@ -129,52 +128,49 @@ class DocumentConverter:
129
128
  f"Unexpected file path type encountered: {type(source)}"
130
129
  )
131
130
  conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
132
- converted_docs_iter = self.convert(conv_inp)
133
- converted_doc: ConvertedDocument = next(converted_docs_iter)
134
- if converted_doc.status not in {
131
+ conv_res_iter = self.convert(conv_inp)
132
+ conv_res: ConversionResult = next(conv_res_iter)
133
+ if conv_res.status not in {
135
134
  ConversionStatus.SUCCESS,
136
- ConversionStatus.SUCCESS_WITH_ERRORS,
135
+ ConversionStatus.PARTIAL_SUCCESS,
137
136
  }:
138
- raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
139
- doc = converted_doc.to_ds_document()
140
- return doc
137
+ raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
138
+ return conv_res
141
139
 
142
- def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
140
+ def _process_document(self, in_doc: InputDocument) -> ConversionResult:
143
141
  start_doc_time = time.time()
144
- converted_doc = ConvertedDocument(input=in_doc)
142
+ conv_res = ConversionResult(input=in_doc)
145
143
 
146
144
  _log.info(f"Processing document {in_doc.file.name}")
147
145
 
148
146
  if not in_doc.valid:
149
- converted_doc.status = ConversionStatus.FAILURE
150
- return converted_doc
147
+ conv_res.status = ConversionStatus.FAILURE
148
+ return conv_res
151
149
 
152
150
  for i in range(0, in_doc.page_count):
153
- converted_doc.pages.append(Page(page_no=i))
151
+ conv_res.pages.append(Page(page_no=i))
154
152
 
155
153
  all_assembled_pages = []
156
154
 
157
155
  try:
158
156
  # Iterate batches of pages (page_batch_size) in the doc
159
- for page_batch in chunkify(
160
- converted_doc.pages, settings.perf.page_batch_size
161
- ):
157
+ for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
162
158
  start_pb_time = time.time()
163
159
  # Pipeline
164
160
 
165
161
  # 1. Initialise the page resources
166
162
  init_pages = map(
167
- functools.partial(self.initialize_page, in_doc), page_batch
163
+ functools.partial(self._initialize_page, in_doc), page_batch
168
164
  )
169
165
 
170
166
  # 2. Populate page image
171
167
  pages_with_images = map(
172
- functools.partial(self.populate_page_images, in_doc), init_pages
168
+ functools.partial(self._populate_page_images, in_doc), init_pages
173
169
  )
174
170
 
175
171
  # 3. Populate programmatic page cells
176
172
  pages_with_cells = map(
177
- functools.partial(self.parse_page_cells, in_doc),
173
+ functools.partial(self._parse_page_cells, in_doc),
178
174
  pages_with_images,
179
175
  )
180
176
 
@@ -203,13 +199,13 @@ class DocumentConverter:
203
199
  # Free up mem resources of PDF backend
204
200
  in_doc._backend.unload()
205
201
 
206
- converted_doc.pages = all_assembled_pages
207
- self.assemble_doc(converted_doc)
202
+ conv_res.pages = all_assembled_pages
203
+ self._assemble_doc(conv_res)
208
204
 
209
205
  status = ConversionStatus.SUCCESS
210
- for page in converted_doc.pages:
206
+ for page in conv_res.pages:
211
207
  if not page._backend.is_valid():
212
- converted_doc.errors.append(
208
+ conv_res.errors.append(
213
209
  ErrorItem(
214
210
  component_type=DoclingComponentType.PDF_BACKEND,
215
211
  module_name=type(page._backend).__name__,
@@ -218,10 +214,10 @@ class DocumentConverter:
218
214
  )
219
215
  status = ConversionStatus.PARTIAL_SUCCESS
220
216
 
221
- converted_doc.status = status
217
+ conv_res.status = status
222
218
 
223
219
  except Exception as e:
224
- converted_doc.status = ConversionStatus.FAILURE
220
+ conv_res.status = ConversionStatus.FAILURE
225
221
  trace = "\n".join(traceback.format_exception(e))
226
222
  _log.info(
227
223
  f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
@@ -233,10 +229,10 @@ class DocumentConverter:
233
229
  f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
234
230
  )
235
231
 
236
- return converted_doc
232
+ return conv_res
237
233
 
238
234
  # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
239
- def initialize_page(self, doc: InputDocument, page: Page) -> Page:
235
+ def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
240
236
  page._backend = doc._backend.load_page(page.page_no)
241
237
  page.size = page._backend.get_size()
242
238
  page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
@@ -244,7 +240,7 @@ class DocumentConverter:
244
240
  return page
245
241
 
246
242
  # Generate the page image and store it in the page object
247
- def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
243
+ def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
248
244
  # default scale
249
245
  page.get_image(
250
246
  scale=1.0
@@ -260,7 +256,7 @@ class DocumentConverter:
260
256
  return page
261
257
 
262
258
  # Extract and populate the page cells and store it in the page object
263
- def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
259
+ def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
264
260
  page.cells = page._backend.get_text_cells()
265
261
 
266
262
  # DEBUG code:
@@ -275,12 +271,12 @@ class DocumentConverter:
275
271
 
276
272
  return page
277
273
 
278
- def assemble_doc(self, converted_doc: ConvertedDocument):
274
+ def _assemble_doc(self, conv_res: ConversionResult):
279
275
  all_elements = []
280
276
  all_headers = []
281
277
  all_body = []
282
278
 
283
- for p in converted_doc.pages:
279
+ for p in conv_res.pages:
284
280
 
285
281
  for el in p.assembled.body:
286
282
  all_body.append(el)
@@ -289,8 +285,8 @@ class DocumentConverter:
289
285
  for el in p.assembled.elements:
290
286
  all_elements.append(el)
291
287
 
292
- converted_doc.assembled = AssembledUnit(
288
+ conv_res.assembled = AssembledUnit(
293
289
  elements=all_elements, headers=all_headers, body=all_body
294
290
  )
295
291
 
296
- converted_doc.output = self.glm_model(converted_doc)
292
+ conv_res.output = self.glm_model(conv_res)
@@ -10,7 +10,7 @@ from docling_core.types import Ref
10
10
  from PIL import ImageDraw
11
11
 
12
12
  from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
13
- from docling.datamodel.document import ConvertedDocument
13
+ from docling.datamodel.document import ConversionResult
14
14
 
15
15
 
16
16
  class GlmModel:
@@ -20,8 +20,8 @@ class GlmModel:
20
20
  model = init_nlp_model(model_names="language;term;reference")
21
21
  self.model = model
22
22
 
23
- def __call__(self, document: ConvertedDocument) -> DsDocument:
24
- ds_doc = document.to_ds_document()
23
+ def __call__(self, conv_res: ConversionResult) -> DsDocument:
24
+ ds_doc = conv_res._to_ds_document()
25
25
  ds_doc_dict = ds_doc.model_dump(by_alias=True)
26
26
 
27
27
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
@@ -34,7 +34,7 @@ class GlmModel:
34
34
  # DEBUG code:
35
35
  def draw_clusters_and_cells(ds_document, page_no):
36
36
  clusters_to_draw = []
37
- image = copy.deepcopy(document.pages[page_no].image)
37
+ image = copy.deepcopy(conv_res.pages[page_no].image)
38
38
  for ix, elem in enumerate(ds_document.main_text):
39
39
  if isinstance(elem, BaseText):
40
40
  prov = elem.prov[0]
@@ -56,7 +56,7 @@ class GlmModel:
56
56
  bbox=BoundingBox.from_tuple(
57
57
  coord=prov.bbox,
58
58
  origin=CoordOrigin.BOTTOMLEFT,
59
- ).to_top_left_origin(document.pages[page_no].size.height),
59
+ ).to_top_left_origin(conv_res.pages[page_no].size.height),
60
60
  )
61
61
  )
62
62
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.8.0" # DO NOT EDIT, updated automatically
3
+ version = "1.8.2" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes