docling 1.8.1__tar.gz → 1.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {docling-1.8.1 → docling-1.8.2}/PKG-INFO +5 -5
  2. {docling-1.8.1 → docling-1.8.2}/README.md +4 -4
  3. {docling-1.8.1 → docling-1.8.2}/docling/datamodel/base_models.py +3 -3
  4. {docling-1.8.1 → docling-1.8.2}/docling/datamodel/document.py +20 -11
  5. {docling-1.8.1 → docling-1.8.2}/docling/document_converter.py +33 -36
  6. {docling-1.8.1 → docling-1.8.2}/docling/models/ds_glm_model.py +5 -5
  7. {docling-1.8.1 → docling-1.8.2}/pyproject.toml +1 -1
  8. {docling-1.8.1 → docling-1.8.2}/LICENSE +0 -0
  9. {docling-1.8.1 → docling-1.8.2}/docling/__init__.py +0 -0
  10. {docling-1.8.1 → docling-1.8.2}/docling/backend/__init__.py +0 -0
  11. {docling-1.8.1 → docling-1.8.2}/docling/backend/abstract_backend.py +0 -0
  12. {docling-1.8.1 → docling-1.8.2}/docling/backend/docling_parse_backend.py +0 -0
  13. {docling-1.8.1 → docling-1.8.2}/docling/backend/pypdfium2_backend.py +0 -0
  14. {docling-1.8.1 → docling-1.8.2}/docling/datamodel/__init__.py +0 -0
  15. {docling-1.8.1 → docling-1.8.2}/docling/datamodel/settings.py +0 -0
  16. {docling-1.8.1 → docling-1.8.2}/docling/models/__init__.py +0 -0
  17. {docling-1.8.1 → docling-1.8.2}/docling/models/base_ocr_model.py +0 -0
  18. {docling-1.8.1 → docling-1.8.2}/docling/models/easyocr_model.py +0 -0
  19. {docling-1.8.1 → docling-1.8.2}/docling/models/layout_model.py +0 -0
  20. {docling-1.8.1 → docling-1.8.2}/docling/models/page_assemble_model.py +0 -0
  21. {docling-1.8.1 → docling-1.8.2}/docling/models/table_structure_model.py +0 -0
  22. {docling-1.8.1 → docling-1.8.2}/docling/pipeline/__init__.py +0 -0
  23. {docling-1.8.1 → docling-1.8.2}/docling/pipeline/base_model_pipeline.py +0 -0
  24. {docling-1.8.1 → docling-1.8.2}/docling/pipeline/standard_model_pipeline.py +0 -0
  25. {docling-1.8.1 → docling-1.8.2}/docling/utils/__init__.py +0 -0
  26. {docling-1.8.1 → docling-1.8.2}/docling/utils/layout_utils.py +0 -0
  27. {docling-1.8.1 → docling-1.8.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.8.1
3
+ Version: 1.8.2
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -87,10 +87,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
87
87
  ```python
88
88
  from docling.document_converter import DocumentConverter
89
89
 
90
- source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
90
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
91
91
  converter = DocumentConverter()
92
- doc = converter.convert_single(source)
93
- print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
92
+ result = converter.convert_single(source)
93
+ print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
94
94
  ```
95
95
 
96
96
  ### Convert a batch of documents
@@ -156,7 +156,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
156
156
  buf = BytesIO(your_binary_stream)
157
157
  docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
158
158
  conv_input = DocumentConversionInput.from_streams(docs)
159
- converted_docs = doc_converter.convert(conv_input)
159
+ results = doc_converter.convert(conv_input)
160
160
  ```
161
161
  ### Limit resource usage
162
162
 
@@ -49,10 +49,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
49
49
  ```python
50
50
  from docling.document_converter import DocumentConverter
51
51
 
52
- source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
52
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
53
53
  converter = DocumentConverter()
54
- doc = converter.convert_single(source)
55
- print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
54
+ result = converter.convert_single(source)
55
+ print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
56
56
  ```
57
57
 
58
58
  ### Convert a batch of documents
@@ -118,7 +118,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
118
118
  buf = BytesIO(your_binary_stream)
119
119
  docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
120
120
  conv_input = DocumentConversionInput.from_streams(docs)
121
- converted_docs = doc_converter.convert(conv_input)
121
+ results = doc_converter.convert(conv_input)
122
122
  ```
123
123
  ### Limit resource usage
124
124
 
@@ -247,9 +247,9 @@ PageElement = Union[TextElement, TableElement, FigureElement]
247
247
 
248
248
 
249
249
  class AssembledUnit(BaseModel):
250
- elements: List[PageElement]
251
- body: List[PageElement]
252
- headers: List[PageElement]
250
+ elements: List[PageElement] = []
251
+ body: List[PageElement] = []
252
+ headers: List[PageElement] = []
253
253
 
254
254
 
255
255
  class Page(BaseModel):
@@ -12,6 +12,7 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
12
12
  from docling_core.types import Table as DsSchemaTable
13
13
  from docling_core.types import TableCell
14
14
  from pydantic import BaseModel
15
+ from typing_extensions import deprecated
15
16
 
16
17
  from docling.backend.abstract_backend import PdfDocumentBackend
17
18
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -49,6 +50,15 @@ layout_label_to_ds_type = {
49
50
  "Text": "paragraph",
50
51
  }
51
52
 
53
+ _EMPTY_DOC = DsDocument(
54
+ _name="",
55
+ description=DsDocumentDescription(logs=[]),
56
+ file_info=DsFileInfoObject(
57
+ filename="",
58
+ document_hash="",
59
+ ),
60
+ )
61
+
52
62
 
53
63
  class InputDocument(BaseModel):
54
64
  file: PurePath = None
@@ -115,6 +125,7 @@ class InputDocument(BaseModel):
115
125
  # raise
116
126
 
117
127
 
128
+ @deprecated("Use `ConversionResult` instead.")
118
129
  class ConvertedDocument(BaseModel):
119
130
  input: InputDocument
120
131
 
@@ -122,11 +133,11 @@ class ConvertedDocument(BaseModel):
122
133
  errors: List[ErrorItem] = [] # structure to keep errors
123
134
 
124
135
  pages: List[Page] = []
125
- assembled: Optional[AssembledUnit] = None
136
+ assembled: AssembledUnit = AssembledUnit()
126
137
 
127
- output: Optional[DsDocument] = None
138
+ output: DsDocument = _EMPTY_DOC
128
139
 
129
- def to_ds_document(self) -> DsDocument:
140
+ def _to_ds_document(self) -> DsDocument:
130
141
  title = ""
131
142
  desc = DsDocumentDescription(logs=[])
132
143
 
@@ -297,16 +308,10 @@ class ConvertedDocument(BaseModel):
297
308
  return ds_doc
298
309
 
299
310
  def render_as_dict(self):
300
- if self.output:
301
- return self.output.model_dump(by_alias=True, exclude_none=True)
302
- else:
303
- return {}
311
+ return self.output.model_dump(by_alias=True, exclude_none=True)
304
312
 
305
313
  def render_as_markdown(self):
306
- if self.output:
307
- return self.output.export_to_markdown()
308
- else:
309
- return ""
314
+ return self.output.export_to_markdown()
310
315
 
311
316
  def render_element_images(
312
317
  self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -323,6 +328,10 @@ class ConvertedDocument(BaseModel):
323
328
  yield element, cropped_im
324
329
 
325
330
 
331
+ class ConversionResult(ConvertedDocument):
332
+ pass
333
+
334
+
326
335
  class DocumentConversionInput(BaseModel):
327
336
 
328
337
  _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
@@ -7,7 +7,6 @@ from pathlib import Path
7
7
  from typing import Iterable, Optional, Type, Union
8
8
 
9
9
  import requests
10
- from docling_core.types import Document
11
10
  from PIL import ImageDraw
12
11
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
13
12
 
@@ -22,7 +21,7 @@ from docling.datamodel.base_models import (
22
21
  PipelineOptions,
23
22
  )
24
23
  from docling.datamodel.document import (
25
- ConvertedDocument,
24
+ ConversionResult,
26
25
  DocumentConversionInput,
27
26
  InputDocument,
28
27
  )
@@ -73,7 +72,7 @@ class DocumentConverter:
73
72
 
74
73
  return Path(download_path)
75
74
 
76
- def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
75
+ def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
77
76
 
78
77
  for input_batch in chunkify(
79
78
  input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
@@ -86,9 +85,9 @@ class DocumentConverter:
86
85
  # yield from pool.map(self.process_document, input_batch)
87
86
 
88
87
  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
89
- yield from map(self.process_document, input_batch)
88
+ yield from map(self._process_document, input_batch)
90
89
 
91
- def convert_single(self, source: Path | AnyHttpUrl | str) -> ConvertedDocument:
90
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
92
91
  """Convert a single document.
93
92
 
94
93
  Args:
@@ -99,7 +98,7 @@ class DocumentConverter:
99
98
  RuntimeError: If conversion fails.
100
99
 
101
100
  Returns:
102
- Document: The converted document object.
101
+ ConversionResult: The conversion result object.
103
102
  """
104
103
  with tempfile.TemporaryDirectory() as temp_dir:
105
104
  try:
@@ -129,51 +128,49 @@ class DocumentConverter:
129
128
  f"Unexpected file path type encountered: {type(source)}"
130
129
  )
131
130
  conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
132
- converted_docs_iter = self.convert(conv_inp)
133
- converted_doc: ConvertedDocument = next(converted_docs_iter)
134
- if converted_doc.status not in {
131
+ conv_res_iter = self.convert(conv_inp)
132
+ conv_res: ConversionResult = next(conv_res_iter)
133
+ if conv_res.status not in {
135
134
  ConversionStatus.SUCCESS,
136
135
  ConversionStatus.PARTIAL_SUCCESS,
137
136
  }:
138
- raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
139
- return converted_doc
137
+ raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
138
+ return conv_res
140
139
 
141
- def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
140
+ def _process_document(self, in_doc: InputDocument) -> ConversionResult:
142
141
  start_doc_time = time.time()
143
- converted_doc = ConvertedDocument(input=in_doc)
142
+ conv_res = ConversionResult(input=in_doc)
144
143
 
145
144
  _log.info(f"Processing document {in_doc.file.name}")
146
145
 
147
146
  if not in_doc.valid:
148
- converted_doc.status = ConversionStatus.FAILURE
149
- return converted_doc
147
+ conv_res.status = ConversionStatus.FAILURE
148
+ return conv_res
150
149
 
151
150
  for i in range(0, in_doc.page_count):
152
- converted_doc.pages.append(Page(page_no=i))
151
+ conv_res.pages.append(Page(page_no=i))
153
152
 
154
153
  all_assembled_pages = []
155
154
 
156
155
  try:
157
156
  # Iterate batches of pages (page_batch_size) in the doc
158
- for page_batch in chunkify(
159
- converted_doc.pages, settings.perf.page_batch_size
160
- ):
157
+ for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
161
158
  start_pb_time = time.time()
162
159
  # Pipeline
163
160
 
164
161
  # 1. Initialise the page resources
165
162
  init_pages = map(
166
- functools.partial(self.initialize_page, in_doc), page_batch
163
+ functools.partial(self._initialize_page, in_doc), page_batch
167
164
  )
168
165
 
169
166
  # 2. Populate page image
170
167
  pages_with_images = map(
171
- functools.partial(self.populate_page_images, in_doc), init_pages
168
+ functools.partial(self._populate_page_images, in_doc), init_pages
172
169
  )
173
170
 
174
171
  # 3. Populate programmatic page cells
175
172
  pages_with_cells = map(
176
- functools.partial(self.parse_page_cells, in_doc),
173
+ functools.partial(self._parse_page_cells, in_doc),
177
174
  pages_with_images,
178
175
  )
179
176
 
@@ -202,13 +199,13 @@ class DocumentConverter:
202
199
  # Free up mem resources of PDF backend
203
200
  in_doc._backend.unload()
204
201
 
205
- converted_doc.pages = all_assembled_pages
206
- self.assemble_doc(converted_doc)
202
+ conv_res.pages = all_assembled_pages
203
+ self._assemble_doc(conv_res)
207
204
 
208
205
  status = ConversionStatus.SUCCESS
209
- for page in converted_doc.pages:
206
+ for page in conv_res.pages:
210
207
  if not page._backend.is_valid():
211
- converted_doc.errors.append(
208
+ conv_res.errors.append(
212
209
  ErrorItem(
213
210
  component_type=DoclingComponentType.PDF_BACKEND,
214
211
  module_name=type(page._backend).__name__,
@@ -217,10 +214,10 @@ class DocumentConverter:
217
214
  )
218
215
  status = ConversionStatus.PARTIAL_SUCCESS
219
216
 
220
- converted_doc.status = status
217
+ conv_res.status = status
221
218
 
222
219
  except Exception as e:
223
- converted_doc.status = ConversionStatus.FAILURE
220
+ conv_res.status = ConversionStatus.FAILURE
224
221
  trace = "\n".join(traceback.format_exception(e))
225
222
  _log.info(
226
223
  f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
@@ -232,10 +229,10 @@ class DocumentConverter:
232
229
  f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
233
230
  )
234
231
 
235
- return converted_doc
232
+ return conv_res
236
233
 
237
234
  # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
238
- def initialize_page(self, doc: InputDocument, page: Page) -> Page:
235
+ def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
239
236
  page._backend = doc._backend.load_page(page.page_no)
240
237
  page.size = page._backend.get_size()
241
238
  page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
@@ -243,7 +240,7 @@ class DocumentConverter:
243
240
  return page
244
241
 
245
242
  # Generate the page image and store it in the page object
246
- def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
243
+ def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
247
244
  # default scale
248
245
  page.get_image(
249
246
  scale=1.0
@@ -259,7 +256,7 @@ class DocumentConverter:
259
256
  return page
260
257
 
261
258
  # Extract and populate the page cells and store it in the page object
262
- def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
259
+ def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
263
260
  page.cells = page._backend.get_text_cells()
264
261
 
265
262
  # DEBUG code:
@@ -274,12 +271,12 @@ class DocumentConverter:
274
271
 
275
272
  return page
276
273
 
277
- def assemble_doc(self, converted_doc: ConvertedDocument):
274
+ def _assemble_doc(self, conv_res: ConversionResult):
278
275
  all_elements = []
279
276
  all_headers = []
280
277
  all_body = []
281
278
 
282
- for p in converted_doc.pages:
279
+ for p in conv_res.pages:
283
280
 
284
281
  for el in p.assembled.body:
285
282
  all_body.append(el)
@@ -288,8 +285,8 @@ class DocumentConverter:
288
285
  for el in p.assembled.elements:
289
286
  all_elements.append(el)
290
287
 
291
- converted_doc.assembled = AssembledUnit(
288
+ conv_res.assembled = AssembledUnit(
292
289
  elements=all_elements, headers=all_headers, body=all_body
293
290
  )
294
291
 
295
- converted_doc.output = self.glm_model(converted_doc)
292
+ conv_res.output = self.glm_model(conv_res)
@@ -10,7 +10,7 @@ from docling_core.types import Ref
10
10
  from PIL import ImageDraw
11
11
 
12
12
  from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
13
- from docling.datamodel.document import ConvertedDocument
13
+ from docling.datamodel.document import ConversionResult
14
14
 
15
15
 
16
16
  class GlmModel:
@@ -20,8 +20,8 @@ class GlmModel:
20
20
  model = init_nlp_model(model_names="language;term;reference")
21
21
  self.model = model
22
22
 
23
- def __call__(self, document: ConvertedDocument) -> DsDocument:
24
- ds_doc = document.to_ds_document()
23
+ def __call__(self, conv_res: ConversionResult) -> DsDocument:
24
+ ds_doc = conv_res._to_ds_document()
25
25
  ds_doc_dict = ds_doc.model_dump(by_alias=True)
26
26
 
27
27
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
@@ -34,7 +34,7 @@ class GlmModel:
34
34
  # DEBUG code:
35
35
  def draw_clusters_and_cells(ds_document, page_no):
36
36
  clusters_to_draw = []
37
- image = copy.deepcopy(document.pages[page_no].image)
37
+ image = copy.deepcopy(conv_res.pages[page_no].image)
38
38
  for ix, elem in enumerate(ds_document.main_text):
39
39
  if isinstance(elem, BaseText):
40
40
  prov = elem.prov[0]
@@ -56,7 +56,7 @@ class GlmModel:
56
56
  bbox=BoundingBox.from_tuple(
57
57
  coord=prov.bbox,
58
58
  origin=CoordOrigin.BOTTOMLEFT,
59
- ).to_top_left_origin(document.pages[page_no].size.height),
59
+ ).to_top_left_origin(conv_res.pages[page_no].size.height),
60
60
  )
61
61
  )
62
62
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.8.1" # DO NOT EDIT, updated automatically
3
+ version = "1.8.2" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes