docling 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
- from docling_parse.docling_parse import pdf_parser_v1
9
+ from docling_parse.pdf_parsers import pdf_parser_v1
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_parse.docling_parse import pdf_parser_v2
9
+ from docling_parse.pdf_parsers import pdf_parser_v2
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
210
210
  self.parser = pdf_parser_v2("fatal")
211
211
 
212
212
  success = False
213
- if isinstance(path_or_stream, BytesIO):
213
+ if isinstance(self.path_or_stream, BytesIO):
214
214
  success = self.parser.load_document_from_bytesio(
215
- self.document_hash, path_or_stream
215
+ self.document_hash, self.path_or_stream
216
+ )
217
+ elif isinstance(self.path_or_stream, Path):
218
+ success = self.parser.load_document(
219
+ self.document_hash, str(self.path_or_stream)
216
220
  )
217
- elif isinstance(path_or_stream, Path):
218
- success = self.parser.load_document(self.document_hash, str(path_or_stream))
219
221
 
220
222
  if not success:
221
223
  raise RuntimeError(
docling/cli/main.py CHANGED
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
27
27
  from docling.datamodel.document import ConversionResult
28
28
  from docling.datamodel.pipeline_options import (
29
29
  EasyOcrOptions,
30
+ OcrEngine,
30
31
  OcrMacOptions,
31
32
  OcrOptions,
33
+ PdfBackend,
32
34
  PdfPipelineOptions,
33
35
  RapidOcrOptions,
34
36
  TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
68
70
  raise typer.Exit()
69
71
 
70
72
 
71
- # Define an enum for the backend options
72
- class PdfBackend(str, Enum):
73
- PYPDFIUM2 = "pypdfium2"
74
- DLPARSE_V1 = "dlparse_v1"
75
- DLPARSE_V2 = "dlparse_v2"
76
-
77
-
78
- # Define an enum for the ocr engines
79
- class OcrEngine(str, Enum):
80
- EASYOCR = "easyocr"
81
- TESSERACT_CLI = "tesseract_cli"
82
- TESSERACT = "tesseract"
83
- OCRMAC = "ocrmac"
84
- RAPIDOCR = "rapidocr"
85
-
86
-
87
73
  def export_documents(
88
74
  conv_results: Iterable[ConversionResult],
89
75
  output_dir: Path,
@@ -208,7 +194,7 @@ def convert(
208
194
  ] = None,
209
195
  pdf_backend: Annotated[
210
196
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
211
- ] = PdfBackend.DLPARSE_V1,
197
+ ] = PdfBackend.DLPARSE_V2,
212
198
  table_mode: Annotated[
213
199
  TableFormerMode,
214
200
  typer.Option(..., help="The mode to use in the table structure model."),
@@ -264,6 +250,13 @@ def convert(
264
250
  help="Show version information.",
265
251
  ),
266
252
  ] = None,
253
+ document_timeout: Annotated[
254
+ Optional[float],
255
+ typer.Option(
256
+ ...,
257
+ help="The timeout for processing each document, in seconds.",
258
+ ),
259
+ ] = None,
267
260
  ):
268
261
  if verbose == 0:
269
262
  logging.basicConfig(level=logging.WARNING)
@@ -347,6 +340,7 @@ def convert(
347
340
  do_ocr=ocr,
348
341
  ocr_options=ocr_options,
349
342
  do_table_structure=True,
343
+ document_timeout=document_timeout,
350
344
  )
351
345
  pipeline_options.table_structure_options.do_cell_matching = (
352
346
  True # do_cell_matching
@@ -372,11 +366,13 @@ def convert(
372
366
  else:
373
367
  raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
374
368
 
369
+ pdf_format_option = PdfFormatOption(
370
+ pipeline_options=pipeline_options,
371
+ backend=backend, # pdf_backend
372
+ )
375
373
  format_options: Dict[InputFormat, FormatOption] = {
376
- InputFormat.PDF: PdfFormatOption(
377
- pipeline_options=pipeline_options,
378
- backend=backend, # pdf_backend
379
- )
374
+ InputFormat.PDF: pdf_format_option,
375
+ InputFormat.IMAGE: pdf_format_option,
380
376
  }
381
377
  doc_converter = DocumentConverter(
382
378
  allowed_formats=from_formats,
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
 
20
20
 
21
21
  class ConversionStatus(str, Enum):
22
- PENDING = auto()
23
- STARTED = auto()
24
- FAILURE = auto()
25
- SUCCESS = auto()
26
- PARTIAL_SUCCESS = auto()
27
- SKIPPED = auto()
22
+ PENDING = "pending"
23
+ STARTED = "started"
24
+ FAILURE = "failure"
25
+ SUCCESS = "success"
26
+ PARTIAL_SUCCESS = "partial_success"
27
+ SKIPPED = "skipped"
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
89
89
 
90
90
 
91
91
  class DocInputType(str, Enum):
92
- PATH = auto()
93
- STREAM = auto()
92
+ PATH = "path"
93
+ STREAM = "stream"
94
94
 
95
95
 
96
96
  class DoclingComponentType(str, Enum):
97
- DOCUMENT_BACKEND = auto()
98
- MODEL = auto()
99
- DOC_ASSEMBLER = auto()
100
- USER_INPUT = auto()
97
+ DOCUMENT_BACKEND = "document_backend"
98
+ MODEL = "model"
99
+ DOC_ASSEMBLER = "doc_assembler"
100
+ USER_INPUT = "user_input"
101
101
 
102
102
 
103
103
  class ErrorItem(BaseModel):
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
33
33
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
34
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
35
  from docling_core.utils.file import resolve_source_to_stream
36
+ from docling_core.utils.legacy import docling_document_to_legacy
36
37
  from pydantic import BaseModel
37
38
  from typing_extensions import deprecated
38
39
 
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
189
190
  @property
190
191
  @deprecated("Use document instead.")
191
192
  def legacy_document(self):
192
- reverse_label_mapping = {
193
- DocItemLabel.CAPTION.value: "Caption",
194
- DocItemLabel.FOOTNOTE.value: "Footnote",
195
- DocItemLabel.FORMULA.value: "Formula",
196
- DocItemLabel.LIST_ITEM.value: "List-item",
197
- DocItemLabel.PAGE_FOOTER.value: "Page-footer",
198
- DocItemLabel.PAGE_HEADER.value: "Page-header",
199
- DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
200
- DocItemLabel.SECTION_HEADER.value: "Section-header",
201
- DocItemLabel.TABLE.value: "Table",
202
- DocItemLabel.TEXT.value: "Text",
203
- DocItemLabel.TITLE.value: "Title",
204
- DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
205
- DocItemLabel.CODE.value: "Code",
206
- DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
207
- DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
208
- DocItemLabel.FORM.value: "Form",
209
- DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
210
- DocItemLabel.PARAGRAPH.value: "paragraph",
211
- }
212
-
213
- title = ""
214
- desc = DsDocumentDescription(logs=[])
215
-
216
- page_hashes = [
217
- PageReference(
218
- hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
219
- page=p.page_no,
220
- model="default",
221
- )
222
- for p in self.document.pages.values()
223
- ]
224
-
225
- file_info = DsFileInfoObject(
226
- filename=self.input.file.name,
227
- document_hash=self.input.document_hash,
228
- num_pages=self.input.page_count,
229
- page_hashes=page_hashes,
230
- )
231
-
232
- main_text = []
233
- tables = []
234
- figures = []
235
- equations = []
236
- footnotes = []
237
- page_headers = []
238
- page_footers = []
239
-
240
- embedded_captions = set()
241
- for ix, (item, level) in enumerate(
242
- self.document.iterate_items(self.document.body)
243
- ):
244
-
245
- if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
246
- caption = item.caption_text(self.document)
247
- if caption:
248
- embedded_captions.add(caption)
249
-
250
- for item, level in self.document.iterate_items():
251
- if isinstance(item, DocItem):
252
- item_type = item.label
253
-
254
- if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
255
-
256
- if isinstance(item, ListItem) and item.marker:
257
- text = f"{item.marker} {item.text}"
258
- else:
259
- text = item.text
260
-
261
- # Can be empty.
262
- prov = [
263
- Prov(
264
- bbox=p.bbox.as_tuple(),
265
- page=p.page_no,
266
- span=[0, len(item.text)],
267
- )
268
- for p in item.prov
269
- ]
270
- main_text.append(
271
- BaseText(
272
- text=text,
273
- obj_type=layout_label_to_ds_type.get(item.label),
274
- name=reverse_label_mapping[item.label],
275
- prov=prov,
276
- )
277
- )
278
-
279
- # skip captions of they are embedded in the actual
280
- # floating object
281
- if item_type == DocItemLabel.CAPTION and text in embedded_captions:
282
- continue
283
-
284
- elif isinstance(item, TableItem) and item.data:
285
- index = len(tables)
286
- ref_str = f"#/tables/{index}"
287
- main_text.append(
288
- Ref(
289
- name=reverse_label_mapping[item.label],
290
- obj_type=layout_label_to_ds_type.get(item.label),
291
- ref=ref_str,
292
- ),
293
- )
294
-
295
- # Initialise empty table data grid (only empty cells)
296
- table_data = [
297
- [
298
- TableCell(
299
- text="",
300
- # bbox=[0,0,0,0],
301
- spans=[[i, j]],
302
- obj_type="body",
303
- )
304
- for j in range(item.data.num_cols)
305
- ]
306
- for i in range(item.data.num_rows)
307
- ]
308
-
309
- # Overwrite cells in table data for which there is actual cell content.
310
- for cell in item.data.table_cells:
311
- for i in range(
312
- min(cell.start_row_offset_idx, item.data.num_rows),
313
- min(cell.end_row_offset_idx, item.data.num_rows),
314
- ):
315
- for j in range(
316
- min(cell.start_col_offset_idx, item.data.num_cols),
317
- min(cell.end_col_offset_idx, item.data.num_cols),
318
- ):
319
- celltype = "body"
320
- if cell.column_header:
321
- celltype = "col_header"
322
- elif cell.row_header:
323
- celltype = "row_header"
324
- elif cell.row_section:
325
- celltype = "row_section"
326
-
327
- def make_spans(cell):
328
- for rspan in range(
329
- min(
330
- cell.start_row_offset_idx,
331
- item.data.num_rows,
332
- ),
333
- min(
334
- cell.end_row_offset_idx, item.data.num_rows
335
- ),
336
- ):
337
- for cspan in range(
338
- min(
339
- cell.start_col_offset_idx,
340
- item.data.num_cols,
341
- ),
342
- min(
343
- cell.end_col_offset_idx,
344
- item.data.num_cols,
345
- ),
346
- ):
347
- yield [rspan, cspan]
348
-
349
- spans = list(make_spans(cell))
350
- table_data[i][j] = GlmTableCell(
351
- text=cell.text,
352
- bbox=(
353
- cell.bbox.as_tuple()
354
- if cell.bbox is not None
355
- else None
356
- ), # check if this is bottom-left
357
- spans=spans,
358
- obj_type=celltype,
359
- col=j,
360
- row=i,
361
- row_header=cell.row_header,
362
- row_section=cell.row_section,
363
- col_header=cell.column_header,
364
- row_span=[
365
- cell.start_row_offset_idx,
366
- cell.end_row_offset_idx,
367
- ],
368
- col_span=[
369
- cell.start_col_offset_idx,
370
- cell.end_col_offset_idx,
371
- ],
372
- )
373
-
374
- # Compute the caption
375
- caption = item.caption_text(self.document)
376
-
377
- tables.append(
378
- DsSchemaTable(
379
- text=caption,
380
- num_cols=item.data.num_cols,
381
- num_rows=item.data.num_rows,
382
- obj_type=layout_label_to_ds_type.get(item.label),
383
- data=table_data,
384
- prov=[
385
- Prov(
386
- bbox=p.bbox.as_tuple(),
387
- page=p.page_no,
388
- span=[0, 0],
389
- )
390
- for p in item.prov
391
- ],
392
- )
393
- )
394
-
395
- elif isinstance(item, PictureItem):
396
- index = len(figures)
397
- ref_str = f"#/figures/{index}"
398
- main_text.append(
399
- Ref(
400
- name=reverse_label_mapping[item.label],
401
- obj_type=layout_label_to_ds_type.get(item.label),
402
- ref=ref_str,
403
- ),
404
- )
405
-
406
- # Compute the caption
407
- caption = item.caption_text(self.document)
408
-
409
- figures.append(
410
- Figure(
411
- prov=[
412
- Prov(
413
- bbox=p.bbox.as_tuple(),
414
- page=p.page_no,
415
- span=[0, len(caption)],
416
- )
417
- for p in item.prov
418
- ],
419
- obj_type=layout_label_to_ds_type.get(item.label),
420
- text=caption,
421
- # data=[[]],
422
- )
423
- )
424
-
425
- page_dimensions = [
426
- PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
427
- for p in self.document.pages.values()
428
- ]
429
-
430
- ds_doc = DsDocument(
431
- name=title,
432
- description=desc,
433
- file_info=file_info,
434
- main_text=main_text,
435
- equations=equations,
436
- footnotes=footnotes,
437
- page_headers=page_headers,
438
- page_footers=page_footers,
439
- tables=tables,
440
- figures=figures,
441
- page_dimensions=page_dimensions,
442
- )
443
-
444
- return ds_doc
193
+ return docling_document_to_legacy(self.document)
445
194
 
446
195
 
447
196
  class _DummyBackend(AbstractDocumentBackend):
@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
126
126
  )
127
127
 
128
128
 
129
+ # Define an enum for the backend options
130
class PdfBackend(str, Enum):
    """Names of the PDF backends a caller can choose between.

    The class mixes in ``str``, so every member compares equal to its plain
    string value and can be looked up from that string
    (``PdfBackend("dlparse_v2")``).
    """

    # pypdfium2-based backend
    PYPDFIUM2 = "pypdfium2"
    # docling-parse backend, version 1
    DLPARSE_V1 = "dlparse_v1"
    # docling-parse backend, version 2
    DLPARSE_V2 = "dlparse_v2"
136
+
137
+
138
+ # Define an enum for the ocr engines
139
class OcrEngine(str, Enum):
    """Names of the OCR engines a caller can choose between.

    The class mixes in ``str``, so every member compares equal to its plain
    string value and can be looked up from that string
    (``OcrEngine("easyocr")``).
    """

    EASYOCR = "easyocr"
    # Tesseract has two entries: "tesseract_cli" and "tesseract".
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
147
+
148
+
129
149
  class PipelineOptions(BaseModel):
130
150
  """Base pipeline options."""
131
151
 
132
152
  create_legacy_output: bool = (
133
- True # This defautl will be set to False on a future version of docling
153
+ True # This default will be set to False on a future version of docling
134
154
  )
155
+ document_timeout: Optional[float] = None
135
156
 
136
157
 
137
158
  class PdfPipelineOptions(PipelineOptions):
@@ -143,7 +164,11 @@ class PdfPipelineOptions(PipelineOptions):
143
164
 
144
165
  table_structure_options: TableStructureOptions = TableStructureOptions()
145
166
  ocr_options: Union[
146
- EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
167
+ EasyOcrOptions,
168
+ TesseractCliOcrOptions,
169
+ TesseractOcrOptions,
170
+ OcrMacOptions,
171
+ RapidOcrOptions,
147
172
  ] = Field(EasyOcrOptions(), discriminator="kind")
148
173
 
149
174
  images_scale: float = 1.0
@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
10
10
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
11
  from docling.backend.asciidoc_backend import AsciiDocBackend
12
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
12
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
14
  from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
84
84
 
85
85
  class PdfFormatOption(FormatOption):
86
86
  pipeline_cls: Type = StandardPdfPipeline
87
- backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
87
+ backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
88
88
 
89
89
 
90
90
  class ImageFormatOption(FormatOption):
91
91
  pipeline_cls: Type = StandardPdfPipeline
92
- backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
92
+ backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
93
93
 
94
94
 
95
95
  def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
113
113
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
114
  ),
115
115
  InputFormat.IMAGE: FormatOption(
116
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
116
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
117
117
  ),
118
118
  InputFormat.PDF: FormatOption(
119
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
119
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
120
120
  ),
121
121
  }
122
122
  if (options := format_to_default_options.get(format)) is not None:
@@ -3,9 +3,7 @@ import random
3
3
  from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
- from deepsearch_glm.nlp_utils import init_nlp_model
7
- from deepsearch_glm.utils.doc_utils import to_docling_document
8
- from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
6
+ from deepsearch_glm.andromeda_nlp import nlp_model
9
7
  from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
10
8
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
11
9
  from docling_core.types.legacy_doc.base import (
@@ -29,6 +27,7 @@ from pydantic import BaseModel, ConfigDict
29
27
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
30
28
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
29
  from docling.datamodel.settings import settings
30
+ from docling.utils.glm_utils import to_docling_document
32
31
  from docling.utils.profiling import ProfilingScope, TimeRecorder
33
32
  from docling.utils.utils import create_hash
34
33
 
@@ -43,9 +42,7 @@ class GlmModel:
43
42
  def __init__(self, options: GlmOptions):
44
43
  self.options = options
45
44
 
46
- if self.options.model_names != "":
47
- load_pretrained_nlp_models()
48
- self.model = init_nlp_model(model_names=self.options.model_names)
45
+ self.model = nlp_model(loglevel="error", text_ordering=True)
49
46
 
50
47
  def _to_legacy_document(self, conv_res) -> DsDocument:
51
48
  title = ""
@@ -232,7 +229,7 @@ class GlmModel:
232
229
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
233
230
  with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
234
231
  ds_doc = self._to_legacy_document(conv_res)
235
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
236
233
 
237
234
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
238
235
 
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
118
118
  del high_res_image
119
119
  del im
120
120
 
121
- cells = [
122
- OcrCell(
123
- id=ix,
124
- text=line[1],
125
- confidence=line[2],
126
- bbox=BoundingBox.from_tuple(
127
- coord=(
128
- (line[0][0][0] / self.scale) + ocr_rect.l,
129
- (line[0][0][1] / self.scale) + ocr_rect.t,
130
- (line[0][2][0] / self.scale) + ocr_rect.l,
131
- (line[0][2][1] / self.scale) + ocr_rect.t,
121
+ if result is not None:
122
+ cells = [
123
+ OcrCell(
124
+ id=ix,
125
+ text=line[1],
126
+ confidence=line[2],
127
+ bbox=BoundingBox.from_tuple(
128
+ coord=(
129
+ (line[0][0][0] / self.scale) + ocr_rect.l,
130
+ (line[0][0][1] / self.scale) + ocr_rect.t,
131
+ (line[0][2][0] / self.scale) + ocr_rect.l,
132
+ (line[0][2][1] / self.scale) + ocr_rect.t,
133
+ ),
134
+ origin=CoordOrigin.TOPLEFT,
132
135
  ),
133
- origin=CoordOrigin.TOPLEFT,
134
- ),
135
- )
136
- for ix, line in enumerate(result)
137
- ]
138
- all_ocr_cells.extend(cells)
136
+ )
137
+ for ix, line in enumerate(result)
138
+ ]
139
+ all_ocr_cells.extend(cells)
139
140
 
140
141
  # Post-process the cells
141
142
  page.cells = self.post_process_cells(all_ocr_cells, page.cells)
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
126
126
  # conv_res.status = ConversionStatus.FAILURE
127
127
  # return conv_res
128
128
 
129
+ total_elapsed_time = 0.0
129
130
  with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
130
131
 
131
132
  for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
136
137
  for page_batch in chunkify(
137
138
  conv_res.pages, settings.perf.page_batch_size
138
139
  ):
139
- start_pb_time = time.time()
140
+ start_batch_time = time.monotonic()
140
141
 
141
142
  # 1. Initialise the page resources
142
143
  init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
149
150
  for p in pipeline_pages: # Must exhaust!
150
151
  pass
151
152
 
152
- end_pb_time = time.time() - start_pb_time
153
- _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
153
+ end_batch_time = time.monotonic()
154
+ total_elapsed_time += end_batch_time - start_batch_time
155
+ if (
156
+ self.pipeline_options.document_timeout is not None
157
+ and total_elapsed_time > self.pipeline_options.document_timeout
158
+ ):
159
+ _log.warning(
160
+ f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
161
+ )
162
+ conv_res.status = ConversionStatus.PARTIAL_SUCCESS
163
+ break
164
+
165
+ _log.debug(
166
+ f"Finished converting page batch time={end_batch_time:.3f}"
167
+ )
154
168
 
155
169
  except Exception as e:
156
170
  conv_res.status = ConversionStatus.FAILURE
@@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
97
97
  local_dir: Optional[Path] = None, force: bool = False
98
98
  ) -> Path:
99
99
  from huggingface_hub import snapshot_download
100
+ from huggingface_hub.utils import disable_progress_bars
100
101
 
102
+ disable_progress_bars()
101
103
  download_path = snapshot_download(
102
104
  repo_id="ds4sd/docling-models",
103
105
  force_download=force,
@@ -0,0 +1,336 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ProvenanceItem,
14
+ Size,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+
19
+
20
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Args:
        paths: Sequence of path segments, typically the result of splitting
            a reference such as ``"#/tables/0/data"`` on ``"/"``. Segments
            equal to ``"#"`` are skipped; segments that parse as integers
            are used as sequence indices.
        obj: The (nested) container to walk, e.g. a dict of lists of dicts.

    Returns:
        The object the path points at, ``obj`` itself for an empty path, or
        ``None`` when any segment cannot be resolved.
    """
    from collections.abc import Mapping

    current = obj
    for part in paths:
        if part == "#":
            # "#" only marks the root of the reference; it selects nothing.
            continue

        try:
            key = int(part)
        except (TypeError, ValueError):
            key = part

        if isinstance(current, Mapping):
            if key in current:
                current = current[key]
            elif part in current:
                # Numeric-looking segment addressing a string key ("0").
                current = current[part]
            else:
                return None
        elif isinstance(key, int):
            try:
                # Reject negative indices explicitly: they would otherwise
                # wrap around and resolve to an unrelated element.
                if not 0 <= key < len(current):
                    return None
                current = current[key]
            except TypeError:
                # `current` has no length or is not indexable.
                return None
        else:
            # A named segment cannot address a non-mapping container.
            return None

    return current
52
+
53
+
54
+ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
55
+ unique_objects = []
56
+ seen_spans = set()
57
+
58
+ for sublist in grid:
59
+ for obj in sublist:
60
+ # Convert the spans list to a tuple of tuples for hashing
61
+ spans_tuple = tuple(tuple(span) for span in obj["spans"])
62
+ if spans_tuple not in seen_spans:
63
+ seen_spans.add(spans_tuple)
64
+ unique_objects.append(obj)
65
+
66
+ return unique_objects
67
+
68
+
69
def _glm_provenance(elem, charspan):
    """Build a ProvenanceItem from a GLM element's page and bottom-left bbox."""
    return ProvenanceItem(
        page_no=elem["page"],
        charspan=charspan,
        bbox=BoundingBox.from_tuple(elem["bbox"], origin=CoordOrigin.BOTTOMLEFT),
    )


def _add_caption_items(doc, doc_glm, obj):
    """Add a CAPTION text item for each resolvable caption provenance of obj.

    Returns a ``(text, refs)`` pair: the concatenation of all caption texts
    and the references of the caption items that were added to ``doc``.
    """
    text = ""
    caption_refs = []

    for caption in obj["captions"]:
        text += caption["text"]

        for nprov in caption["prov"]:
            nelem = resolve_item(nprov["$ref"].split("/"), doc_glm)
            if nelem is None:
                # Dangling provenance reference: nothing to attach.
                continue

            span = nelem["span"]
            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=caption["text"][span[0] : span[1]],
                prov=_glm_provenance(nelem, tuple(span)),
            )
            caption_refs.append(caption_obj.get_ref())

    return text, caption_refs


def _to_table_cell(cell_glm):
    """Convert one GLM table-cell dict into a TableCell."""
    # Tolerate cells whose "bbox" key is absent as well as explicitly None.
    raw_bbox = cell_glm.get("bbox")
    if raw_bbox is not None:
        bbox = BoundingBox.from_tuple(raw_bbox, origin=CoordOrigin.BOTTOMLEFT)
    else:
        bbox = None

    cell_type = cell_glm["type"]
    row_start, row_end = cell_glm["row-span"][0], cell_glm["row-span"][1]
    col_start, col_end = cell_glm["col-span"][0], cell_glm["col-span"][1]

    return TableCell(
        row_span=row_end - row_start,
        col_span=col_end - col_start,
        start_row_offset_idx=row_start,
        end_row_offset_idx=row_end,
        start_col_offset_idx=col_start,
        end_col_offset_idx=col_end,
        text=cell_glm["text"],
        bbox=bbox,
        column_header=cell_type == "col_header",
        row_header=cell_type == "row_header",
        row_section=cell_type == "row_section",
    )


def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Convert a GLM document dict into a DoclingDocument.

    Args:
        doc_glm: Dict read for the keys ``"file-info"``,
            ``"page-dimensions"``, ``"page-elements"`` and, optionally,
            ``"properties"``; page elements are resolved against it through
            their ``"iref"`` reference.
        update_name_label: When true, a ``"paragraph"`` element's label is
            replaced by the label of its single matching ``"semantic"``
            property, provided that property's confidence exceeds 0.85.

    Returns:
        The populated DoclingDocument, named after the stem of the source
        filename.
    """
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page_dim in doc_glm["page-dimensions"]:
        doc.add_page(
            page_no=int(page_dim["page"]),
            size=Size(width=page_dim["width"], height=page_dim["height"]),
        )

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Group that consecutive list items are attached to; reset by most
    # other element kinds so that a new run of items starts a new list.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        if "iref" not in pelem:
            continue

        iref = pelem["iref"]

        # Captions are emitted together with their figure/table below.
        if re.match(r"#/(?:figures|tables)/\d+/captions/.+", iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            # NOTE(review): this warning goes to stdout via print.
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            text, caption_refs = _add_caption_items(doc, doc_glm, obj)

            pic = doc.add_picture(prov=_glm_provenance(pelem, (0, len(text))))
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            _, caption_refs = _add_caption_items(doc, doc_glm, obj)

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=[
                    _to_table_cell(cell_glm)
                    for cell_glm in _flatten_table_grid(obj["data"])
                ],
            )

            tbl = doc.add_table(data=tbl_data, prov=_glm_provenance(pelem, (0, 0)))
            tbl.captions.extend(caption_refs)

        elif ptype in ["form", "key_value_region"]:
            label = DocItemLabel(ptype)
            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            text = obj["text"][span_i:span_j]

            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and ptype == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                # Only trust an unambiguous, high-confidence semantic label.
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = _glm_provenance(pelem, (0, len(text)))
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            else:
                current_list = None

                doc.add_text(label=label, text=text, prov=prov)

    return doc
305
+
306
+
307
+ def _add_child_elements(container_el, doc, obj, pelem):
308
+ payload = obj.get("payload")
309
+ if payload is not None:
310
+ children = payload.get("children", [])
311
+
312
+ for child in children:
313
+ c_label = DocItemLabel(child["label"])
314
+ c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
315
+ doc.pages[pelem["page"]].size.height
316
+ )
317
+ c_text = " ".join(
318
+ [
319
+ cell["text"].replace("\x02", "-").strip()
320
+ for cell in child["cells"]
321
+ if len(cell["text"].strip()) > 0
322
+ ]
323
+ )
324
+
325
+ c_prov = ProvenanceItem(
326
+ page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
327
+ )
328
+ if c_label == DocItemLabel.LIST_ITEM:
329
+ # TODO: Infer if this is a numbered or a bullet list item
330
+ doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
331
+ elif c_label == DocItemLabel.SECTION_HEADER:
332
+ doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
333
+ else:
334
+ doc.add_text(
335
+ parent=container_el, label=c_label, text=c_text, prov=c_prov
336
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.9.0
3
+ Version: 2.11.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
25
25
  Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
- Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core[chunking] (>=2.8.0,<3.0.0)
28
+ Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
- Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
+ Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -2,8 +2,8 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
4
  docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
- docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
6
- docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
5
+ docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
6
+ docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
7
7
  docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
@@ -13,39 +13,40 @@ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
14
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
15
15
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- docling/cli/main.py,sha256=dWGiweoxaD4ptORsx1lzxc1zcDg2fgd4b-GG1cSdVTA,14628
16
+ docling/cli/main.py,sha256=FFDUDADvK7QNW7xCs6dlsC7Bt_BMyrKdbZewKTEjm54,14624
17
17
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/datamodel/base_models.py,sha256=627IB8HZdXGmHNfsX4Qhf7kKSxx2btPjS7z8hitvhyE,5560
19
- docling/datamodel/document.py,sha256=Y0NEFphwz44VxIaRaDRhtmw6rifzSC7MqyaDBzaR0lM,20902
20
- docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
18
+ docling/datamodel/base_models.py,sha256=vwy59eDrkzCSaay24RlUvx4zEyuaUukOdOhw3622u2I,5616
19
+ docling/datamodel/document.py,sha256=GNlTsgKgDqdqv2dfhpYmnqymxDQWWWC8HgE8uAta8V4,10265
20
+ docling/datamodel/pipeline_options.py,sha256=1ouWNE5VhZolrWMb4RE6s_AxgNFr3_3PMtxB_YQ391A,5495
21
21
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
22
- docling/document_converter.py,sha256=bsXGQCUrbL2LmaqaaEmlkfSANl2XwBBx8HDLwFrqhFY,11570
22
+ docling/document_converter.py,sha256=Iz5eerBWFPVJoXAMlXEivRQX2VLBiUkA07BL4NNbaEs,11583
23
23
  docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
24
24
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
26
26
  docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
27
- docling/models/ds_glm_model.py,sha256=hBRCx6oFGhxBbKEJlRSWVndDwFtB5IpeLOowFAVqFM0,12033
27
+ docling/models/ds_glm_model.py,sha256=YJkGxV46wh7G2Wr4vVzt9b8oewkUDPWpvI6AEaZDrs0,11872
28
28
  docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
29
29
  docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
30
30
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
31
31
  docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
32
32
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
33
- docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
33
+ docling/models/rapid_ocr_model.py,sha256=ui152cerv9b9OeWyyyefs8qMLwYn0qsE2DFE_gHmaCM,6124
34
34
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
35
35
  docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
36
36
  docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
37
37
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
38
+ docling/pipeline/base_pipeline.py,sha256=hVvtk5E4DVZdl_SyNs_pYRUjN9C8PABhpVaeN5Z_fAY,7885
39
39
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
40
- docling/pipeline/standard_pdf_pipeline.py,sha256=7sbkh9EwXlhSfJSgf-WyjB5jdJ1El7Pn4siSssTJpq8,8789
40
+ docling/pipeline/standard_pdf_pipeline.py,sha256=B1q8xt3Dfecpi8s8DrcfPzdATh8TYgL43FDzzcS4vEA,8885
41
41
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
42
42
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
43
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
44
+ docling/utils/glm_utils.py,sha256=H1O_tDiRksMgw45rY9LhK6GjcZSOq5IyoGurGjoo-Ac,11211
44
45
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
45
46
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
46
47
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
47
- docling-2.9.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
48
- docling-2.9.0.dist-info/METADATA,sha256=AFpGTTO_IIyzV5AglCh8hhj3Z7UopU7mPkJ3n_UzrUs,7732
49
- docling-2.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
50
- docling-2.9.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
51
- docling-2.9.0.dist-info/RECORD,,
48
+ docling-2.11.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
49
+ docling-2.11.0.dist-info/METADATA,sha256=ajUVy5CuNDUp0x9tMCqO2px2M-ia-Vs7frIyb0_HxMo,7731
50
+ docling-2.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
+ docling-2.11.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
52
+ docling-2.11.0.dist-info/RECORD,,