docling 2.8.0__py3-none-any.whl → 2.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -2,6 +2,7 @@ import importlib
2
2
  import json
3
3
  import logging
4
4
  import re
5
+ import tempfile
5
6
  import time
6
7
  import warnings
7
8
  from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
9
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
10
11
 
11
12
  import typer
12
- from docling_core.utils.file import resolve_file_source
13
+ from docling_core.utils.file import resolve_source_to_path
13
14
 
14
15
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
15
16
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -32,6 +33,7 @@ from docling.datamodel.pipeline_options import (
32
33
  TesseractCliOcrOptions,
33
34
  TesseractOcrOptions,
34
35
  )
36
+ from docling.datamodel.settings import settings
35
37
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
36
38
 
37
39
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -212,6 +214,24 @@ def convert(
212
214
  help="Set the verbosity level. -v for info logging, -vv for debug logging.",
213
215
  ),
214
216
  ] = 0,
217
+ debug_visualize_cells: Annotated[
218
+ bool,
219
+ typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
220
+ ] = False,
221
+ debug_visualize_ocr: Annotated[
222
+ bool,
223
+ typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
224
+ ] = False,
225
+ debug_visualize_layout: Annotated[
226
+ bool,
227
+ typer.Option(
228
+ ..., help="Enable debug output which visualizes the layour clusters"
229
+ ),
230
+ ] = False,
231
+ debug_visualize_tables: Annotated[
232
+ bool,
233
+ typer.Option(..., help="Enable debug output which visualizes the table cells"),
234
+ ] = False,
215
235
  version: Annotated[
216
236
  Optional[bool],
217
237
  typer.Option(
@@ -229,98 +249,106 @@ def convert(
229
249
  elif verbose == 2:
230
250
  logging.basicConfig(level=logging.DEBUG)
231
251
 
252
+ settings.debug.visualize_cells = debug_visualize_cells
253
+ settings.debug.visualize_layout = debug_visualize_layout
254
+ settings.debug.visualize_tables = debug_visualize_tables
255
+ settings.debug.visualize_ocr = debug_visualize_ocr
256
+
232
257
  if from_formats is None:
233
258
  from_formats = [e for e in InputFormat]
234
259
 
235
- input_doc_paths: List[Path] = []
236
- for src in input_sources:
237
- source = resolve_file_source(source=src)
238
- if not source.exists():
239
- err_console.print(
240
- f"[red]Error: The input file {source} does not exist.[/red]"
241
- )
242
- raise typer.Abort()
243
- elif source.is_dir():
244
- for fmt in from_formats:
245
- for ext in FormatToExtensions[fmt]:
246
- input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
247
- input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
260
+ with tempfile.TemporaryDirectory() as tempdir:
261
+ input_doc_paths: List[Path] = []
262
+ for src in input_sources:
263
+ source = resolve_source_to_path(source=src, workdir=Path(tempdir))
264
+ if not source.exists():
265
+ err_console.print(
266
+ f"[red]Error: The input file {source} does not exist.[/red]"
267
+ )
268
+ raise typer.Abort()
269
+ elif source.is_dir():
270
+ for fmt in from_formats:
271
+ for ext in FormatToExtensions[fmt]:
272
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
273
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
274
+ else:
275
+ input_doc_paths.append(source)
276
+
277
+ if to_formats is None:
278
+ to_formats = [OutputFormat.MARKDOWN]
279
+
280
+ export_json = OutputFormat.JSON in to_formats
281
+ export_md = OutputFormat.MARKDOWN in to_formats
282
+ export_txt = OutputFormat.TEXT in to_formats
283
+ export_doctags = OutputFormat.DOCTAGS in to_formats
284
+
285
+ if ocr_engine == OcrEngine.EASYOCR:
286
+ ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
287
+ elif ocr_engine == OcrEngine.TESSERACT_CLI:
288
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
289
+ elif ocr_engine == OcrEngine.TESSERACT:
290
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
291
+ elif ocr_engine == OcrEngine.OCRMAC:
292
+ ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
293
+ elif ocr_engine == OcrEngine.RAPIDOCR:
294
+ ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
248
295
  else:
249
- input_doc_paths.append(source)
250
-
251
- if to_formats is None:
252
- to_formats = [OutputFormat.MARKDOWN]
253
-
254
- export_json = OutputFormat.JSON in to_formats
255
- export_md = OutputFormat.MARKDOWN in to_formats
256
- export_txt = OutputFormat.TEXT in to_formats
257
- export_doctags = OutputFormat.DOCTAGS in to_formats
258
-
259
- if ocr_engine == OcrEngine.EASYOCR:
260
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
261
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
262
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
263
- elif ocr_engine == OcrEngine.TESSERACT:
264
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
265
- elif ocr_engine == OcrEngine.OCRMAC:
266
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
267
- elif ocr_engine == OcrEngine.RAPIDOCR:
268
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
269
- else:
270
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
271
-
272
- ocr_lang_list = _split_list(ocr_lang)
273
- if ocr_lang_list is not None:
274
- ocr_options.lang = ocr_lang_list
275
-
276
- pipeline_options = PdfPipelineOptions(
277
- do_ocr=ocr,
278
- ocr_options=ocr_options,
279
- do_table_structure=True,
280
- )
281
- pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
282
- pipeline_options.table_structure_options.mode = table_mode
283
-
284
- if artifacts_path is not None:
285
- pipeline_options.artifacts_path = artifacts_path
286
-
287
- if pdf_backend == PdfBackend.DLPARSE_V1:
288
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
289
- elif pdf_backend == PdfBackend.DLPARSE_V2:
290
- backend = DoclingParseV2DocumentBackend
291
- elif pdf_backend == PdfBackend.PYPDFIUM2:
292
- backend = PyPdfiumDocumentBackend
293
- else:
294
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
295
-
296
- format_options: Dict[InputFormat, FormatOption] = {
297
- InputFormat.PDF: PdfFormatOption(
298
- pipeline_options=pipeline_options,
299
- backend=backend, # pdf_backend
296
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
297
+
298
+ ocr_lang_list = _split_list(ocr_lang)
299
+ if ocr_lang_list is not None:
300
+ ocr_options.lang = ocr_lang_list
301
+
302
+ pipeline_options = PdfPipelineOptions(
303
+ do_ocr=ocr,
304
+ ocr_options=ocr_options,
305
+ do_table_structure=True,
300
306
  )
301
- }
302
- doc_converter = DocumentConverter(
303
- allowed_formats=from_formats,
304
- format_options=format_options,
305
- )
307
+ pipeline_options.table_structure_options.do_cell_matching = (
308
+ True # do_cell_matching
309
+ )
310
+ pipeline_options.table_structure_options.mode = table_mode
306
311
 
307
- start_time = time.time()
312
+ if artifacts_path is not None:
313
+ pipeline_options.artifacts_path = artifacts_path
308
314
 
309
- conv_results = doc_converter.convert_all(
310
- input_doc_paths, raises_on_error=abort_on_error
311
- )
315
+ if pdf_backend == PdfBackend.DLPARSE_V1:
316
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
317
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
318
+ backend = DoclingParseV2DocumentBackend
319
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
320
+ backend = PyPdfiumDocumentBackend
321
+ else:
322
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
312
323
 
313
- output.mkdir(parents=True, exist_ok=True)
314
- export_documents(
315
- conv_results,
316
- output_dir=output,
317
- export_json=export_json,
318
- export_md=export_md,
319
- export_txt=export_txt,
320
- export_doctags=export_doctags,
321
- )
324
+ format_options: Dict[InputFormat, FormatOption] = {
325
+ InputFormat.PDF: PdfFormatOption(
326
+ pipeline_options=pipeline_options,
327
+ backend=backend, # pdf_backend
328
+ )
329
+ }
330
+ doc_converter = DocumentConverter(
331
+ allowed_formats=from_formats,
332
+ format_options=format_options,
333
+ )
334
+
335
+ start_time = time.time()
336
+
337
+ conv_results = doc_converter.convert_all(
338
+ input_doc_paths, raises_on_error=abort_on_error
339
+ )
340
+
341
+ output.mkdir(parents=True, exist_ok=True)
342
+ export_documents(
343
+ conv_results,
344
+ output_dir=output,
345
+ export_json=export_json,
346
+ export_md=export_md,
347
+ export_txt=export_txt,
348
+ export_doctags=export_doctags,
349
+ )
322
350
 
323
- end_time = time.time() - start_time
351
+ end_time = time.time() - start_time
324
352
 
325
353
  _log.info(f"All documents were converted in {end_time:.2f} seconds.")
326
354
 
@@ -1,5 +1,4 @@
1
1
  from enum import Enum, auto
2
- from io import BytesIO
3
2
  from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
3
 
5
4
  from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
9
8
  Size,
10
9
  TableCell,
11
10
  )
11
+ from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
12
+ DocumentStream,
13
+ )
12
14
  from PIL.Image import Image
13
15
  from pydantic import BaseModel, ConfigDict
14
16
 
@@ -207,10 +209,3 @@ class Page(BaseModel):
207
209
  @property
208
210
  def image(self) -> Optional[Image]:
209
211
  return self.get_image(scale=self._default_image_scale)
210
-
211
-
212
- class DocumentStream(BaseModel):
213
- model_config = ConfigDict(arbitrary_types_allowed=True)
214
-
215
- name: str
216
- stream: BytesIO
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
32
32
  )
33
33
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
34
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
- from docling_core.utils.file import resolve_file_source
35
+ from docling_core.utils.file import resolve_source_to_stream
36
36
  from pydantic import BaseModel
37
37
  from typing_extensions import deprecated
38
38
 
@@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
459
459
  self, format_options: Dict[InputFormat, "FormatOption"]
460
460
  ) -> Iterable[InputDocument]:
461
461
  for item in self.path_or_stream_iterator:
462
- obj = resolve_file_source(item) if isinstance(item, str) else item
462
+ obj = resolve_source_to_stream(item) if isinstance(item, str) else item
463
463
  format = self._guess_format(obj)
464
464
  if format not in format_options.keys():
465
465
  _log.info(
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
6
6
 
7
7
 
8
8
  class TableFormerMode(str, Enum):
9
+ """Modes for the TableFormer model."""
10
+
9
11
  FAST = "fast"
10
12
  ACCURATE = "accurate"
11
13
 
12
14
 
13
15
  class TableStructureOptions(BaseModel):
16
+ """Options for the table structure."""
17
+
14
18
  do_cell_matching: bool = (
15
19
  True
16
20
  # True: Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
21
25
 
22
26
 
23
27
  class OcrOptions(BaseModel):
28
+ """OCR options."""
29
+
24
30
  kind: str
25
31
  lang: List[str]
26
32
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
30
36
 
31
37
 
32
38
  class RapidOcrOptions(OcrOptions):
39
+ """Options for the RapidOCR engine."""
40
+
33
41
  kind: Literal["rapidocr"] = "rapidocr"
34
42
 
35
43
  # English and Chinese are the most commonly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
66
74
 
67
75
 
68
76
  class EasyOcrOptions(OcrOptions):
77
+ """Options for the EasyOCR engine."""
78
+
69
79
  kind: Literal["easyocr"] = "easyocr"
70
80
  lang: List[str] = ["fr", "de", "es", "en"]
71
81
  use_gpu: bool = True # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
79
89
 
80
90
 
81
91
  class TesseractCliOcrOptions(OcrOptions):
92
+ """Options for the TesseractCli engine."""
93
+
82
94
  kind: Literal["tesseract"] = "tesseract"
83
95
  lang: List[str] = ["fra", "deu", "spa", "eng"]
84
96
  tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
90
102
 
91
103
 
92
104
  class TesseractOcrOptions(OcrOptions):
105
+ """Options for the Tesseract engine."""
106
+
93
107
  kind: Literal["tesserocr"] = "tesserocr"
94
108
  lang: List[str] = ["fra", "deu", "spa", "eng"]
95
109
  path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
100
114
 
101
115
 
102
116
  class OcrMacOptions(OcrOptions):
117
+ """Options for the Mac OCR engine."""
118
+
103
119
  kind: Literal["ocrmac"] = "ocrmac"
104
120
  lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
105
121
  recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
111
127
 
112
128
 
113
129
  class PipelineOptions(BaseModel):
130
+ """Base pipeline options."""
131
+
114
132
  create_legacy_output: bool = (
115
133
  True # This default will be set to False in a future version of docling
116
134
  )
117
135
 
118
136
 
119
137
  class PdfPipelineOptions(PipelineOptions):
138
+ """Options for the PDF pipeline."""
139
+
120
140
  artifacts_path: Optional[Union[Path, str]] = None
121
141
  do_table_structure: bool = True # True: perform table structure extraction
122
142
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
@@ -1,5 +1,7 @@
1
+ import csv
1
2
  import io
2
3
  import logging
4
+ import os
3
5
  import tempfile
4
6
  from subprocess import DEVNULL, PIPE, Popen
5
7
  from typing import Iterable, Optional, Tuple
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
95
97
  # _log.info(decoded_data)
96
98
 
97
99
  # Read the TSV file generated by Tesseract
98
- df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
100
+ df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
99
101
 
100
102
  # Display the dataframe (optional)
101
103
  # _log.info("df: ", df.head())
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
130
132
  high_res_image = page._backend.get_page_image(
131
133
  scale=self.scale, cropbox=ocr_rect
132
134
  )
133
-
134
- with tempfile.NamedTemporaryFile(
135
- suffix=".png", mode="w"
136
- ) as image_file:
137
- fname = image_file.name
138
- high_res_image.save(fname)
135
+ try:
136
+ with tempfile.NamedTemporaryFile(
137
+ suffix=".png", mode="w+b", delete=False
138
+ ) as image_file:
139
+ fname = image_file.name
140
+ high_res_image.save(image_file)
139
141
 
140
142
  df = self._run_tesseract(fname)
143
+ finally:
144
+ if os.path.exists(fname):
145
+ os.remove(fname)
141
146
 
142
147
  # _log.info(df)
143
148
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.8.0
3
+ Version: 2.8.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core (>=2.5.1,<3.0.0)
29
+ Requires-Dist: docling-core (>=2.6.1,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
31
  Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -39,7 +39,6 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
- Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
43
42
  Requires-Dist: pydantic (>=2.0.0,<2.10)
44
43
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
45
44
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -60,7 +59,7 @@ Description-Content-Type: text/markdown
60
59
  </a>
61
60
  </p>
62
61
 
63
- # Docling
62
+ # 🦆 Docling
64
63
 
65
64
  <p align="center">
66
65
  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -85,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
85
84
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
86
85
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
87
86
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
88
- * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
87
+ * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
89
88
  * 🔍 OCR support for scanned PDFs
90
89
  * 💻 Simple and convenient CLI
91
90
 
@@ -121,8 +120,24 @@ result = converter.convert(source)
121
120
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
122
121
  ```
123
122
 
124
- Check out [Getting started](https://ds4sd.github.io/docling/).
125
- You will find lots of tuning options to leverage all the advanced capabilities.
123
+ More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
124
+ the docs.
125
+
126
+ ## Documentation
127
+
128
+ Check out Docling's [documentation](https://ds4sd.github.io/docling/) for details on
129
+ installation, usage, concepts, recipes, extensions, and more.
130
+
131
+ ## Examples
132
+
133
+ Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
134
+ demonstrating how to address different application use cases with Docling.
135
+
136
+ ## Integrations
137
+
138
+ To further accelerate your AI application development, check out Docling's native
139
+ [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
140
+ and tools.
126
141
 
127
142
  ## Get help and support
128
143
 
@@ -12,11 +12,11 @@ docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJ
12
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
14
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- docling/cli/main.py,sha256=KxukTq155IFVkfc_aUpSL6laGG1KjnXE4oAau7B5xBA,10881
15
+ docling/cli/main.py,sha256=R9ao2zCv1GZQIATOqg9b64O7AOUCWLwjJ-2FIpW8m0I,12236
16
16
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
18
- docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
19
- docling/datamodel/pipeline_options.py,sha256=J-6kWugUrxahymKzgaEgiqPuyle1fbInPXV2wNos6Vc,4550
17
+ docling/datamodel/base_models.py,sha256=nlNrSzQGB4A8iK8aGnHXvYahfp1hSvQcrA6vuQFHwRE,5497
18
+ docling/datamodel/document.py,sha256=zTKHvNeLuKACqYedohBmOQad-2hvUMjtvb-vePQG3Dk,20708
19
+ docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
20
20
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
21
21
  docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
22
22
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,7 +30,7 @@ docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th
30
30
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
31
31
  docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
32
32
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
33
- docling/models/tesseract_ocr_cli_model.py,sha256=OfopQnt2FGwtLJTMtW9jbJZ9EN2G2QFkA_aACjuUuDs,6372
33
+ docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
34
34
  docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
35
35
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
@@ -41,8 +41,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
41
41
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
42
42
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
43
43
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
44
- docling-2.8.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
45
- docling-2.8.0.dist-info/METADATA,sha256=4XSleijcmMxpwEFyjiNIh71ScIZUTApiKIfKDdM660A,7236
46
- docling-2.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
- docling-2.8.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
48
- docling-2.8.0.dist-info/RECORD,,
44
+ docling-2.8.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
45
+ docling-2.8.2.dist-info/METADATA,sha256=l28sUpfFDDGip1TWoxlj4gjQ1hXDDjHiWKjMP6pJg3Q,7682
46
+ docling-2.8.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
+ docling-2.8.2.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
48
+ docling-2.8.2.dist-info/RECORD,,