docling 2.8.1__py3-none-any.whl → 2.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -2,6 +2,7 @@ import importlib
2
2
  import json
3
3
  import logging
4
4
  import re
5
+ import tempfile
5
6
  import time
6
7
  import warnings
7
8
  from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
9
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
10
11
 
11
12
  import typer
12
- from docling_core.utils.file import resolve_file_source
13
+ from docling_core.utils.file import resolve_source_to_path
13
14
 
14
15
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
15
16
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
256
257
  if from_formats is None:
257
258
  from_formats = [e for e in InputFormat]
258
259
 
259
- input_doc_paths: List[Path] = []
260
- for src in input_sources:
261
- source = resolve_file_source(source=src)
262
- if not source.exists():
263
- err_console.print(
264
- f"[red]Error: The input file {source} does not exist.[/red]"
265
- )
266
- raise typer.Abort()
267
- elif source.is_dir():
268
- for fmt in from_formats:
269
- for ext in FormatToExtensions[fmt]:
270
- input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
271
- input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
260
+ with tempfile.TemporaryDirectory() as tempdir:
261
+ input_doc_paths: List[Path] = []
262
+ for src in input_sources:
263
+ source = resolve_source_to_path(source=src, workdir=Path(tempdir))
264
+ if not source.exists():
265
+ err_console.print(
266
+ f"[red]Error: The input file {source} does not exist.[/red]"
267
+ )
268
+ raise typer.Abort()
269
+ elif source.is_dir():
270
+ for fmt in from_formats:
271
+ for ext in FormatToExtensions[fmt]:
272
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
273
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
274
+ else:
275
+ input_doc_paths.append(source)
276
+
277
+ if to_formats is None:
278
+ to_formats = [OutputFormat.MARKDOWN]
279
+
280
+ export_json = OutputFormat.JSON in to_formats
281
+ export_md = OutputFormat.MARKDOWN in to_formats
282
+ export_txt = OutputFormat.TEXT in to_formats
283
+ export_doctags = OutputFormat.DOCTAGS in to_formats
284
+
285
+ if ocr_engine == OcrEngine.EASYOCR:
286
+ ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
287
+ elif ocr_engine == OcrEngine.TESSERACT_CLI:
288
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
289
+ elif ocr_engine == OcrEngine.TESSERACT:
290
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
291
+ elif ocr_engine == OcrEngine.OCRMAC:
292
+ ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
293
+ elif ocr_engine == OcrEngine.RAPIDOCR:
294
+ ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
272
295
  else:
273
- input_doc_paths.append(source)
274
-
275
- if to_formats is None:
276
- to_formats = [OutputFormat.MARKDOWN]
277
-
278
- export_json = OutputFormat.JSON in to_formats
279
- export_md = OutputFormat.MARKDOWN in to_formats
280
- export_txt = OutputFormat.TEXT in to_formats
281
- export_doctags = OutputFormat.DOCTAGS in to_formats
282
-
283
- if ocr_engine == OcrEngine.EASYOCR:
284
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
285
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
286
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
287
- elif ocr_engine == OcrEngine.TESSERACT:
288
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
289
- elif ocr_engine == OcrEngine.OCRMAC:
290
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
291
- elif ocr_engine == OcrEngine.RAPIDOCR:
292
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
293
- else:
294
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
295
-
296
- ocr_lang_list = _split_list(ocr_lang)
297
- if ocr_lang_list is not None:
298
- ocr_options.lang = ocr_lang_list
299
-
300
- pipeline_options = PdfPipelineOptions(
301
- do_ocr=ocr,
302
- ocr_options=ocr_options,
303
- do_table_structure=True,
304
- )
305
- pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
306
- pipeline_options.table_structure_options.mode = table_mode
307
-
308
- if artifacts_path is not None:
309
- pipeline_options.artifacts_path = artifacts_path
310
-
311
- if pdf_backend == PdfBackend.DLPARSE_V1:
312
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
313
- elif pdf_backend == PdfBackend.DLPARSE_V2:
314
- backend = DoclingParseV2DocumentBackend
315
- elif pdf_backend == PdfBackend.PYPDFIUM2:
316
- backend = PyPdfiumDocumentBackend
317
- else:
318
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
319
-
320
- format_options: Dict[InputFormat, FormatOption] = {
321
- InputFormat.PDF: PdfFormatOption(
322
- pipeline_options=pipeline_options,
323
- backend=backend, # pdf_backend
296
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
297
+
298
+ ocr_lang_list = _split_list(ocr_lang)
299
+ if ocr_lang_list is not None:
300
+ ocr_options.lang = ocr_lang_list
301
+
302
+ pipeline_options = PdfPipelineOptions(
303
+ do_ocr=ocr,
304
+ ocr_options=ocr_options,
305
+ do_table_structure=True,
324
306
  )
325
- }
326
- doc_converter = DocumentConverter(
327
- allowed_formats=from_formats,
328
- format_options=format_options,
329
- )
307
+ pipeline_options.table_structure_options.do_cell_matching = (
308
+ True # do_cell_matching
309
+ )
310
+ pipeline_options.table_structure_options.mode = table_mode
330
311
 
331
- start_time = time.time()
312
+ if artifacts_path is not None:
313
+ pipeline_options.artifacts_path = artifacts_path
332
314
 
333
- conv_results = doc_converter.convert_all(
334
- input_doc_paths, raises_on_error=abort_on_error
335
- )
315
+ if pdf_backend == PdfBackend.DLPARSE_V1:
316
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
317
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
318
+ backend = DoclingParseV2DocumentBackend
319
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
320
+ backend = PyPdfiumDocumentBackend
321
+ else:
322
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
336
323
 
337
- output.mkdir(parents=True, exist_ok=True)
338
- export_documents(
339
- conv_results,
340
- output_dir=output,
341
- export_json=export_json,
342
- export_md=export_md,
343
- export_txt=export_txt,
344
- export_doctags=export_doctags,
345
- )
324
+ format_options: Dict[InputFormat, FormatOption] = {
325
+ InputFormat.PDF: PdfFormatOption(
326
+ pipeline_options=pipeline_options,
327
+ backend=backend, # pdf_backend
328
+ )
329
+ }
330
+ doc_converter = DocumentConverter(
331
+ allowed_formats=from_formats,
332
+ format_options=format_options,
333
+ )
334
+
335
+ start_time = time.time()
336
+
337
+ conv_results = doc_converter.convert_all(
338
+ input_doc_paths, raises_on_error=abort_on_error
339
+ )
340
+
341
+ output.mkdir(parents=True, exist_ok=True)
342
+ export_documents(
343
+ conv_results,
344
+ output_dir=output,
345
+ export_json=export_json,
346
+ export_md=export_md,
347
+ export_txt=export_txt,
348
+ export_doctags=export_doctags,
349
+ )
346
350
 
347
- end_time = time.time() - start_time
351
+ end_time = time.time() - start_time
348
352
 
349
353
  _log.info(f"All documents were converted in {end_time:.2f} seconds.")
350
354
 
@@ -1,5 +1,4 @@
1
1
  from enum import Enum, auto
2
- from io import BytesIO
3
2
  from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
3
 
5
4
  from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
9
8
  Size,
10
9
  TableCell,
11
10
  )
11
+ from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
12
+ DocumentStream,
13
+ )
12
14
  from PIL.Image import Image
13
15
  from pydantic import BaseModel, ConfigDict
14
16
 
@@ -207,10 +209,3 @@ class Page(BaseModel):
207
209
  @property
208
210
  def image(self) -> Optional[Image]:
209
211
  return self.get_image(scale=self._default_image_scale)
210
-
211
-
212
- class DocumentStream(BaseModel):
213
- model_config = ConfigDict(arbitrary_types_allowed=True)
214
-
215
- name: str
216
- stream: BytesIO
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
32
32
  )
33
33
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
34
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
- from docling_core.utils.file import resolve_file_source
35
+ from docling_core.utils.file import resolve_source_to_stream
36
36
  from pydantic import BaseModel
37
37
  from typing_extensions import deprecated
38
38
 
@@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
459
459
  self, format_options: Dict[InputFormat, "FormatOption"]
460
460
  ) -> Iterable[InputDocument]:
461
461
  for item in self.path_or_stream_iterator:
462
- obj = resolve_file_source(item) if isinstance(item, str) else item
462
+ obj = resolve_source_to_stream(item) if isinstance(item, str) else item
463
463
  format = self._guess_format(obj)
464
464
  if format not in format_options.keys():
465
465
  _log.info(
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
6
6
 
7
7
 
8
8
  class TableFormerMode(str, Enum):
9
+ """Modes for the TableFormer model."""
10
+
9
11
  FAST = "fast"
10
12
  ACCURATE = "accurate"
11
13
 
12
14
 
13
15
  class TableStructureOptions(BaseModel):
16
+ """Options for the table structure."""
17
+
14
18
  do_cell_matching: bool = (
15
19
  True
16
20
  # True: Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
21
25
 
22
26
 
23
27
  class OcrOptions(BaseModel):
28
+ """OCR options."""
29
+
24
30
  kind: str
25
31
  lang: List[str]
26
32
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
30
36
 
31
37
 
32
38
  class RapidOcrOptions(OcrOptions):
39
+ """Options for the RapidOCR engine."""
40
+
33
41
  kind: Literal["rapidocr"] = "rapidocr"
34
42
 
35
43
  # English and Chinese are the most commonly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
66
74
 
67
75
 
68
76
  class EasyOcrOptions(OcrOptions):
77
+ """Options for the EasyOCR engine."""
78
+
69
79
  kind: Literal["easyocr"] = "easyocr"
70
80
  lang: List[str] = ["fr", "de", "es", "en"]
71
81
  use_gpu: bool = True # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
79
89
 
80
90
 
81
91
  class TesseractCliOcrOptions(OcrOptions):
92
+ """Options for the TesseractCli engine."""
93
+
82
94
  kind: Literal["tesseract"] = "tesseract"
83
95
  lang: List[str] = ["fra", "deu", "spa", "eng"]
84
96
  tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
90
102
 
91
103
 
92
104
  class TesseractOcrOptions(OcrOptions):
105
+ """Options for the Tesseract engine."""
106
+
93
107
  kind: Literal["tesserocr"] = "tesserocr"
94
108
  lang: List[str] = ["fra", "deu", "spa", "eng"]
95
109
  path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
100
114
 
101
115
 
102
116
  class OcrMacOptions(OcrOptions):
117
+ """Options for the Mac OCR engine."""
118
+
103
119
  kind: Literal["ocrmac"] = "ocrmac"
104
120
  lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
105
121
  recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
111
127
 
112
128
 
113
129
  class PipelineOptions(BaseModel):
130
+ """Base pipeline options."""
131
+
114
132
  create_legacy_output: bool = (
115
133
  True # This default will be set to False in a future version of docling
116
134
  )
117
135
 
118
136
 
119
137
  class PdfPipelineOptions(PipelineOptions):
138
+ """Options for the PDF pipeline."""
139
+
120
140
  artifacts_path: Optional[Union[Path, str]] = None
121
141
  do_table_structure: bool = True # True: perform table structure extraction
122
142
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
@@ -1,5 +1,7 @@
1
+ import csv
1
2
  import io
2
3
  import logging
4
+ import os
3
5
  import tempfile
4
6
  from subprocess import DEVNULL, PIPE, Popen
5
7
  from typing import Iterable, Optional, Tuple
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
95
97
  # _log.info(decoded_data)
96
98
 
97
99
  # Read the TSV file generated by Tesseract
98
- df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
100
+ df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
99
101
 
100
102
  # Display the dataframe (optional)
101
103
  # _log.info("df: ", df.head())
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
130
132
  high_res_image = page._backend.get_page_image(
131
133
  scale=self.scale, cropbox=ocr_rect
132
134
  )
133
-
134
- with tempfile.NamedTemporaryFile(
135
- suffix=".png", mode="w"
136
- ) as image_file:
137
- fname = image_file.name
138
- high_res_image.save(fname)
135
+ try:
136
+ with tempfile.NamedTemporaryFile(
137
+ suffix=".png", mode="w+b", delete=False
138
+ ) as image_file:
139
+ fname = image_file.name
140
+ high_res_image.save(image_file)
139
141
 
140
142
  df = self._run_tesseract(fname)
143
+ finally:
144
+ if os.path.exists(fname):
145
+ os.remove(fname)
141
146
 
142
147
  # _log.info(df)
143
148
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.8.1
3
+ Version: 2.8.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core (>=2.5.1,<3.0.0)
29
+ Requires-Dist: docling-core (>=2.6.1,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
31
  Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -12,11 +12,11 @@ docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJ
12
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
14
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- docling/cli/main.py,sha256=AgPD32NfM0_bmHeKjx5-fqk57ahX5tN3AeoDOerhTuE,11808
15
+ docling/cli/main.py,sha256=R9ao2zCv1GZQIATOqg9b64O7AOUCWLwjJ-2FIpW8m0I,12236
16
16
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
18
- docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
19
- docling/datamodel/pipeline_options.py,sha256=J-6kWugUrxahymKzgaEgiqPuyle1fbInPXV2wNos6Vc,4550
17
+ docling/datamodel/base_models.py,sha256=nlNrSzQGB4A8iK8aGnHXvYahfp1hSvQcrA6vuQFHwRE,5497
18
+ docling/datamodel/document.py,sha256=zTKHvNeLuKACqYedohBmOQad-2hvUMjtvb-vePQG3Dk,20708
19
+ docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
20
20
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
21
21
  docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
22
22
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,7 +30,7 @@ docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th
30
30
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
31
31
  docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
32
32
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
33
- docling/models/tesseract_ocr_cli_model.py,sha256=OfopQnt2FGwtLJTMtW9jbJZ9EN2G2QFkA_aACjuUuDs,6372
33
+ docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
34
34
  docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
35
35
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
@@ -41,8 +41,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
41
41
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
42
42
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
43
43
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
44
- docling-2.8.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
45
- docling-2.8.1.dist-info/METADATA,sha256=auj5PtDj-UBB72sW8jk1CSVSwQpd9q0nYzoAYIItl8o,7682
46
- docling-2.8.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
- docling-2.8.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
48
- docling-2.8.1.dist-info/RECORD,,
44
+ docling-2.8.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
45
+ docling-2.8.2.dist-info/METADATA,sha256=l28sUpfFDDGip1TWoxlj4gjQ1hXDDjHiWKjMP6pJg3Q,7682
46
+ docling-2.8.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
47
+ docling-2.8.2.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
48
+ docling-2.8.2.dist-info/RECORD,,