docling 2.8.1__tar.gz → 2.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.8.1 → docling-2.8.2}/PKG-INFO +2 -2
- {docling-2.8.1 → docling-2.8.2}/docling/cli/main.py +88 -84
- {docling-2.8.1 → docling-2.8.2}/docling/datamodel/base_models.py +3 -8
- {docling-2.8.1 → docling-2.8.2}/docling/datamodel/document.py +2 -2
- {docling-2.8.1 → docling-2.8.2}/docling/datamodel/pipeline_options.py +20 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/tesseract_ocr_cli_model.py +12 -7
- {docling-2.8.1 → docling-2.8.2}/pyproject.toml +9 -4
- {docling-2.8.1 → docling-2.8.2}/LICENSE +0 -0
- {docling-2.8.1 → docling-2.8.2}/README.md +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/abstract_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/html_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/md_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/msword_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/pdf_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/cli/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/datamodel/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/datamodel/settings.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/document_converter.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/base_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/base_ocr_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/ds_glm_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/easyocr_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/layout_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/page_assemble_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/table_structure_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/pipeline/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/utils/__init__.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/utils/export.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/utils/layout_utils.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/utils/profiling.py +0 -0
- {docling-2.8.1 → docling-2.8.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.8.1
|
3
|
+
Version: 2.8.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
29
|
-
Requires-Dist: docling-core (>=2.
|
29
|
+
Requires-Dist: docling-core (>=2.6.1,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -2,6 +2,7 @@ import importlib
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import re
|
5
|
+
import tempfile
|
5
6
|
import time
|
6
7
|
import warnings
|
7
8
|
from enum import Enum
|
@@ -9,7 +10,7 @@ from pathlib import Path
|
|
9
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
10
11
|
|
11
12
|
import typer
|
12
|
-
from docling_core.utils.file import
|
13
|
+
from docling_core.utils.file import resolve_source_to_path
|
13
14
|
|
14
15
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
15
16
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -256,95 +257,98 @@ def convert(
|
|
256
257
|
if from_formats is None:
|
257
258
|
from_formats = [e for e in InputFormat]
|
258
259
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
for
|
270
|
-
|
271
|
-
|
260
|
+
with tempfile.TemporaryDirectory() as tempdir:
|
261
|
+
input_doc_paths: List[Path] = []
|
262
|
+
for src in input_sources:
|
263
|
+
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
264
|
+
if not source.exists():
|
265
|
+
err_console.print(
|
266
|
+
f"[red]Error: The input file {source} does not exist.[/red]"
|
267
|
+
)
|
268
|
+
raise typer.Abort()
|
269
|
+
elif source.is_dir():
|
270
|
+
for fmt in from_formats:
|
271
|
+
for ext in FormatToExtensions[fmt]:
|
272
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
273
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
274
|
+
else:
|
275
|
+
input_doc_paths.append(source)
|
276
|
+
|
277
|
+
if to_formats is None:
|
278
|
+
to_formats = [OutputFormat.MARKDOWN]
|
279
|
+
|
280
|
+
export_json = OutputFormat.JSON in to_formats
|
281
|
+
export_md = OutputFormat.MARKDOWN in to_formats
|
282
|
+
export_txt = OutputFormat.TEXT in to_formats
|
283
|
+
export_doctags = OutputFormat.DOCTAGS in to_formats
|
284
|
+
|
285
|
+
if ocr_engine == OcrEngine.EASYOCR:
|
286
|
+
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
287
|
+
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
288
|
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
289
|
+
elif ocr_engine == OcrEngine.TESSERACT:
|
290
|
+
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
291
|
+
elif ocr_engine == OcrEngine.OCRMAC:
|
292
|
+
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
293
|
+
elif ocr_engine == OcrEngine.RAPIDOCR:
|
294
|
+
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
272
295
|
else:
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
if ocr_engine == OcrEngine.EASYOCR:
|
284
|
-
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
285
|
-
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
286
|
-
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
287
|
-
elif ocr_engine == OcrEngine.TESSERACT:
|
288
|
-
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
289
|
-
elif ocr_engine == OcrEngine.OCRMAC:
|
290
|
-
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
291
|
-
elif ocr_engine == OcrEngine.RAPIDOCR:
|
292
|
-
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
293
|
-
else:
|
294
|
-
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
295
|
-
|
296
|
-
ocr_lang_list = _split_list(ocr_lang)
|
297
|
-
if ocr_lang_list is not None:
|
298
|
-
ocr_options.lang = ocr_lang_list
|
299
|
-
|
300
|
-
pipeline_options = PdfPipelineOptions(
|
301
|
-
do_ocr=ocr,
|
302
|
-
ocr_options=ocr_options,
|
303
|
-
do_table_structure=True,
|
304
|
-
)
|
305
|
-
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
306
|
-
pipeline_options.table_structure_options.mode = table_mode
|
307
|
-
|
308
|
-
if artifacts_path is not None:
|
309
|
-
pipeline_options.artifacts_path = artifacts_path
|
310
|
-
|
311
|
-
if pdf_backend == PdfBackend.DLPARSE_V1:
|
312
|
-
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
313
|
-
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
314
|
-
backend = DoclingParseV2DocumentBackend
|
315
|
-
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
316
|
-
backend = PyPdfiumDocumentBackend
|
317
|
-
else:
|
318
|
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
319
|
-
|
320
|
-
format_options: Dict[InputFormat, FormatOption] = {
|
321
|
-
InputFormat.PDF: PdfFormatOption(
|
322
|
-
pipeline_options=pipeline_options,
|
323
|
-
backend=backend, # pdf_backend
|
296
|
+
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
297
|
+
|
298
|
+
ocr_lang_list = _split_list(ocr_lang)
|
299
|
+
if ocr_lang_list is not None:
|
300
|
+
ocr_options.lang = ocr_lang_list
|
301
|
+
|
302
|
+
pipeline_options = PdfPipelineOptions(
|
303
|
+
do_ocr=ocr,
|
304
|
+
ocr_options=ocr_options,
|
305
|
+
do_table_structure=True,
|
324
306
|
)
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
)
|
307
|
+
pipeline_options.table_structure_options.do_cell_matching = (
|
308
|
+
True # do_cell_matching
|
309
|
+
)
|
310
|
+
pipeline_options.table_structure_options.mode = table_mode
|
330
311
|
|
331
|
-
|
312
|
+
if artifacts_path is not None:
|
313
|
+
pipeline_options.artifacts_path = artifacts_path
|
332
314
|
|
333
|
-
|
334
|
-
|
335
|
-
|
315
|
+
if pdf_backend == PdfBackend.DLPARSE_V1:
|
316
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
317
|
+
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
318
|
+
backend = DoclingParseV2DocumentBackend
|
319
|
+
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
320
|
+
backend = PyPdfiumDocumentBackend
|
321
|
+
else:
|
322
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
336
323
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
324
|
+
format_options: Dict[InputFormat, FormatOption] = {
|
325
|
+
InputFormat.PDF: PdfFormatOption(
|
326
|
+
pipeline_options=pipeline_options,
|
327
|
+
backend=backend, # pdf_backend
|
328
|
+
)
|
329
|
+
}
|
330
|
+
doc_converter = DocumentConverter(
|
331
|
+
allowed_formats=from_formats,
|
332
|
+
format_options=format_options,
|
333
|
+
)
|
334
|
+
|
335
|
+
start_time = time.time()
|
336
|
+
|
337
|
+
conv_results = doc_converter.convert_all(
|
338
|
+
input_doc_paths, raises_on_error=abort_on_error
|
339
|
+
)
|
340
|
+
|
341
|
+
output.mkdir(parents=True, exist_ok=True)
|
342
|
+
export_documents(
|
343
|
+
conv_results,
|
344
|
+
output_dir=output,
|
345
|
+
export_json=export_json,
|
346
|
+
export_md=export_md,
|
347
|
+
export_txt=export_txt,
|
348
|
+
export_doctags=export_doctags,
|
349
|
+
)
|
346
350
|
|
347
|
-
|
351
|
+
end_time = time.time() - start_time
|
348
352
|
|
349
353
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
350
354
|
|
@@ -1,5 +1,4 @@
|
|
1
1
|
from enum import Enum, auto
|
2
|
-
from io import BytesIO
|
3
2
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
3
|
|
5
4
|
from docling_core.types.doc import (
|
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
|
9
8
|
Size,
|
10
9
|
TableCell,
|
11
10
|
)
|
11
|
+
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
12
|
+
DocumentStream,
|
13
|
+
)
|
12
14
|
from PIL.Image import Image
|
13
15
|
from pydantic import BaseModel, ConfigDict
|
14
16
|
|
@@ -207,10 +209,3 @@ class Page(BaseModel):
|
|
207
209
|
@property
|
208
210
|
def image(self) -> Optional[Image]:
|
209
211
|
return self.get_image(scale=self._default_image_scale)
|
210
|
-
|
211
|
-
|
212
|
-
class DocumentStream(BaseModel):
|
213
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
214
|
-
|
215
|
-
name: str
|
216
|
-
stream: BytesIO
|
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
|
32
32
|
)
|
33
33
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
34
34
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
35
|
-
from docling_core.utils.file import
|
35
|
+
from docling_core.utils.file import resolve_source_to_stream
|
36
36
|
from pydantic import BaseModel
|
37
37
|
from typing_extensions import deprecated
|
38
38
|
|
@@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
|
|
459
459
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
460
460
|
) -> Iterable[InputDocument]:
|
461
461
|
for item in self.path_or_stream_iterator:
|
462
|
-
obj =
|
462
|
+
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
463
463
|
format = self._guess_format(obj)
|
464
464
|
if format not in format_options.keys():
|
465
465
|
_log.info(
|
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
6
6
|
|
7
7
|
|
8
8
|
class TableFormerMode(str, Enum):
|
9
|
+
"""Modes for the TableFormer model."""
|
10
|
+
|
9
11
|
FAST = "fast"
|
10
12
|
ACCURATE = "accurate"
|
11
13
|
|
12
14
|
|
13
15
|
class TableStructureOptions(BaseModel):
|
16
|
+
"""Options for the table structure."""
|
17
|
+
|
14
18
|
do_cell_matching: bool = (
|
15
19
|
True
|
16
20
|
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
|
|
21
25
|
|
22
26
|
|
23
27
|
class OcrOptions(BaseModel):
|
28
|
+
"""OCR options."""
|
29
|
+
|
24
30
|
kind: str
|
25
31
|
lang: List[str]
|
26
32
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
|
|
30
36
|
|
31
37
|
|
32
38
|
class RapidOcrOptions(OcrOptions):
|
39
|
+
"""Options for the RapidOCR engine."""
|
40
|
+
|
33
41
|
kind: Literal["rapidocr"] = "rapidocr"
|
34
42
|
|
35
43
|
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
|
|
66
74
|
|
67
75
|
|
68
76
|
class EasyOcrOptions(OcrOptions):
|
77
|
+
"""Options for the EasyOCR engine."""
|
78
|
+
|
69
79
|
kind: Literal["easyocr"] = "easyocr"
|
70
80
|
lang: List[str] = ["fr", "de", "es", "en"]
|
71
81
|
use_gpu: bool = True # same default as easyocr.Reader
|
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
|
|
79
89
|
|
80
90
|
|
81
91
|
class TesseractCliOcrOptions(OcrOptions):
|
92
|
+
"""Options for the TesseractCli engine."""
|
93
|
+
|
82
94
|
kind: Literal["tesseract"] = "tesseract"
|
83
95
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
84
96
|
tesseract_cmd: str = "tesseract"
|
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
90
102
|
|
91
103
|
|
92
104
|
class TesseractOcrOptions(OcrOptions):
|
105
|
+
"""Options for the Tesseract engine."""
|
106
|
+
|
93
107
|
kind: Literal["tesserocr"] = "tesserocr"
|
94
108
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
95
109
|
path: Optional[str] = None
|
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
|
|
100
114
|
|
101
115
|
|
102
116
|
class OcrMacOptions(OcrOptions):
|
117
|
+
"""Options for the Mac OCR engine."""
|
118
|
+
|
103
119
|
kind: Literal["ocrmac"] = "ocrmac"
|
104
120
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
105
121
|
recognition: str = "accurate"
|
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
|
|
111
127
|
|
112
128
|
|
113
129
|
class PipelineOptions(BaseModel):
|
130
|
+
"""Base pipeline options."""
|
131
|
+
|
114
132
|
create_legacy_output: bool = (
|
115
133
|
True # This default will be set to False in a future version of docling
|
116
134
|
)
|
117
135
|
|
118
136
|
|
119
137
|
class PdfPipelineOptions(PipelineOptions):
|
138
|
+
"""Options for the PDF pipeline."""
|
139
|
+
|
120
140
|
artifacts_path: Optional[Union[Path, str]] = None
|
121
141
|
do_table_structure: bool = True # True: perform table structure extraction
|
122
142
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
@@ -1,5 +1,7 @@
|
|
1
|
+
import csv
|
1
2
|
import io
|
2
3
|
import logging
|
4
|
+
import os
|
3
5
|
import tempfile
|
4
6
|
from subprocess import DEVNULL, PIPE, Popen
|
5
7
|
from typing import Iterable, Optional, Tuple
|
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
95
97
|
# _log.info(decoded_data)
|
96
98
|
|
97
99
|
# Read the TSV file generated by Tesseract
|
98
|
-
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
100
|
+
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
99
101
|
|
100
102
|
# Display the dataframe (optional)
|
101
103
|
# _log.info("df: ", df.head())
|
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
130
132
|
high_res_image = page._backend.get_page_image(
|
131
133
|
scale=self.scale, cropbox=ocr_rect
|
132
134
|
)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
135
|
+
try:
|
136
|
+
with tempfile.NamedTemporaryFile(
|
137
|
+
suffix=".png", mode="w+b", delete=False
|
138
|
+
) as image_file:
|
139
|
+
fname = image_file.name
|
140
|
+
high_res_image.save(image_file)
|
139
141
|
|
140
142
|
df = self._run_tesseract(fname)
|
143
|
+
finally:
|
144
|
+
if os.path.exists(fname):
|
145
|
+
os.remove(fname)
|
141
146
|
|
142
147
|
# _log.info(df)
|
143
148
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.8.1"
|
3
|
+
version = "2.8.2" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
28
|
pydantic = ">=2.0.0,<2.10"
|
29
|
-
docling-core = "^2.
|
29
|
+
docling-core = "^2.6.1"
|
30
30
|
docling-ibm-models = "^2.0.6"
|
31
31
|
deepsearch-glm = "^0.26.1"
|
32
32
|
filetype = "^1.2.0"
|
@@ -80,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
|
|
80
80
|
mkdocs-material = "^9.5.40"
|
81
81
|
mkdocs-jupyter = "^0.25.0"
|
82
82
|
mkdocs-click = "^0.8.1"
|
83
|
+
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
|
84
|
+
griffe-pydantic = "^1.1.0"
|
83
85
|
|
84
86
|
[tool.poetry.group.examples.dependencies]
|
85
87
|
datasets = "^2.21.0"
|
@@ -88,10 +90,13 @@ langchain-huggingface = "^0.0.3"
|
|
88
90
|
langchain-milvus = "^0.1.4"
|
89
91
|
langchain-text-splitters = "^0.2.4"
|
90
92
|
|
93
|
+
[tool.poetry.group.constraints]
|
94
|
+
optional = true
|
95
|
+
|
91
96
|
[tool.poetry.group.constraints.dependencies]
|
92
97
|
numpy = [
|
93
|
-
{ version = "
|
94
|
-
{ version = "
|
98
|
+
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
|
99
|
+
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
|
95
100
|
]
|
96
101
|
|
97
102
|
[tool.poetry.group.mac_intel]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|