docling-2.21.0-py3-none-any.whl → docling-2.23.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/csv_backend.py +125 -0
- docling/backend/xml/jats_backend.py +772 -0
- docling/cli/main.py +7 -0
- docling/datamodel/base_models.py +6 -3
- docling/datamodel/document.py +37 -4
- docling/datamodel/pipeline_options.py +30 -4
- docling/datamodel/settings.py +2 -1
- docling/document_converter.py +14 -5
- docling/exceptions.py +4 -0
- docling/models/picture_description_api_model.py +11 -4
- docling/models/tesseract_ocr_model.py +1 -2
- docling/pipeline/standard_pdf_pipeline.py +9 -0
- docling/utils/accelerator_utils.py +41 -15
- {docling-2.21.0.dist-info → docling-2.23.0.dist-info}/METADATA +3 -3
- {docling-2.21.0.dist-info → docling-2.23.0.dist-info}/RECORD +18 -17
- docling/backend/xml/pubmed_backend.py +0 -592
- {docling-2.21.0.dist-info → docling-2.23.0.dist-info}/LICENSE +0 -0
- {docling-2.21.0.dist-info → docling-2.23.0.dist-info}/WHEEL +0 -0
- {docling-2.21.0.dist-info → docling-2.23.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -234,6 +234,12 @@ def convert(
|
|
234
234
|
Optional[Path],
|
235
235
|
typer.Option(..., help="If provided, the location of the model artifacts."),
|
236
236
|
] = None,
|
237
|
+
enable_remote_services: Annotated[
|
238
|
+
bool,
|
239
|
+
typer.Option(
|
240
|
+
..., help="Must be enabled when using models connecting to remote services."
|
241
|
+
),
|
242
|
+
] = False,
|
237
243
|
abort_on_error: Annotated[
|
238
244
|
bool,
|
239
245
|
typer.Option(
|
@@ -380,6 +386,7 @@ def convert(
|
|
380
386
|
|
381
387
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
382
388
|
pipeline_options = PdfPipelineOptions(
|
389
|
+
enable_remote_services=enable_remote_services,
|
383
390
|
accelerator_options=accelerator_options,
|
384
391
|
do_ocr=ocr,
|
385
392
|
ocr_options=ocr_options,
|
docling/datamodel/base_models.py
CHANGED
@@ -34,13 +34,14 @@ class InputFormat(str, Enum):
|
|
34
34
|
DOCX = "docx"
|
35
35
|
PPTX = "pptx"
|
36
36
|
HTML = "html"
|
37
|
-
XML_PUBMED = "xml_pubmed"
|
38
37
|
IMAGE = "image"
|
39
38
|
PDF = "pdf"
|
40
39
|
ASCIIDOC = "asciidoc"
|
41
40
|
MD = "md"
|
41
|
+
CSV = "csv"
|
42
42
|
XLSX = "xlsx"
|
43
43
|
XML_USPTO = "xml_uspto"
|
44
|
+
XML_JATS = "xml_jats"
|
44
45
|
JSON_DOCLING = "json_docling"
|
45
46
|
|
46
47
|
|
@@ -58,9 +59,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
58
59
|
InputFormat.PDF: ["pdf"],
|
59
60
|
InputFormat.MD: ["md"],
|
60
61
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
61
|
-
InputFormat.
|
62
|
+
InputFormat.XML_JATS: ["xml", "nxml"],
|
62
63
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
63
64
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
65
|
+
InputFormat.CSV: ["csv"],
|
64
66
|
InputFormat.XLSX: ["xlsx"],
|
65
67
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
66
68
|
InputFormat.JSON_DOCLING: ["json"],
|
@@ -77,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
77
79
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
78
80
|
],
|
79
81
|
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
80
|
-
InputFormat.
|
82
|
+
InputFormat.XML_JATS: ["application/xml"],
|
81
83
|
InputFormat.IMAGE: [
|
82
84
|
"image/png",
|
83
85
|
"image/jpeg",
|
@@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
88
90
|
InputFormat.PDF: ["application/pdf"],
|
89
91
|
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
90
92
|
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
93
|
+
InputFormat.CSV: ["text/csv"],
|
91
94
|
InputFormat.XLSX: [
|
92
95
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
93
96
|
],
|
docling/datamodel/document.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import csv
|
1
2
|
import logging
|
2
3
|
import re
|
3
4
|
from enum import Enum
|
@@ -296,6 +297,7 @@ class _DocumentConversionInput(BaseModel):
|
|
296
297
|
mime = _DocumentConversionInput._mime_from_extension(ext)
|
297
298
|
|
298
299
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
300
|
+
mime = mime or _DocumentConversionInput._detect_csv(content)
|
299
301
|
mime = mime or "text/plain"
|
300
302
|
formats = MimeTypeToFormat.get(mime, [])
|
301
303
|
if formats:
|
@@ -331,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
|
|
331
333
|
):
|
332
334
|
input_format = InputFormat.XML_USPTO
|
333
335
|
|
334
|
-
if (
|
335
|
-
|
336
|
-
|
336
|
+
if InputFormat.XML_JATS in formats and (
|
337
|
+
"JATS-journalpublishing" in xml_doctype
|
338
|
+
or "JATS-archive" in xml_doctype
|
337
339
|
):
|
338
|
-
input_format = InputFormat.
|
340
|
+
input_format = InputFormat.XML_JATS
|
339
341
|
|
340
342
|
elif mime == "text/plain":
|
341
343
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
@@ -352,6 +354,8 @@ class _DocumentConversionInput(BaseModel):
|
|
352
354
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
353
355
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
354
356
|
mime = FormatToMimeType[InputFormat.MD][0]
|
357
|
+
elif ext in FormatToExtensions[InputFormat.CSV]:
|
358
|
+
mime = FormatToMimeType[InputFormat.CSV][0]
|
355
359
|
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
356
360
|
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
357
361
|
elif ext in FormatToExtensions[InputFormat.PDF]:
|
@@ -392,3 +396,32 @@ class _DocumentConversionInput(BaseModel):
|
|
392
396
|
return "application/xml"
|
393
397
|
|
394
398
|
return None
|
399
|
+
|
400
|
+
@staticmethod
|
401
|
+
def _detect_csv(
|
402
|
+
content: bytes,
|
403
|
+
) -> Optional[Literal["text/csv"]]:
|
404
|
+
"""Guess the mime type of a CSV file from its content.
|
405
|
+
|
406
|
+
Args:
|
407
|
+
content: A short piece of a document from its beginning.
|
408
|
+
|
409
|
+
Returns:
|
410
|
+
The mime type of a CSV file, or None if the content does
|
411
|
+
not match any of the format.
|
412
|
+
"""
|
413
|
+
content_str = content.decode("ascii", errors="ignore").strip()
|
414
|
+
|
415
|
+
# Ensure there's at least one newline (CSV is usually multi-line)
|
416
|
+
if "\n" not in content_str:
|
417
|
+
return None
|
418
|
+
|
419
|
+
# Use csv.Sniffer to detect CSV characteristics
|
420
|
+
try:
|
421
|
+
dialect = csv.Sniffer().sniff(content_str)
|
422
|
+
if dialect.delimiter in {",", ";", "\t", "|"}: # Common delimiters
|
423
|
+
return "text/csv"
|
424
|
+
except csv.Error:
|
425
|
+
return None
|
426
|
+
|
427
|
+
return None
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -1,11 +1,26 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
|
+
import re
|
4
|
+
import warnings
|
3
5
|
from enum import Enum
|
4
6
|
from pathlib import Path
|
5
7
|
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
6
8
|
|
7
|
-
from pydantic import
|
8
|
-
|
9
|
+
from pydantic import (
|
10
|
+
AnyUrl,
|
11
|
+
BaseModel,
|
12
|
+
ConfigDict,
|
13
|
+
Field,
|
14
|
+
field_validator,
|
15
|
+
model_validator,
|
16
|
+
validator,
|
17
|
+
)
|
18
|
+
from pydantic_settings import (
|
19
|
+
BaseSettings,
|
20
|
+
PydanticBaseSettingsSource,
|
21
|
+
SettingsConfigDict,
|
22
|
+
)
|
23
|
+
from typing_extensions import deprecated
|
9
24
|
|
10
25
|
_log = logging.getLogger(__name__)
|
11
26
|
|
@@ -25,7 +40,18 @@ class AcceleratorOptions(BaseSettings):
|
|
25
40
|
)
|
26
41
|
|
27
42
|
num_threads: int = 4
|
28
|
-
device: AcceleratorDevice =
|
43
|
+
device: Union[str, AcceleratorDevice] = "auto"
|
44
|
+
|
45
|
+
@field_validator("device")
|
46
|
+
def validate_device(cls, value):
|
47
|
+
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
48
|
+
if value in {d.value for d in AcceleratorDevice} or re.match(
|
49
|
+
r"^cuda(:\d+)?$", value
|
50
|
+
):
|
51
|
+
return value
|
52
|
+
raise ValueError(
|
53
|
+
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
54
|
+
)
|
29
55
|
|
30
56
|
@model_validator(mode="before")
|
31
57
|
@classmethod
|
@@ -41,7 +67,6 @@ class AcceleratorOptions(BaseSettings):
|
|
41
67
|
"""
|
42
68
|
if isinstance(data, dict):
|
43
69
|
input_num_threads = data.get("num_threads")
|
44
|
-
|
45
70
|
# Check if to set the num_threads from the alternative envvar
|
46
71
|
if input_num_threads is None:
|
47
72
|
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
@@ -257,6 +282,7 @@ class PipelineOptions(BaseModel):
|
|
257
282
|
)
|
258
283
|
document_timeout: Optional[float] = None
|
259
284
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
285
|
+
enable_remote_services: bool = False
|
260
286
|
|
261
287
|
|
262
288
|
class PdfPipelineOptions(PipelineOptions):
|
docling/datamodel/settings.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import sys
|
2
2
|
from pathlib import Path
|
3
|
-
from typing import Annotated, Tuple
|
3
|
+
from typing import Annotated, Optional, Tuple
|
4
4
|
|
5
5
|
from pydantic import BaseModel, PlainValidator
|
6
6
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
@@ -62,6 +62,7 @@ class AppSettings(BaseSettings):
|
|
62
62
|
debug: DebugSettings
|
63
63
|
|
64
64
|
cache_dir: Path = Path.home() / ".cache" / "docling"
|
65
|
+
artifacts_path: Optional[Path] = None
|
65
66
|
|
66
67
|
|
67
68
|
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
docling/document_converter.py
CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|
10
10
|
|
11
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
12
12
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
13
|
+
from docling.backend.csv_backend import CsvDocumentBackend
|
13
14
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
14
15
|
from docling.backend.html_backend import HTMLDocumentBackend
|
15
16
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
@@ -17,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
17
18
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
18
19
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
19
20
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
20
|
-
from docling.backend.xml.
|
21
|
+
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
21
22
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
22
23
|
from docling.datamodel.base_models import (
|
23
24
|
ConversionStatus,
|
@@ -61,6 +62,11 @@ class FormatOption(BaseModel):
|
|
61
62
|
return self
|
62
63
|
|
63
64
|
|
65
|
+
class CsvFormatOption(FormatOption):
|
66
|
+
pipeline_cls: Type = SimplePipeline
|
67
|
+
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
68
|
+
|
69
|
+
|
64
70
|
class ExcelFormatOption(FormatOption):
|
65
71
|
pipeline_cls: Type = SimplePipeline
|
66
72
|
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
@@ -96,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
|
|
96
102
|
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
97
103
|
|
98
104
|
|
99
|
-
class
|
105
|
+
class XMLJatsFormatOption(FormatOption):
|
100
106
|
pipeline_cls: Type = SimplePipeline
|
101
|
-
backend: Type[AbstractDocumentBackend] =
|
107
|
+
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
102
108
|
|
103
109
|
|
104
110
|
class ImageFormatOption(FormatOption):
|
@@ -113,6 +119,9 @@ class PdfFormatOption(FormatOption):
|
|
113
119
|
|
114
120
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
115
121
|
format_to_default_options = {
|
122
|
+
InputFormat.CSV: FormatOption(
|
123
|
+
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
|
124
|
+
),
|
116
125
|
InputFormat.XLSX: FormatOption(
|
117
126
|
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
118
127
|
),
|
@@ -134,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
134
143
|
InputFormat.XML_USPTO: FormatOption(
|
135
144
|
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
136
145
|
),
|
137
|
-
InputFormat.
|
138
|
-
pipeline_cls=SimplePipeline, backend=
|
146
|
+
InputFormat.XML_JATS: FormatOption(
|
147
|
+
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
139
148
|
),
|
140
149
|
InputFormat.IMAGE: FormatOption(
|
141
150
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
docling/exceptions.py
CHANGED
docling/models/picture_description_api_model.py
CHANGED
@@ -8,6 +8,7 @@ from PIL import Image
|
|
8
8
|
from pydantic import BaseModel, ConfigDict
|
9
9
|
|
10
10
|
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
|
11
|
+
from docling.exceptions import OperationNotAllowed
|
11
12
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
12
13
|
|
13
14
|
_log = logging.getLogger(__name__)
|
@@ -45,14 +46,20 @@ class ApiResponse(BaseModel):
|
|
45
46
|
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
46
47
|
# elements_batch_size = 4
|
47
48
|
|
48
|
-
def __init__(
|
49
|
+
def __init__(
|
50
|
+
self,
|
51
|
+
enabled: bool,
|
52
|
+
enable_remote_services: bool,
|
53
|
+
options: PictureDescriptionApiOptions,
|
54
|
+
):
|
49
55
|
super().__init__(enabled=enabled, options=options)
|
50
56
|
self.options: PictureDescriptionApiOptions
|
51
57
|
|
52
58
|
if self.enabled:
|
53
|
-
if
|
54
|
-
raise
|
55
|
-
"
|
59
|
+
if not enable_remote_services:
|
60
|
+
raise OperationNotAllowed(
|
61
|
+
"Connections to remote services is only allowed when set explicitly. "
|
62
|
+
"pipeline_options.enable_remote_services=True."
|
56
63
|
)
|
57
64
|
|
58
65
|
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
docling/models/tesseract_ocr_model.py
CHANGED
@@ -22,6 +22,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
22
22
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
23
|
self.reader = None
|
24
24
|
self.osd_reader = None
|
25
|
+
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
25
26
|
|
26
27
|
if self.enabled:
|
27
28
|
install_errmsg = (
|
@@ -57,8 +58,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
57
58
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
58
59
|
lang = "+".join(self.options.lang)
|
59
60
|
|
60
|
-
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
61
|
-
|
62
61
|
if any([l.startswith("script/") for l in self._tesserocr_languages]):
|
63
62
|
self.script_prefix = "script/"
|
64
63
|
else:
|
docling/pipeline/standard_pdf_pipeline.py
CHANGED
@@ -61,6 +61,14 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
61
61
|
artifacts_path: Optional[Path] = None
|
62
62
|
if pipeline_options.artifacts_path is not None:
|
63
63
|
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
64
|
+
elif settings.artifacts_path is not None:
|
65
|
+
artifacts_path = Path(settings.artifacts_path).expanduser()
|
66
|
+
|
67
|
+
if artifacts_path is not None and not artifacts_path.is_dir():
|
68
|
+
raise RuntimeError(
|
69
|
+
f"The value of {artifacts_path=} is not valid. "
|
70
|
+
"When defined, it must point to a folder containing all models required by the pipeline."
|
71
|
+
)
|
64
72
|
|
65
73
|
self.keep_images = (
|
66
74
|
self.pipeline_options.generate_page_images
|
@@ -201,6 +209,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
201
209
|
):
|
202
210
|
return PictureDescriptionApiModel(
|
203
211
|
enabled=self.pipeline_options.do_picture_description,
|
212
|
+
enable_remote_services=self.pipeline_options.enable_remote_services,
|
204
213
|
options=self.pipeline_options.picture_description_options,
|
205
214
|
)
|
206
215
|
elif isinstance(
|
docling/utils/accelerator_utils.py
CHANGED
@@ -7,36 +7,62 @@ from docling.datamodel.pipeline_options import AcceleratorDevice
|
|
7
7
|
_log = logging.getLogger(__name__)
|
8
8
|
|
9
9
|
|
10
|
-
def decide_device(accelerator_device:
|
10
|
+
def decide_device(accelerator_device: str) -> str:
|
11
11
|
r"""
|
12
|
-
Resolve the device based on the acceleration options and the available devices in the system
|
12
|
+
Resolve the device based on the acceleration options and the available devices in the system.
|
13
|
+
|
13
14
|
Rules:
|
14
15
|
1. AUTO: Check for the best available device on the system.
|
15
16
|
2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
|
16
17
|
"""
|
17
|
-
cuda_index = 0
|
18
18
|
device = "cpu"
|
19
19
|
|
20
20
|
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
21
21
|
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
22
22
|
|
23
|
-
if accelerator_device == AcceleratorDevice.AUTO:
|
23
|
+
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
|
24
24
|
if has_cuda:
|
25
|
-
device =
|
25
|
+
device = "cuda:0"
|
26
26
|
elif has_mps:
|
27
27
|
device = "mps"
|
28
28
|
|
29
|
-
|
30
|
-
if
|
31
|
-
if
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
29
|
+
elif accelerator_device.startswith("cuda"):
|
30
|
+
if has_cuda:
|
31
|
+
# if cuda device index specified extract device id
|
32
|
+
parts = accelerator_device.split(":")
|
33
|
+
if len(parts) == 2 and parts[1].isdigit():
|
34
|
+
# select cuda device's id
|
35
|
+
cuda_index = int(parts[1])
|
36
|
+
if cuda_index < torch.cuda.device_count():
|
37
|
+
device = f"cuda:{cuda_index}"
|
38
|
+
else:
|
39
|
+
_log.warning(
|
40
|
+
"CUDA device 'cuda:%d' is not available. Fall back to 'CPU'.",
|
41
|
+
cuda_index,
|
42
|
+
)
|
43
|
+
elif len(parts) == 1: # just "cuda"
|
44
|
+
device = "cuda:0"
|
38
45
|
else:
|
39
|
-
_log.warning(
|
46
|
+
_log.warning(
|
47
|
+
"Invalid CUDA device format '%s'. Fall back to 'CPU'",
|
48
|
+
accelerator_device,
|
49
|
+
)
|
50
|
+
else:
|
51
|
+
_log.warning("CUDA is not available in the system. Fall back to 'CPU'")
|
52
|
+
|
53
|
+
elif accelerator_device == AcceleratorDevice.MPS.value:
|
54
|
+
if has_mps:
|
55
|
+
device = "mps"
|
56
|
+
else:
|
57
|
+
_log.warning("MPS is not available in the system. Fall back to 'CPU'")
|
58
|
+
|
59
|
+
elif accelerator_device == AcceleratorDevice.CPU.value:
|
60
|
+
device = "cpu"
|
61
|
+
|
62
|
+
else:
|
63
|
+
_log.warning(
|
64
|
+
"Unknown device option '%s'. Fall back to 'CPU'", accelerator_device
|
65
|
+
)
|
40
66
|
|
41
67
|
_log.info("Accelerator device: '%s'", device)
|
42
68
|
return device
|
{docling-2.21.0.dist-info → docling-2.23.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.23.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
30
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -41,7 +41,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
|
|
41
41
|
Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
|
42
42
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
43
43
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
44
|
-
Requires-Dist: pillow (>=10.0.0,<
|
44
|
+
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
45
45
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
46
46
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
47
47
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
{docling-2.21.0.dist-info → docling-2.23.0.dist-info}/RECORD
CHANGED
@@ -2,6 +2,7 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
|
5
|
+
docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
|
5
6
|
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
6
7
|
docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
|
7
8
|
docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
|
@@ -14,20 +15,20 @@ docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4y
|
|
14
15
|
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
15
16
|
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
16
17
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
docling/backend/xml/
|
18
|
+
docling/backend/xml/jats_backend.py,sha256=JI1iibmrob9Gv9y7zoFncavQ0oJaGWnQoLkozAIiTQU,27513
|
18
19
|
docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
|
19
20
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
20
21
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
docling/cli/main.py,sha256=
|
22
|
+
docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
|
22
23
|
docling/cli/models.py,sha256=Z4IEuaXE9el5PuI6_6mR4D5Sn3y8WZzBtoIJPi6jL_s,3188
|
23
24
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
24
25
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
|
-
docling/datamodel/base_models.py,sha256=
|
26
|
-
docling/datamodel/document.py,sha256=
|
27
|
-
docling/datamodel/pipeline_options.py,sha256=
|
28
|
-
docling/datamodel/settings.py,sha256=
|
29
|
-
docling/document_converter.py,sha256=
|
30
|
-
docling/exceptions.py,sha256
|
26
|
+
docling/datamodel/base_models.py,sha256=b_8LiDCC4MkpqnKfsJjduH2DSsjADCllBLNB83Tpamw,7099
|
27
|
+
docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
|
28
|
+
docling/datamodel/pipeline_options.py,sha256=5jXSVNGyOy6Ha18Wd80e7pYFmvRZk-2Lkgx0bwMOuq8,10234
|
29
|
+
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
30
|
+
docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
|
31
|
+
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
31
32
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
33
|
docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,2679
|
33
34
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
@@ -39,20 +40,20 @@ docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3
|
|
39
40
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
40
41
|
docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
|
41
42
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
42
|
-
docling/models/picture_description_api_model.py,sha256=
|
43
|
+
docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
|
43
44
|
docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
|
44
45
|
docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
|
45
46
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
46
47
|
docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
|
47
48
|
docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
|
48
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
49
|
+
docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
|
49
50
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
51
|
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
51
52
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
52
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
53
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=Zoe8GGPujha16_TGYBAxcPriEwgYPaJPkp3BwG5XowU,12862
|
53
54
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
54
55
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
|
-
docling/utils/accelerator_utils.py,sha256=
|
56
|
+
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
56
57
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
57
58
|
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
58
59
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
@@ -61,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
|
|
61
62
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
62
63
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
63
64
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
64
|
-
docling-2.
|
65
|
-
docling-2.
|
66
|
-
docling-2.
|
67
|
-
docling-2.
|
68
|
-
docling-2.
|
65
|
+
docling-2.23.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
66
|
+
docling-2.23.0.dist-info/METADATA,sha256=O4EJYC_yjLCFfKnhnzgSW4qGLOHaatDWDXsQS2EJDjU,8720
|
67
|
+
docling-2.23.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
68
|
+
docling-2.23.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
69
|
+
docling-2.23.0.dist-info/RECORD,,
|