docling 2.22.0__py3-none-any.whl → 2.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/jats_backend.py +772 -0
- docling/datamodel/base_models.py +3 -3
- docling/datamodel/document.py +4 -4
- docling/datamodel/pipeline_options.py +29 -4
- docling/document_converter.py +5 -5
- docling/utils/accelerator_utils.py +41 -15
- {docling-2.22.0.dist-info → docling-2.23.0.dist-info}/METADATA +2 -2
- {docling-2.22.0.dist-info → docling-2.23.0.dist-info}/RECORD +11 -11
- docling/backend/xml/pubmed_backend.py +0 -592
- {docling-2.22.0.dist-info → docling-2.23.0.dist-info}/LICENSE +0 -0
- {docling-2.22.0.dist-info → docling-2.23.0.dist-info}/WHEEL +0 -0
- {docling-2.22.0.dist-info → docling-2.23.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/base_models.py
CHANGED
@@ -34,7 +34,6 @@ class InputFormat(str, Enum):
|
|
34
34
|
DOCX = "docx"
|
35
35
|
PPTX = "pptx"
|
36
36
|
HTML = "html"
|
37
|
-
XML_PUBMED = "xml_pubmed"
|
38
37
|
IMAGE = "image"
|
39
38
|
PDF = "pdf"
|
40
39
|
ASCIIDOC = "asciidoc"
|
@@ -42,6 +41,7 @@ class InputFormat(str, Enum):
|
|
42
41
|
CSV = "csv"
|
43
42
|
XLSX = "xlsx"
|
44
43
|
XML_USPTO = "xml_uspto"
|
44
|
+
XML_JATS = "xml_jats"
|
45
45
|
JSON_DOCLING = "json_docling"
|
46
46
|
|
47
47
|
|
@@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
59
59
|
InputFormat.PDF: ["pdf"],
|
60
60
|
InputFormat.MD: ["md"],
|
61
61
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
62
|
-
InputFormat.
|
62
|
+
InputFormat.XML_JATS: ["xml", "nxml"],
|
63
63
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
64
64
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
65
65
|
InputFormat.CSV: ["csv"],
|
@@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
79
79
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
80
80
|
],
|
81
81
|
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
82
|
-
InputFormat.
|
82
|
+
InputFormat.XML_JATS: ["application/xml"],
|
83
83
|
InputFormat.IMAGE: [
|
84
84
|
"image/png",
|
85
85
|
"image/jpeg",
|
docling/datamodel/document.py
CHANGED
@@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
|
|
333
333
|
):
|
334
334
|
input_format = InputFormat.XML_USPTO
|
335
335
|
|
336
|
-
if (
|
337
|
-
|
338
|
-
|
336
|
+
if InputFormat.XML_JATS in formats and (
|
337
|
+
"JATS-journalpublishing" in xml_doctype
|
338
|
+
or "JATS-archive" in xml_doctype
|
339
339
|
):
|
340
|
-
input_format = InputFormat.
|
340
|
+
input_format = InputFormat.XML_JATS
|
341
341
|
|
342
342
|
elif mime == "text/plain":
|
343
343
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
@@ -1,11 +1,26 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
|
+
import re
|
4
|
+
import warnings
|
3
5
|
from enum import Enum
|
4
6
|
from pathlib import Path
|
5
7
|
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
6
8
|
|
7
|
-
from pydantic import
|
8
|
-
|
9
|
+
from pydantic import (
|
10
|
+
AnyUrl,
|
11
|
+
BaseModel,
|
12
|
+
ConfigDict,
|
13
|
+
Field,
|
14
|
+
field_validator,
|
15
|
+
model_validator,
|
16
|
+
validator,
|
17
|
+
)
|
18
|
+
from pydantic_settings import (
|
19
|
+
BaseSettings,
|
20
|
+
PydanticBaseSettingsSource,
|
21
|
+
SettingsConfigDict,
|
22
|
+
)
|
23
|
+
from typing_extensions import deprecated
|
9
24
|
|
10
25
|
_log = logging.getLogger(__name__)
|
11
26
|
|
@@ -25,7 +40,18 @@ class AcceleratorOptions(BaseSettings):
|
|
25
40
|
)
|
26
41
|
|
27
42
|
num_threads: int = 4
|
28
|
-
device: AcceleratorDevice =
|
43
|
+
device: Union[str, AcceleratorDevice] = "auto"
|
44
|
+
|
45
|
+
@field_validator("device")
|
46
|
+
def validate_device(cls, value):
|
47
|
+
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
48
|
+
if value in {d.value for d in AcceleratorDevice} or re.match(
|
49
|
+
r"^cuda(:\d+)?$", value
|
50
|
+
):
|
51
|
+
return value
|
52
|
+
raise ValueError(
|
53
|
+
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
54
|
+
)
|
29
55
|
|
30
56
|
@model_validator(mode="before")
|
31
57
|
@classmethod
|
@@ -41,7 +67,6 @@ class AcceleratorOptions(BaseSettings):
|
|
41
67
|
"""
|
42
68
|
if isinstance(data, dict):
|
43
69
|
input_num_threads = data.get("num_threads")
|
44
|
-
|
45
70
|
# Check if to set the num_threads from the alternative envvar
|
46
71
|
if input_num_threads is None:
|
47
72
|
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
docling/document_converter.py
CHANGED
@@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
18
18
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
19
19
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
20
20
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
21
|
-
from docling.backend.xml.
|
21
|
+
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
22
22
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
23
23
|
from docling.datamodel.base_models import (
|
24
24
|
ConversionStatus,
|
@@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
|
|
102
102
|
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
103
103
|
|
104
104
|
|
105
|
-
class
|
105
|
+
class XMLJatsFormatOption(FormatOption):
|
106
106
|
pipeline_cls: Type = SimplePipeline
|
107
|
-
backend: Type[AbstractDocumentBackend] =
|
107
|
+
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
108
108
|
|
109
109
|
|
110
110
|
class ImageFormatOption(FormatOption):
|
@@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
143
143
|
InputFormat.XML_USPTO: FormatOption(
|
144
144
|
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
145
145
|
),
|
146
|
-
InputFormat.
|
147
|
-
pipeline_cls=SimplePipeline, backend=
|
146
|
+
InputFormat.XML_JATS: FormatOption(
|
147
|
+
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
148
148
|
),
|
149
149
|
InputFormat.IMAGE: FormatOption(
|
150
150
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
@@ -7,36 +7,62 @@ from docling.datamodel.pipeline_options import AcceleratorDevice
|
|
7
7
|
_log = logging.getLogger(__name__)
|
8
8
|
|
9
9
|
|
10
|
-
def decide_device(accelerator_device:
|
10
|
+
def decide_device(accelerator_device: str) -> str:
|
11
11
|
r"""
|
12
|
-
Resolve the device based on the acceleration options and the available devices in the system
|
12
|
+
Resolve the device based on the acceleration options and the available devices in the system.
|
13
|
+
|
13
14
|
Rules:
|
14
15
|
1. AUTO: Check for the best available device on the system.
|
15
16
|
2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
|
16
17
|
"""
|
17
|
-
cuda_index = 0
|
18
18
|
device = "cpu"
|
19
19
|
|
20
20
|
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
21
21
|
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
22
22
|
|
23
|
-
if accelerator_device == AcceleratorDevice.AUTO:
|
23
|
+
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
|
24
24
|
if has_cuda:
|
25
|
-
device =
|
25
|
+
device = "cuda:0"
|
26
26
|
elif has_mps:
|
27
27
|
device = "mps"
|
28
28
|
|
29
|
-
|
30
|
-
if
|
31
|
-
if
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
29
|
+
elif accelerator_device.startswith("cuda"):
|
30
|
+
if has_cuda:
|
31
|
+
# if cuda device index specified extract device id
|
32
|
+
parts = accelerator_device.split(":")
|
33
|
+
if len(parts) == 2 and parts[1].isdigit():
|
34
|
+
# select cuda device's id
|
35
|
+
cuda_index = int(parts[1])
|
36
|
+
if cuda_index < torch.cuda.device_count():
|
37
|
+
device = f"cuda:{cuda_index}"
|
38
|
+
else:
|
39
|
+
_log.warning(
|
40
|
+
"CUDA device 'cuda:%d' is not available. Fall back to 'CPU'.",
|
41
|
+
cuda_index,
|
42
|
+
)
|
43
|
+
elif len(parts) == 1: # just "cuda"
|
44
|
+
device = "cuda:0"
|
38
45
|
else:
|
39
|
-
_log.warning(
|
46
|
+
_log.warning(
|
47
|
+
"Invalid CUDA device format '%s'. Fall back to 'CPU'",
|
48
|
+
accelerator_device,
|
49
|
+
)
|
50
|
+
else:
|
51
|
+
_log.warning("CUDA is not available in the system. Fall back to 'CPU'")
|
52
|
+
|
53
|
+
elif accelerator_device == AcceleratorDevice.MPS.value:
|
54
|
+
if has_mps:
|
55
|
+
device = "mps"
|
56
|
+
else:
|
57
|
+
_log.warning("MPS is not available in the system. Fall back to 'CPU'")
|
58
|
+
|
59
|
+
elif accelerator_device == AcceleratorDevice.CPU.value:
|
60
|
+
device = "cpu"
|
61
|
+
|
62
|
+
else:
|
63
|
+
_log.warning(
|
64
|
+
"Unknown device option '%s'. Fall back to 'CPU'", accelerator_device
|
65
|
+
)
|
40
66
|
|
41
67
|
_log.info("Accelerator device: '%s'", device)
|
42
68
|
return device
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.23.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
30
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -15,7 +15,7 @@ docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4y
|
|
15
15
|
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
16
16
|
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
17
17
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
docling/backend/xml/
|
18
|
+
docling/backend/xml/jats_backend.py,sha256=JI1iibmrob9Gv9y7zoFncavQ0oJaGWnQoLkozAIiTQU,27513
|
19
19
|
docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
|
20
20
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
21
21
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -23,11 +23,11 @@ docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
|
|
23
23
|
docling/cli/models.py,sha256=Z4IEuaXE9el5PuI6_6mR4D5Sn3y8WZzBtoIJPi6jL_s,3188
|
24
24
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
25
25
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
|
-
docling/datamodel/base_models.py,sha256=
|
27
|
-
docling/datamodel/document.py,sha256=
|
28
|
-
docling/datamodel/pipeline_options.py,sha256=
|
26
|
+
docling/datamodel/base_models.py,sha256=b_8LiDCC4MkpqnKfsJjduH2DSsjADCllBLNB83Tpamw,7099
|
27
|
+
docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
|
28
|
+
docling/datamodel/pipeline_options.py,sha256=5jXSVNGyOy6Ha18Wd80e7pYFmvRZk-2Lkgx0bwMOuq8,10234
|
29
29
|
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
30
|
-
docling/document_converter.py,sha256=
|
30
|
+
docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
|
31
31
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
32
32
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
33
|
docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,2679
|
@@ -53,7 +53,7 @@ docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoaz
|
|
53
53
|
docling/pipeline/standard_pdf_pipeline.py,sha256=Zoe8GGPujha16_TGYBAxcPriEwgYPaJPkp3BwG5XowU,12862
|
54
54
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
55
55
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
|
-
docling/utils/accelerator_utils.py,sha256=
|
56
|
+
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
57
57
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
58
58
|
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
59
59
|
docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
|
@@ -62,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
|
|
62
62
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
63
63
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
64
64
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
65
|
-
docling-2.
|
66
|
-
docling-2.
|
67
|
-
docling-2.
|
68
|
-
docling-2.
|
69
|
-
docling-2.
|
65
|
+
docling-2.23.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
66
|
+
docling-2.23.0.dist-info/METADATA,sha256=O4EJYC_yjLCFfKnhnzgSW4qGLOHaatDWDXsQS2EJDjU,8720
|
67
|
+
docling-2.23.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
68
|
+
docling-2.23.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
69
|
+
docling-2.23.0.dist-info/RECORD,,
|