docling 2.22.0__py3-none-any.whl → 2.23.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -34,7 +34,6 @@ class InputFormat(str, Enum):
34
34
  DOCX = "docx"
35
35
  PPTX = "pptx"
36
36
  HTML = "html"
37
- XML_PUBMED = "xml_pubmed"
38
37
  IMAGE = "image"
39
38
  PDF = "pdf"
40
39
  ASCIIDOC = "asciidoc"
@@ -42,6 +41,7 @@ class InputFormat(str, Enum):
42
41
  CSV = "csv"
43
42
  XLSX = "xlsx"
44
43
  XML_USPTO = "xml_uspto"
44
+ XML_JATS = "xml_jats"
45
45
  JSON_DOCLING = "json_docling"
46
46
 
47
47
 
@@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
59
59
  InputFormat.PDF: ["pdf"],
60
60
  InputFormat.MD: ["md"],
61
61
  InputFormat.HTML: ["html", "htm", "xhtml"],
62
- InputFormat.XML_PUBMED: ["xml", "nxml"],
62
+ InputFormat.XML_JATS: ["xml", "nxml"],
63
63
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
64
64
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
65
65
  InputFormat.CSV: ["csv"],
@@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
79
79
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
80
80
  ],
81
81
  InputFormat.HTML: ["text/html", "application/xhtml+xml"],
82
- InputFormat.XML_PUBMED: ["application/xml"],
82
+ InputFormat.XML_JATS: ["application/xml"],
83
83
  InputFormat.IMAGE: [
84
84
  "image/png",
85
85
  "image/jpeg",
@@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
333
333
  ):
334
334
  input_format = InputFormat.XML_USPTO
335
335
 
336
- if (
337
- InputFormat.XML_PUBMED in formats
338
- and "/NLM//DTD JATS" in xml_doctype
336
+ if InputFormat.XML_JATS in formats and (
337
+ "JATS-journalpublishing" in xml_doctype
338
+ or "JATS-archive" in xml_doctype
339
339
  ):
340
- input_format = InputFormat.XML_PUBMED
340
+ input_format = InputFormat.XML_JATS
341
341
 
342
342
  elif mime == "text/plain":
343
343
  if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
@@ -1,11 +1,26 @@
1
1
  import logging
2
2
  import os
3
+ import re
4
+ import warnings
3
5
  from enum import Enum
4
6
  from pathlib import Path
5
7
  from typing import Annotated, Any, Dict, List, Literal, Optional, Union
6
8
 
7
- from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
8
- from pydantic_settings import BaseSettings, SettingsConfigDict
9
+ from pydantic import (
10
+ AnyUrl,
11
+ BaseModel,
12
+ ConfigDict,
13
+ Field,
14
+ field_validator,
15
+ model_validator,
16
+ validator,
17
+ )
18
+ from pydantic_settings import (
19
+ BaseSettings,
20
+ PydanticBaseSettingsSource,
21
+ SettingsConfigDict,
22
+ )
23
+ from typing_extensions import deprecated
9
24
 
10
25
  _log = logging.getLogger(__name__)
11
26
 
@@ -25,7 +40,18 @@ class AcceleratorOptions(BaseSettings):
25
40
  )
26
41
 
27
42
  num_threads: int = 4
28
- device: AcceleratorDevice = AcceleratorDevice.AUTO
43
+ device: Union[str, AcceleratorDevice] = "auto"
44
+
45
+ @field_validator("device")
46
+ def validate_device(cls, value):
47
+ # "auto", "cpu", "cuda", "mps", or "cuda:N"
48
+ if value in {d.value for d in AcceleratorDevice} or re.match(
49
+ r"^cuda(:\d+)?$", value
50
+ ):
51
+ return value
52
+ raise ValueError(
53
+ "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
54
+ )
29
55
 
30
56
  @model_validator(mode="before")
31
57
  @classmethod
@@ -41,7 +67,6 @@ class AcceleratorOptions(BaseSettings):
41
67
  """
42
68
  if isinstance(data, dict):
43
69
  input_num_threads = data.get("num_threads")
44
-
45
70
  # Check if to set the num_threads from the alternative envvar
46
71
  if input_num_threads is None:
47
72
  docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
@@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
18
18
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
19
19
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
20
20
  from docling.backend.msword_backend import MsWordDocumentBackend
21
- from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
21
+ from docling.backend.xml.jats_backend import JatsDocumentBackend
22
22
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
23
23
  from docling.datamodel.base_models import (
24
24
  ConversionStatus,
@@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
102
102
  backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
103
103
 
104
104
 
105
- class XMLPubMedFormatOption(FormatOption):
105
+ class XMLJatsFormatOption(FormatOption):
106
106
  pipeline_cls: Type = SimplePipeline
107
- backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
107
+ backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
108
108
 
109
109
 
110
110
  class ImageFormatOption(FormatOption):
@@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
143
143
  InputFormat.XML_USPTO: FormatOption(
144
144
  pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
145
145
  ),
146
- InputFormat.XML_PUBMED: FormatOption(
147
- pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
146
+ InputFormat.XML_JATS: FormatOption(
147
+ pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
148
148
  ),
149
149
  InputFormat.IMAGE: FormatOption(
150
150
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
@@ -7,36 +7,62 @@ from docling.datamodel.pipeline_options import AcceleratorDevice
7
7
  _log = logging.getLogger(__name__)
8
8
 
9
9
 
10
- def decide_device(accelerator_device: AcceleratorDevice) -> str:
10
+ def decide_device(accelerator_device: str) -> str:
11
11
  r"""
12
- Resolve the device based on the acceleration options and the available devices in the system
12
+ Resolve the device based on the acceleration options and the available devices in the system.
13
+
13
14
  Rules:
14
15
  1. AUTO: Check for the best available device on the system.
15
16
  2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
16
17
  """
17
- cuda_index = 0
18
18
  device = "cpu"
19
19
 
20
20
  has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
21
21
  has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
22
22
 
23
- if accelerator_device == AcceleratorDevice.AUTO:
23
+ if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
24
24
  if has_cuda:
25
- device = f"cuda:{cuda_index}"
25
+ device = "cuda:0"
26
26
  elif has_mps:
27
27
  device = "mps"
28
28
 
29
- else:
30
- if accelerator_device == AcceleratorDevice.CUDA:
31
- if has_cuda:
32
- device = f"cuda:{cuda_index}"
33
- else:
34
- _log.warning("CUDA is not available in the system. Fall back to 'CPU'")
35
- elif accelerator_device == AcceleratorDevice.MPS:
36
- if has_mps:
37
- device = "mps"
29
+ elif accelerator_device.startswith("cuda"):
30
+ if has_cuda:
31
+ # if cuda device index specified extract device id
32
+ parts = accelerator_device.split(":")
33
+ if len(parts) == 2 and parts[1].isdigit():
34
+ # select cuda device's id
35
+ cuda_index = int(parts[1])
36
+ if cuda_index < torch.cuda.device_count():
37
+ device = f"cuda:{cuda_index}"
38
+ else:
39
+ _log.warning(
40
+ "CUDA device 'cuda:%d' is not available. Fall back to 'CPU'.",
41
+ cuda_index,
42
+ )
43
+ elif len(parts) == 1: # just "cuda"
44
+ device = "cuda:0"
38
45
  else:
39
- _log.warning("MPS is not available in the system. Fall back to 'CPU'")
46
+ _log.warning(
47
+ "Invalid CUDA device format '%s'. Fall back to 'CPU'",
48
+ accelerator_device,
49
+ )
50
+ else:
51
+ _log.warning("CUDA is not available in the system. Fall back to 'CPU'")
52
+
53
+ elif accelerator_device == AcceleratorDevice.MPS.value:
54
+ if has_mps:
55
+ device = "mps"
56
+ else:
57
+ _log.warning("MPS is not available in the system. Fall back to 'CPU'")
58
+
59
+ elif accelerator_device == AcceleratorDevice.CPU.value:
60
+ device = "cpu"
61
+
62
+ else:
63
+ _log.warning(
64
+ "Unknown device option '%s'. Fall back to 'CPU'", accelerator_device
65
+ )
40
66
 
41
67
  _log.info("Accelerator device: '%s'", device)
42
68
  return device
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.22.0
3
+ Version: 2.23.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
30
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
- Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -15,7 +15,7 @@ docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4y
15
15
  docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
16
16
  docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
17
17
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
18
+ docling/backend/xml/jats_backend.py,sha256=JI1iibmrob9Gv9y7zoFncavQ0oJaGWnQoLkozAIiTQU,27513
19
19
  docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
20
20
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
21
21
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -23,11 +23,11 @@ docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
23
23
  docling/cli/models.py,sha256=Z4IEuaXE9el5PuI6_6mR4D5Sn3y8WZzBtoIJPi6jL_s,3188
24
24
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
25
25
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- docling/datamodel/base_models.py,sha256=_TPj-ADts3Qsc6vx1dpwZZnrOQCelqXOYIBCkK7A8FM,7107
27
- docling/datamodel/document.py,sha256=Aeqpm7d_CCV_2mwMhvNGVeGPWtWN9DJ5WAE4sjqN-dw,14530
28
- docling/datamodel/pipeline_options.py,sha256=pWCGtK0HEfltTR9Z14BYdS1-Zg6gZq9RlIHA014DpAk,9683
26
+ docling/datamodel/base_models.py,sha256=b_8LiDCC4MkpqnKfsJjduH2DSsjADCllBLNB83Tpamw,7099
27
+ docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
28
+ docling/datamodel/pipeline_options.py,sha256=5jXSVNGyOy6Ha18Wd80e7pYFmvRZk-2Lkgx0bwMOuq8,10234
29
29
  docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
30
- docling/document_converter.py,sha256=DX_bMqYyVO6rQvpf2JEy95HDR1QXT51v3T3Xn40pwjE,13196
30
+ docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
31
31
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
32
32
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,2679
@@ -53,7 +53,7 @@ docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoaz
53
53
  docling/pipeline/standard_pdf_pipeline.py,sha256=Zoe8GGPujha16_TGYBAxcPriEwgYPaJPkp3BwG5XowU,12862
54
54
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
55
55
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
- docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
56
+ docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
57
57
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
58
58
  docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
59
59
  docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
@@ -62,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
62
62
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
63
63
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
64
64
  docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
65
- docling-2.22.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
- docling-2.22.0.dist-info/METADATA,sha256=eKFbLHbqOA9xMt4c0Pdqwh7tVBOXSqdSWh_MP4ztkeU,8720
67
- docling-2.22.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
- docling-2.22.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
- docling-2.22.0.dist-info/RECORD,,
65
+ docling-2.23.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
+ docling-2.23.0.dist-info/METADATA,sha256=O4EJYC_yjLCFfKnhnzgSW4qGLOHaatDWDXsQS2EJDjU,8720
67
+ docling-2.23.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
+ docling-2.23.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
+ docling-2.23.0.dist-info/RECORD,,