docling-core 2.5.1__py3-none-any.whl → 2.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
37
37
  from docling_core.types.doc import BoundingBox, Size
38
38
  from docling_core.types.doc.base import ImageRefMode
39
39
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
40
+ from docling_core.types.doc.utils import relative_path
40
41
  from docling_core.types.legacy_doc.tokens import DocumentToken
41
- from docling_core.utils.file import relative_path
42
42
 
43
43
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
44
44
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -810,14 +810,8 @@ class PictureItem(FloatingItem):
810
810
  ):
811
811
  return default_response
812
812
 
813
- if (
814
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
815
- ) or isinstance(self.image.uri, Path):
816
- text = f"\n![Image]({str(self.image.uri)})\n"
817
- return text
818
-
819
- else:
820
- return default_response
813
+ text = f"\n![Image]({str(self.image.uri)})\n"
814
+ return text
821
815
 
822
816
  else:
823
817
  return default_response
@@ -870,14 +864,8 @@ class PictureItem(FloatingItem):
870
864
  ):
871
865
  return default_response
872
866
 
873
- if (
874
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
875
- ) or isinstance(self.image.uri, Path):
876
- img_text = f'<img src="{str(self.image.uri)}">'
877
- return f"<figure>{caption_text}{img_text}</figure>"
878
-
879
- else:
880
- return default_response
867
+ img_text = f'<img src="{str(self.image.uri)}">'
868
+ return f"<figure>{caption_text}{img_text}</figure>"
881
869
 
882
870
  else:
883
871
  return default_response
@@ -1211,19 +1199,58 @@ class DoclingDocument(BaseModel):
1211
1199
  """DoclingDocument."""
1212
1200
 
1213
1201
  _HTML_DEFAULT_HEAD: str = r"""<head>
1202
+ <link rel="icon" type="image/png"
1203
+ href="https://ds4sd.github.io/docling/assets/logo.png"/>
1214
1204
  <meta charset="UTF-8">
1205
+ <title>
1206
+ Powered by Docling
1207
+ </title>
1215
1208
  <style>
1209
+ html {
1210
+ background-color: LightGray;
1211
+ }
1212
+ body {
1213
+ margin: 0 auto;
1214
+ width:800px;
1215
+ padding: 30px;
1216
+ background-color: White;
1217
+ font-family: Arial, sans-serif;
1218
+ box-shadow: 10px 10px 10px grey;
1219
+ }
1220
+ figure{
1221
+ display: block;
1222
+ width: 100%;
1223
+ margin: 0px;
1224
+ margin-top: 10px;
1225
+ margin-bottom: 10px;
1226
+ }
1227
+ img {
1228
+ display: block;
1229
+ margin: auto;
1230
+ margin-top: 10px;
1231
+ margin-bottom: 10px;
1232
+ max-width: 640px;
1233
+ max-height: 640px;
1234
+ }
1216
1235
  table {
1217
- border-collapse: separate;
1218
- /* Maintain separate borders */
1219
- border-spacing: 5px; /*
1220
- Space between cells */
1221
- width: 50%;
1236
+ min-width:500px;
1237
+ background-color: White;
1238
+ border-collapse: collapse;
1239
+ cell-padding: 5px;
1240
+ margin: auto;
1241
+ margin-top: 10px;
1242
+ margin-bottom: 10px;
1222
1243
  }
1223
1244
  th, td {
1224
1245
  border: 1px solid black;
1225
- /* Add lines etween cells */
1226
- padding: 8px; }
1246
+ padding: 8px;
1247
+ }
1248
+ th {
1249
+ font-weight: bold;
1250
+ }
1251
+ table tr:nth-child(even) td{
1252
+ background-color: LightGray;
1253
+ }
1227
1254
  </style>
1228
1255
  </head>"""
1229
1256
 
@@ -1733,6 +1760,20 @@ class DoclingDocument(BaseModel):
1733
1760
  with open(filename, "w") as fw:
1734
1761
  json.dump(out, fw, indent=indent)
1735
1762
 
1763
+ @classmethod
1764
+ def load_from_json(cls, filename: Path) -> "DoclingDocument":
1765
+ """load_from_json.
1766
+
1767
+ :param filename: The filename to load a saved DoclingDocument from a .json.
1768
+ :type filename: Path
1769
+
1770
+ :returns: The loaded DoclingDocument.
1771
+ :rtype: DoclingDocument
1772
+
1773
+ """
1774
+ with open(filename, "r") as f:
1775
+ return cls.model_validate(json.loads(f.read()))
1776
+
1736
1777
  def save_as_yaml(
1737
1778
  self,
1738
1779
  filename: Path,
@@ -1825,26 +1866,28 @@ class DoclingDocument(BaseModel):
1825
1866
  from_element and to_element; defaulting to the whole document.
1826
1867
 
1827
1868
  :param delim: Delimiter to use when concatenating the various
1828
- Markdown parts. Defaults to "\n\n".
1829
- :type delim: str
1869
+ Markdown parts. (Default value = "\n").
1870
+ :type delim: str = "\n"
1830
1871
  :param from_element: Body slicing start index (inclusive).
1831
- Defaults to 0.
1832
- :type from_element: int
1872
+ (Default value = 0).
1873
+ :type from_element: int = 0
1833
1874
  :param to_element: Body slicing stop index
1834
- (exclusive). Defaults to 0maxint.
1835
- :type to_element: int
1836
- :param delim: str: (Default value = "\n\n")
1837
- :param labels: set[DocItemLabel]
1838
- :param "subtitle-level-1":
1839
- :param "paragraph":
1840
- :param "caption":
1841
- :param "table":
1842
- :param "Text":
1843
- :param "text":
1844
- :param strict_text: bool: (Default value = False)
1845
- :param image_placeholder str: (Default value = "<!-- image -->")
1846
- the placeholder to include to position images in the markdown.
1847
- :param indent: int (default=4): indent of the nested lists
1875
+ (exclusive). (Default value = maxint).
1876
+ :type to_element: int = sys.maxsize
1877
+ :param labels: The set of document labels to include in the export.
1878
+ :type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS
1879
+ :param strict_text: bool: Whether to only include the text content
1880
+ of the document. (Default value = False).
1881
+ :type strict_text: bool = False
1882
+ :param image_placeholder: The placeholder to include to position
1883
+ images in the markdown. (Default value = "\<!-- image --\>").
1884
+ :type image_placeholder: str = "<!-- image -->"
1885
+ :param image_mode: The mode to use for including images in the
1886
+ markdown. (Default value = ImageRefMode.PLACEHOLDER).
1887
+ :type image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
1888
+ :param indent: The indent in spaces of the nested lists.
1889
+ (Default value = 4).
1890
+ :type indent: int = 4
1848
1891
  :returns: The exported Markdown representation.
1849
1892
  :rtype: str
1850
1893
  """
@@ -0,0 +1,48 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Utils for document types."""
7
+
8
+ from pathlib import Path
9
+
10
+
11
+ def relative_path(src: Path, target: Path) -> Path:
12
+ """Compute the relative path from `src` to `target`.
13
+
14
+ Args:
15
+ src (str | Path): The source directory or file path (must be absolute).
16
+ target (str | Path): The target directory or file path (must be absolute).
17
+
18
+ Returns:
19
+ Path: The relative path from `src` to `target`.
20
+
21
+ Raises:
22
+ ValueError: If either `src` or `target` is not an absolute path.
23
+ """
24
+ src = Path(src).resolve()
25
+ target = Path(target).resolve()
26
+
27
+ # Ensure both paths are absolute
28
+ if not src.is_absolute():
29
+ raise ValueError(f"The source path must be absolute: {src}")
30
+ if not target.is_absolute():
31
+ raise ValueError(f"The target path must be absolute: {target}")
32
+
33
+ # Find the common ancestor
34
+ common_parts = []
35
+ for src_part, target_part in zip(src.parts, target.parts):
36
+ if src_part == target_part:
37
+ common_parts.append(src_part)
38
+ else:
39
+ break
40
+
41
+ # Determine the path to go up from src to the common ancestor
42
+ up_segments = [".."] * (len(src.parts) - len(common_parts))
43
+
44
+ # Add the path from the common ancestor to the target
45
+ down_segments = target.parts[len(common_parts) :]
46
+
47
+ # Combine and return the result
48
+ return Path(*up_segments, *down_segments)
@@ -0,0 +1,19 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for io."""
7
+
8
+ from io import BytesIO
9
+
10
+ from pydantic import BaseModel, ConfigDict
11
+
12
+
13
+ class DocumentStream(BaseModel):
14
+ """Wrapper class for a bytes stream with a filename."""
15
+
16
+ model_config = ConfigDict(arbitrary_types_allowed=True)
17
+
18
+ name: str
19
+ stream: BytesIO
@@ -7,28 +7,63 @@
7
7
 
8
8
  import importlib
9
9
  import tempfile
10
+ from io import BytesIO
10
11
  from pathlib import Path
11
12
  from typing import Dict, Optional, Union
12
13
 
13
14
  import requests
14
15
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
16
+ from typing_extensions import deprecated
15
17
 
18
+ from docling_core.types.doc.utils import relative_path # noqa
19
+ from docling_core.types.io import DocumentStream
16
20
 
17
- def resolve_file_source(
18
- source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
19
- ) -> Path:
20
- """Resolves the source (URL, path) of a file to a local file path.
21
21
 
22
- If a URL is provided, the content is first downloaded to a temporary local file.
22
+ def resolve_remote_filename(
23
+ http_url: AnyHttpUrl,
24
+ response_headers: Dict[str, str],
25
+ fallback_filename="file",
26
+ ) -> str:
27
+ """Resolves the filename from a remote url and its response headers.
28
+
29
+ Args:
30
+ source AnyHttpUrl: The source http url.
31
+ response_headers Dict: Headers received while fetching the remote file.
32
+ fallback_filename str: Filename to use in case none can be determined.
33
+
34
+ Returns:
35
+ str: The actual filename of the remote url.
36
+ """
37
+ fname = None
38
+ # try to get filename from response header
39
+ if cont_disp := response_headers.get("Content-Disposition"):
40
+ for par in cont_disp.strip().split(";"):
41
+ # currently only handling directive "filename" (not "*filename")
42
+ if (split := par.split("=")) and split[0].strip() == "filename":
43
+ fname = "=".join(split[1:]).strip().strip("'\"") or None
44
+ break
45
+ # otherwise, use name from URL:
46
+ if fname is None:
47
+ fname = Path(http_url.path or "").name or fallback_filename
48
+
49
+ return fname
50
+
51
+
52
+ def resolve_source_to_stream(
53
+ source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
54
+ ) -> DocumentStream:
55
+ """Resolves the source (URL, path) of a file to a binary stream.
23
56
 
24
57
  Args:
25
58
  source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
59
+ headers (Dict | None): Optional set of headers to use for fetching
60
+ the remote URL.
26
61
 
27
62
  Raises:
28
63
  ValueError: If source is of unexpected type.
29
64
 
30
65
  Returns:
31
- Path: The local file path.
66
+ DocumentStream: The resolved file loaded as a stream.
32
67
  """
33
68
  try:
34
69
  http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
@@ -44,64 +79,93 @@ def resolve_file_source(
44
79
  # fetch the page
45
80
  res = requests.get(http_url, stream=True, headers=req_headers)
46
81
  res.raise_for_status()
47
- fname = None
48
- # try to get filename from response header
49
- if cont_disp := res.headers.get("Content-Disposition"):
50
- for par in cont_disp.strip().split(";"):
51
- # currently only handling directive "filename" (not "*filename")
52
- if (split := par.split("=")) and split[0].strip() == "filename":
53
- fname = "=".join(split[1:]).strip().strip("'\"") or None
54
- break
55
- # otherwise, use name from URL:
56
- if fname is None:
57
- fname = Path(http_url.path or "").name or "file"
58
- local_path = Path(tempfile.mkdtemp()) / fname
59
- with open(local_path, "wb") as f:
60
- for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
61
- f.write(chunk)
82
+ fname = resolve_remote_filename(http_url=http_url, response_headers=res.headers)
83
+
84
+ stream = BytesIO(res.content)
85
+ doc_stream = DocumentStream(name=fname, stream=stream)
62
86
  except ValidationError:
63
87
  try:
64
88
  local_path = TypeAdapter(Path).validate_python(source)
89
+ stream = BytesIO(local_path.read_bytes())
90
+ doc_stream = DocumentStream(name=local_path.name, stream=stream)
65
91
  except ValidationError:
66
92
  raise ValueError(f"Unexpected source type encountered: {type(source)}")
93
+ return doc_stream
94
+
95
+
96
+ def _resolve_source_to_path(
97
+ source: Union[Path, AnyHttpUrl, str],
98
+ headers: Optional[Dict[str, str]] = None,
99
+ workdir: Optional[Path] = None,
100
+ ) -> Path:
101
+ doc_stream = resolve_source_to_stream(source=source, headers=headers)
102
+
103
+ # use a temporary directory if not specified
104
+ if workdir is None:
105
+ workdir = Path(tempfile.mkdtemp())
106
+
107
+ # create the parent workdir if it doesn't exist
108
+ workdir.mkdir(exist_ok=True, parents=True)
109
+
110
+ # save result to a local file
111
+ local_path = workdir / doc_stream.name
112
+ with local_path.open("wb") as f:
113
+ f.write(doc_stream.stream.read())
114
+
67
115
  return local_path
68
116
 
69
117
 
70
- def relative_path(src: Path, target: Path) -> Path:
71
- """Compute the relative path from `src` to `target`.
118
+ def resolve_source_to_path(
119
+ source: Union[Path, AnyHttpUrl, str],
120
+ headers: Optional[Dict[str, str]] = None,
121
+ workdir: Optional[Path] = None,
122
+ ) -> Path:
123
+ """Resolves the source (URL, path) of a file to a local file path.
124
+
125
+ If a URL is provided, the content is first downloaded to a local file, located in
126
+ the provided workdir or in a temporary directory if no workdir provided.
72
127
 
73
128
  Args:
74
- src (str | Path): The source directory or file path (must be absolute).
75
- target (str | Path): The target directory or file path (must be absolute).
129
+ source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
130
+ headers (Dict | None): Optional set of headers to use for fetching
131
+ the remote URL.
132
+ workdir (Path | None): If set, the work directory where the file will
133
+ be downloaded, otherwise a temp dir will be used.
134
+
135
+ Raises:
136
+ ValueError: If source is of unexpected type.
76
137
 
77
138
  Returns:
78
- Path: The relative path from `src` to `target`.
139
+ Path: The local file path.
140
+ """
141
+ return _resolve_source_to_path(
142
+ source=source,
143
+ headers=headers,
144
+ workdir=workdir,
145
+ )
146
+
147
+
148
+ @deprecated("Use `resolve_source_to_path()` or `resolve_source_to_stream()` instead")
149
+ def resolve_file_source(
150
+ source: Union[Path, AnyHttpUrl, str],
151
+ headers: Optional[Dict[str, str]] = None,
152
+ ) -> Path:
153
+ """Resolves the source (URL, path) of a file to a local file path.
154
+
155
+ If a URL is provided, the content is first downloaded to a temporary local file.
156
+
157
+ Args:
158
+ source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
159
+ headers (Dict | None): Optional set of headers to use for fetching
160
+ the remote URL.
79
161
 
80
162
  Raises:
81
- ValueError: If either `src` or `target` is not an absolute path.
163
+ ValueError: If source is of unexpected type.
164
+
165
+ Returns:
166
+ Path: The local file path.
82
167
  """
83
- src = Path(src).resolve()
84
- target = Path(target).resolve()
85
-
86
- # Ensure both paths are absolute
87
- if not src.is_absolute():
88
- raise ValueError(f"The source path must be absolute: {src}")
89
- if not target.is_absolute():
90
- raise ValueError(f"The target path must be absolute: {target}")
91
-
92
- # Find the common ancestor
93
- common_parts = []
94
- for src_part, target_part in zip(src.parts, target.parts):
95
- if src_part == target_part:
96
- common_parts.append(src_part)
97
- else:
98
- break
99
-
100
- # Determine the path to go up from src to the common ancestor
101
- up_segments = [".."] * (len(src.parts) - len(common_parts))
102
-
103
- # Add the path from the common ancestor to the target
104
- down_segments = target.parts[len(common_parts) :]
105
-
106
- # Combine and return the result
107
- return Path(*up_segments, *down_segments)
168
+ return _resolve_source_to_path(
169
+ source=source,
170
+ headers=headers,
171
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.5.1
3
+ Version: 2.6.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -32,6 +32,7 @@ Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<2.10)
33
33
  Requires-Dist: pyyaml (>=5.1,<7.0.0)
34
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
35
+ Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
35
36
  Project-URL: Repository, https://github.com/DS4SD/docling-core
36
37
  Description-Content-Type: text/markdown
37
38
 
@@ -21,10 +21,12 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
21
21
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
22
22
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
23
23
  docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
24
- docling_core/types/doc/document.py,sha256=apWwh2ixsVc0axtqJec3xKNuYmEwFDB00fQ2vJdKgBA,86018
24
+ docling_core/types/doc/document.py,sha256=8qVhet6eQtvju286zUkdOU0NXnkZ0AoOVAysMEZ3Aws,87099
25
25
  docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
26
+ docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
26
27
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
27
28
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
29
+ docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
28
30
  docling_core/types/legacy_doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
29
31
  docling_core/types/legacy_doc/base.py,sha256=l8NKCuORUQ1ebjdGWpj6b30oQEvtErLsIHKQHbbJiPg,14683
30
32
  docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
@@ -44,13 +46,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
44
46
  docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
45
47
  docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
46
48
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
47
- docling_core/utils/file.py,sha256=ug4-z0KuthkEb_d5YDRPbY79PWfNSj9GYsi16xF2sDA,3699
49
+ docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
48
50
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
49
51
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
50
52
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
51
53
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
52
- docling_core-2.5.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
- docling_core-2.5.1.dist-info/METADATA,sha256=9K3Hip_Uev5copWGL0ragXG-N5uFHQiF2SNk0se2m_o,5468
54
- docling_core-2.5.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- docling_core-2.5.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
- docling_core-2.5.1.dist-info/RECORD,,
54
+ docling_core-2.6.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
55
+ docling_core-2.6.1.dist-info/METADATA,sha256=aHtmbajidCAFKmJiAq-sSW-rSjZhHAMsqSEfRrpYBes,5519
56
+ docling_core-2.6.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
57
+ docling_core-2.6.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
58
+ docling_core-2.6.1.dist-info/RECORD,,