docling-core 2.5.0__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (57) hide show
  1. {docling_core-2.5.0 → docling_core-2.6.0}/PKG-INFO +2 -1
  2. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/document.py +97 -44
  3. docling_core-2.6.0/docling_core/types/io/__init__.py +19 -0
  4. docling_core-2.6.0/docling_core/utils/file.py +210 -0
  5. {docling_core-2.5.0 → docling_core-2.6.0}/pyproject.toml +2 -1
  6. docling_core-2.5.0/docling_core/utils/file.py +0 -107
  7. {docling_core-2.5.0 → docling_core-2.6.0}/LICENSE +0 -0
  8. {docling_core-2.5.0 → docling_core-2.6.0}/README.md +0 -0
  9. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/__init__.py +0 -0
  10. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/py.typed +0 -0
  11. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/package.py +0 -0
  24. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/base.py +0 -0
  30. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/__init__.py +0 -0
  31. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/base.py +0 -0
  32. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/labels.py +0 -0
  33. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/gen/__init__.py +0 -0
  34. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/gen/generic.py +0 -0
  35. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  36. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/base.py +0 -0
  37. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  38. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  39. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  40. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/document.py +0 -0
  41. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  42. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/nlp/__init__.py +0 -0
  43. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/nlp/qa.py +0 -0
  44. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/nlp/qa_labels.py +0 -0
  45. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/__init__.py +0 -0
  46. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/attribute.py +0 -0
  47. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/base.py +0 -0
  48. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/predicate.py +0 -0
  49. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/record.py +0 -0
  50. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/statement.py +0 -0
  51. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/subject.py +0 -0
  52. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/__init__.py +0 -0
  53. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/alias.py +0 -0
  54. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/generate_docs.py +0 -0
  55. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/generate_jsonschema.py +0 -0
  56. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/validate.py +0 -0
  57. {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.5.0
3
+ Version: 2.6.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -32,6 +32,7 @@ Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<2.10)
33
33
  Requires-Dist: pyyaml (>=5.1,<7.0.0)
34
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
35
+ Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
35
36
  Project-URL: Repository, https://github.com/DS4SD/docling-core
36
37
  Description-Content-Type: text/markdown
37
38
 
@@ -10,6 +10,7 @@ import re
10
10
  import sys
11
11
  import textwrap
12
12
  import typing
13
+ import warnings
13
14
  from io import BytesIO
14
15
  from pathlib import Path
15
16
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -809,14 +810,8 @@ class PictureItem(FloatingItem):
809
810
  ):
810
811
  return default_response
811
812
 
812
- if (
813
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
814
- ) or isinstance(self.image.uri, Path):
815
- text = f"\n![Image]({str(self.image.uri)})\n"
816
- return text
817
-
818
- else:
819
- return default_response
813
+ text = f"\n![Image]({str(self.image.uri)})\n"
814
+ return text
820
815
 
821
816
  else:
822
817
  return default_response
@@ -869,14 +864,8 @@ class PictureItem(FloatingItem):
869
864
  ):
870
865
  return default_response
871
866
 
872
- if (
873
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
874
- ) or isinstance(self.image.uri, Path):
875
- img_text = f'<img src="{str(self.image.uri)}">'
876
- return f"<figure>{caption_text}{img_text}</figure>"
877
-
878
- else:
879
- return default_response
867
+ img_text = f'<img src="{str(self.image.uri)}">'
868
+ return f"<figure>{caption_text}{img_text}</figure>"
880
869
 
881
870
  else:
882
871
  return default_response
@@ -1008,14 +997,23 @@ class TableItem(FloatingItem):
1008
997
  )
1009
998
  return md_table
1010
999
 
1011
- def export_to_html(self, doc: "DoclingDocument", add_caption: bool = True) -> str:
1000
+ def export_to_html(
1001
+ self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
1002
+ ) -> str:
1012
1003
  """Export the table as html."""
1004
+ if doc is None:
1005
+ warnings.warn(
1006
+ "The `doc` argument will be mandatory in a future version. "
1007
+ "It must be provided to include a caption.",
1008
+ DeprecationWarning,
1009
+ )
1010
+
1013
1011
  body = ""
1014
1012
  nrows = self.data.num_rows
1015
1013
  ncols = self.data.num_cols
1016
1014
 
1017
1015
  text = ""
1018
- if add_caption and len(self.captions):
1016
+ if doc is not None and add_caption and len(self.captions):
1019
1017
  text = self.caption_text(doc)
1020
1018
 
1021
1019
  if len(self.data.table_cells) == 0:
@@ -1201,19 +1199,58 @@ class DoclingDocument(BaseModel):
1201
1199
  """DoclingDocument."""
1202
1200
 
1203
1201
  _HTML_DEFAULT_HEAD: str = r"""<head>
1202
+ <link rel="icon" type="image/png"
1203
+ href="https://ds4sd.github.io/docling/assets/logo.png"/>
1204
1204
  <meta charset="UTF-8">
1205
+ <title>
1206
+ Powered by Docling
1207
+ </title>
1205
1208
  <style>
1209
+ html {
1210
+ background-color: LightGray;
1211
+ }
1212
+ body {
1213
+ margin: 0 auto;
1214
+ width:800px;
1215
+ padding: 30px;
1216
+ background-color: White;
1217
+ font-family: Arial, sans-serif;
1218
+ box-shadow: 10px 10px 10px grey;
1219
+ }
1220
+ figure{
1221
+ display: block;
1222
+ width: 100%;
1223
+ margin: 0px;
1224
+ margin-top: 10px;
1225
+ margin-bottom: 10px;
1226
+ }
1227
+ img {
1228
+ display: block;
1229
+ margin: auto;
1230
+ margin-top: 10px;
1231
+ margin-bottom: 10px;
1232
+ max-width: 640px;
1233
+ max-height: 640px;
1234
+ }
1206
1235
  table {
1207
- border-collapse: separate;
1208
- /* Maintain separate borders */
1209
- border-spacing: 5px; /*
1210
- Space between cells */
1211
- width: 50%;
1236
+ min-width:500px;
1237
+ background-color: White;
1238
+ border-collapse: collapse;
1239
+ cell-padding: 5px;
1240
+ margin: auto;
1241
+ margin-top: 10px;
1242
+ margin-bottom: 10px;
1212
1243
  }
1213
1244
  th, td {
1214
1245
  border: 1px solid black;
1215
- /* Add lines etween cells */
1216
- padding: 8px; }
1246
+ padding: 8px;
1247
+ }
1248
+ th {
1249
+ font-weight: bold;
1250
+ }
1251
+ table tr:nth-child(even) td{
1252
+ background-color: LightGray;
1253
+ }
1217
1254
  </style>
1218
1255
  </head>"""
1219
1256
 
@@ -1723,6 +1760,20 @@ class DoclingDocument(BaseModel):
1723
1760
  with open(filename, "w") as fw:
1724
1761
  json.dump(out, fw, indent=indent)
1725
1762
 
1763
+ @classmethod
1764
+ def load_from_json(cls, filename: Path) -> "DoclingDocument":
1765
+ """load_from_json.
1766
+
1767
+ :param filename: The filename to load a saved DoclingDocument from a .json.
1768
+ :type filename: Path
1769
+
1770
+ :returns: The loaded DoclingDocument.
1771
+ :rtype: DoclingDocument
1772
+
1773
+ """
1774
+ with open(filename, "r") as f:
1775
+ return cls.model_validate(json.loads(f.read()))
1776
+
1726
1777
  def save_as_yaml(
1727
1778
  self,
1728
1779
  filename: Path,
@@ -1815,26 +1866,28 @@ class DoclingDocument(BaseModel):
1815
1866
  from_element and to_element; defaulting to the whole document.
1816
1867
 
1817
1868
  :param delim: Delimiter to use when concatenating the various
1818
- Markdown parts. Defaults to "\n\n".
1819
- :type delim: str
1869
+ Markdown parts. (Default value = "\n").
1870
+ :type delim: str = "\n"
1820
1871
  :param from_element: Body slicing start index (inclusive).
1821
- Defaults to 0.
1822
- :type from_element: int
1872
+ (Default value = 0).
1873
+ :type from_element: int = 0
1823
1874
  :param to_element: Body slicing stop index
1824
- (exclusive). Defaults to 0maxint.
1825
- :type to_element: int
1826
- :param delim: str: (Default value = "\n\n")
1827
- :param labels: set[DocItemLabel]
1828
- :param "subtitle-level-1":
1829
- :param "paragraph":
1830
- :param "caption":
1831
- :param "table":
1832
- :param "Text":
1833
- :param "text":
1834
- :param strict_text: bool: (Default value = False)
1835
- :param image_placeholder str: (Default value = "<!-- image -->")
1836
- the placeholder to include to position images in the markdown.
1837
- :param indent: int (default=4): indent of the nested lists
1875
+ (exclusive). (Default value = maxint).
1876
+ :type to_element: int = sys.maxsize
1877
+ :param labels: The set of document labels to include in the export.
1878
+ :type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS
1879
+ :param strict_text: bool: Whether to only include the text content
1880
+ of the document. (Default value = False).
1881
+ :type strict_text: bool = False
1882
+ :param image_placeholder: The placeholder to include to position
1883
+ images in the markdown. (Default value = "\<!-- image --\>").
1884
+ :type image_placeholder: str = "<!-- image -->"
1885
+ :param image_mode: The mode to use for including images in the
1886
+ markdown. (Default value = ImageRefMode.PLACEHOLDER).
1887
+ :type image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
1888
+ :param indent: The indent in spaces of the nested lists.
1889
+ (Default value = 4).
1890
+ :type indent: int = 4
1838
1891
  :returns: The exported Markdown representation.
1839
1892
  :rtype: str
1840
1893
  """
@@ -2037,7 +2090,7 @@ class DoclingDocument(BaseModel):
2037
2090
  if artifacts_dir is None:
2038
2091
  # Remove the extension and add '_pictures'
2039
2092
  artifacts_dir = filename.with_suffix("")
2040
- artifacts_dir = artifacts_dir.with_name(artifacts_dir.stem + "_artifacts")
2093
+ artifacts_dir = artifacts_dir.with_name(artifacts_dir.name + "_artifacts")
2041
2094
  if artifacts_dir.is_absolute():
2042
2095
  reference_path = None
2043
2096
  else:
@@ -0,0 +1,19 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for io."""
7
+
8
+ from io import BytesIO
9
+
10
+ from pydantic import BaseModel, ConfigDict
11
+
12
+
13
+ class DocumentStream(BaseModel):
14
+ """Wrapper class for a bytes stream with a filename."""
15
+
16
+ model_config = ConfigDict(arbitrary_types_allowed=True)
17
+
18
+ name: str
19
+ stream: BytesIO
@@ -0,0 +1,210 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """File-related utilities."""
7
+
8
+ import importlib
9
+ import tempfile
10
+ from io import BytesIO
11
+ from pathlib import Path
12
+ from typing import Dict, Optional, Union
13
+
14
+ import requests
15
+ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
16
+ from typing_extensions import deprecated
17
+
18
+ from docling_core.types.io import DocumentStream
19
+
20
+
21
+ def resolve_remote_filename(
22
+ http_url: AnyHttpUrl,
23
+ response_headers: Dict[str, str],
24
+ fallback_filename="file",
25
+ ) -> str:
26
+ """Resolves the filename from a remote url and its response headers.
27
+
28
+ Args:
29
+ source AnyHttpUrl: The source http url.
30
+ response_headers Dict: Headers received while fetching the remote file.
31
+ fallback_filename str: Filename to use in case none can be determined.
32
+
33
+ Returns:
34
+ str: The actual filename of the remote url.
35
+ """
36
+ fname = None
37
+ # try to get filename from response header
38
+ if cont_disp := response_headers.get("Content-Disposition"):
39
+ for par in cont_disp.strip().split(";"):
40
+ # currently only handling directive "filename" (not "*filename")
41
+ if (split := par.split("=")) and split[0].strip() == "filename":
42
+ fname = "=".join(split[1:]).strip().strip("'\"") or None
43
+ break
44
+ # otherwise, use name from URL:
45
+ if fname is None:
46
+ fname = Path(http_url.path or "").name or fallback_filename
47
+
48
+ return fname
49
+
50
+
51
+ def resolve_source_to_stream(
52
+ source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
53
+ ) -> DocumentStream:
54
+ """Resolves the source (URL, path) of a file to a binary stream.
55
+
56
+ Args:
57
+ source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
58
+ headers (Dict | None): Optional set of headers to use for fetching
59
+ the remote URL.
60
+
61
+ Raises:
62
+ ValueError: If source is of unexpected type.
63
+
64
+ Returns:
65
+ DocumentStream: The resolved file loaded as a stream.
66
+ """
67
+ try:
68
+ http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
69
+
70
+ # make all header keys lower case
71
+ _headers = headers or {}
72
+ req_headers = {k.lower(): v for k, v in _headers.items()}
73
+ # add user-agent is not set
74
+ if "user-agent" not in req_headers:
75
+ agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
76
+ req_headers["user-agent"] = agent_name
77
+
78
+ # fetch the page
79
+ res = requests.get(http_url, stream=True, headers=req_headers)
80
+ res.raise_for_status()
81
+ fname = resolve_remote_filename(http_url=http_url, response_headers=res.headers)
82
+
83
+ stream = BytesIO(res.content)
84
+ doc_stream = DocumentStream(name=fname, stream=stream)
85
+ except ValidationError:
86
+ try:
87
+ local_path = TypeAdapter(Path).validate_python(source)
88
+ stream = BytesIO(local_path.read_bytes())
89
+ doc_stream = DocumentStream(name=local_path.name, stream=stream)
90
+ except ValidationError:
91
+ raise ValueError(f"Unexpected source type encountered: {type(source)}")
92
+ return doc_stream
93
+
94
+
95
+ def _resolve_source_to_path(
96
+ source: Union[Path, AnyHttpUrl, str],
97
+ headers: Optional[Dict[str, str]] = None,
98
+ workdir: Optional[Path] = None,
99
+ ) -> Path:
100
+ doc_stream = resolve_source_to_stream(source=source, headers=headers)
101
+
102
+ # use a temporary directory if not specified
103
+ if workdir is None:
104
+ workdir = Path(tempfile.mkdtemp())
105
+
106
+ # create the parent workdir if it doesn't exist
107
+ workdir.mkdir(exist_ok=True, parents=True)
108
+
109
+ # save result to a local file
110
+ local_path = workdir / doc_stream.name
111
+ with local_path.open("wb") as f:
112
+ f.write(doc_stream.stream.read())
113
+
114
+ return local_path
115
+
116
+
117
+ def resolve_source_to_path(
118
+ source: Union[Path, AnyHttpUrl, str],
119
+ headers: Optional[Dict[str, str]] = None,
120
+ workdir: Optional[Path] = None,
121
+ ) -> Path:
122
+ """Resolves the source (URL, path) of a file to a local file path.
123
+
124
+ If a URL is provided, the content is first downloaded to a local file, located in
125
+ the provided workdir or in a temporary directory if no workdir provided.
126
+
127
+ Args:
128
+ source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
129
+ headers (Dict | None): Optional set of headers to use for fetching
130
+ the remote URL.
131
+ workdir (Path | None): If set, the work directory where the file will
132
+ be downloaded, otherwise a temp dir will be used.
133
+
134
+ Raises:
135
+ ValueError: If source is of unexpected type.
136
+
137
+ Returns:
138
+ Path: The local file path.
139
+ """
140
+ return _resolve_source_to_path(
141
+ source=source,
142
+ headers=headers,
143
+ workdir=workdir,
144
+ )
145
+
146
+
147
+ @deprecated("Use `resolve_source_to_path()` or `resolve_source_to_stream()` instead")
148
+ def resolve_file_source(
149
+ source: Union[Path, AnyHttpUrl, str],
150
+ headers: Optional[Dict[str, str]] = None,
151
+ ) -> Path:
152
+ """Resolves the source (URL, path) of a file to a local file path.
153
+
154
+ If a URL is provided, the content is first downloaded to a temporary local file.
155
+
156
+ Args:
157
+ source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
158
+ headers (Dict | None): Optional set of headers to use for fetching
159
+ the remote URL.
160
+
161
+ Raises:
162
+ ValueError: If source is of unexpected type.
163
+
164
+ Returns:
165
+ Path: The local file path.
166
+ """
167
+ return _resolve_source_to_path(
168
+ source=source,
169
+ headers=headers,
170
+ )
171
+
172
+
173
+ def relative_path(src: Path, target: Path) -> Path:
174
+ """Compute the relative path from `src` to `target`.
175
+
176
+ Args:
177
+ src (str | Path): The source directory or file path (must be absolute).
178
+ target (str | Path): The target directory or file path (must be absolute).
179
+
180
+ Returns:
181
+ Path: The relative path from `src` to `target`.
182
+
183
+ Raises:
184
+ ValueError: If either `src` or `target` is not an absolute path.
185
+ """
186
+ src = Path(src).resolve()
187
+ target = Path(target).resolve()
188
+
189
+ # Ensure both paths are absolute
190
+ if not src.is_absolute():
191
+ raise ValueError(f"The source path must be absolute: {src}")
192
+ if not target.is_absolute():
193
+ raise ValueError(f"The target path must be absolute: {target}")
194
+
195
+ # Find the common ancestor
196
+ common_parts = []
197
+ for src_part, target_part in zip(src.parts, target.parts):
198
+ if src_part == target_part:
199
+ common_parts.append(src_part)
200
+ else:
201
+ break
202
+
203
+ # Determine the path to go up from src to the common ancestor
204
+ up_segments = [".."] * (len(src.parts) - len(common_parts))
205
+
206
+ # Add the path from the common ancestor to the target
207
+ down_segments = target.parts[len(common_parts) :]
208
+
209
+ # Combine and return the result
210
+ return Path(*up_segments, *down_segments)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.5.0"
3
+ version = "2.6.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -54,6 +54,7 @@ tabulate = "^0.9.0"
54
54
  pandas = "^2.1.4"
55
55
  pillow = "^10.3.0"
56
56
  pyyaml = ">=5.1,<7.0.0"
57
+ typing-extensions = "^4.12.2"
57
58
 
58
59
  [tool.poetry.group.dev.dependencies]
59
60
  black = "^24.4.2"
@@ -1,107 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
- """File-related utilities."""
7
-
8
- import importlib
9
- import tempfile
10
- from pathlib import Path
11
- from typing import Dict, Optional, Union
12
-
13
- import requests
14
- from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
15
-
16
-
17
- def resolve_file_source(
18
- source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
19
- ) -> Path:
20
- """Resolves the source (URL, path) of a file to a local file path.
21
-
22
- If a URL is provided, the content is first downloaded to a temporary local file.
23
-
24
- Args:
25
- source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
26
-
27
- Raises:
28
- ValueError: If source is of unexpected type.
29
-
30
- Returns:
31
- Path: The local file path.
32
- """
33
- try:
34
- http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
35
-
36
- # make all header keys lower case
37
- _headers = headers or {}
38
- req_headers = {k.lower(): v for k, v in _headers.items()}
39
- # add user-agent is not set
40
- if "user-agent" not in req_headers:
41
- agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
42
- req_headers["user-agent"] = agent_name
43
-
44
- # fetch the page
45
- res = requests.get(http_url, stream=True, headers=req_headers)
46
- res.raise_for_status()
47
- fname = None
48
- # try to get filename from response header
49
- if cont_disp := res.headers.get("Content-Disposition"):
50
- for par in cont_disp.strip().split(";"):
51
- # currently only handling directive "filename" (not "*filename")
52
- if (split := par.split("=")) and split[0].strip() == "filename":
53
- fname = "=".join(split[1:]).strip().strip("'\"") or None
54
- break
55
- # otherwise, use name from URL:
56
- if fname is None:
57
- fname = Path(http_url.path or "").name or "file"
58
- local_path = Path(tempfile.mkdtemp()) / fname
59
- with open(local_path, "wb") as f:
60
- for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
61
- f.write(chunk)
62
- except ValidationError:
63
- try:
64
- local_path = TypeAdapter(Path).validate_python(source)
65
- except ValidationError:
66
- raise ValueError(f"Unexpected source type encountered: {type(source)}")
67
- return local_path
68
-
69
-
70
- def relative_path(src: Path, target: Path) -> Path:
71
- """Compute the relative path from `src` to `target`.
72
-
73
- Args:
74
- src (str | Path): The source directory or file path (must be absolute).
75
- target (str | Path): The target directory or file path (must be absolute).
76
-
77
- Returns:
78
- Path: The relative path from `src` to `target`.
79
-
80
- Raises:
81
- ValueError: If either `src` or `target` is not an absolute path.
82
- """
83
- src = Path(src).resolve()
84
- target = Path(target).resolve()
85
-
86
- # Ensure both paths are absolute
87
- if not src.is_absolute():
88
- raise ValueError(f"The source path must be absolute: {src}")
89
- if not target.is_absolute():
90
- raise ValueError(f"The target path must be absolute: {target}")
91
-
92
- # Find the common ancestor
93
- common_parts = []
94
- for src_part, target_part in zip(src.parts, target.parts):
95
- if src_part == target_part:
96
- common_parts.append(src_part)
97
- else:
98
- break
99
-
100
- # Determine the path to go up from src to the common ancestor
101
- up_segments = [".."] * (len(src.parts) - len(common_parts))
102
-
103
- # Add the path from the common ancestor to the target
104
- down_segments = target.parts[len(common_parts) :]
105
-
106
- # Combine and return the result
107
- return Path(*up_segments, *down_segments)
File without changes
File without changes