docling-core 2.5.0__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.5.0 → docling_core-2.6.0}/PKG-INFO +2 -1
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/document.py +97 -44
- docling_core-2.6.0/docling_core/types/io/__init__.py +19 -0
- docling_core-2.6.0/docling_core/utils/file.py +210 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/pyproject.toml +2 -1
- docling_core-2.5.0/docling_core/utils/file.py +0 -107
- {docling_core-2.5.0 → docling_core-2.6.0}/LICENSE +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/README.md +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/py.typed +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/package.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/base.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.5.0 → docling_core-2.6.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.5.0
|
|
3
|
+
Version: 2.6.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,7 @@ Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
|
32
32
|
Requires-Dist: pydantic (>=2.6.0,<2.10)
|
|
33
33
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
34
34
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
35
|
+
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
35
36
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
36
37
|
Description-Content-Type: text/markdown
|
|
37
38
|
|
|
@@ -10,6 +10,7 @@ import re
|
|
|
10
10
|
import sys
|
|
11
11
|
import textwrap
|
|
12
12
|
import typing
|
|
13
|
+
import warnings
|
|
13
14
|
from io import BytesIO
|
|
14
15
|
from pathlib import Path
|
|
15
16
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
@@ -809,14 +810,8 @@ class PictureItem(FloatingItem):
|
|
|
809
810
|
):
|
|
810
811
|
return default_response
|
|
811
812
|
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
) or isinstance(self.image.uri, Path):
|
|
815
|
-
text = f"\n![Image]({str(self.image.uri)})\n"
|
|
816
|
-
return text
|
|
817
|
-
|
|
818
|
-
else:
|
|
819
|
-
return default_response
|
|
813
|
+
text = f"\n![Image]({str(self.image.uri)})\n"
|
|
814
|
+
return text
|
|
820
815
|
|
|
821
816
|
else:
|
|
822
817
|
return default_response
|
|
@@ -869,14 +864,8 @@ class PictureItem(FloatingItem):
|
|
|
869
864
|
):
|
|
870
865
|
return default_response
|
|
871
866
|
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
) or isinstance(self.image.uri, Path):
|
|
875
|
-
img_text = f'<img src="{str(self.image.uri)}">'
|
|
876
|
-
return f"<figure>{caption_text}{img_text}</figure>"
|
|
877
|
-
|
|
878
|
-
else:
|
|
879
|
-
return default_response
|
|
867
|
+
img_text = f'<img src="{str(self.image.uri)}">'
|
|
868
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
880
869
|
|
|
881
870
|
else:
|
|
882
871
|
return default_response
|
|
@@ -1008,14 +997,23 @@ class TableItem(FloatingItem):
|
|
|
1008
997
|
)
|
|
1009
998
|
return md_table
|
|
1010
999
|
|
|
1011
|
-
def export_to_html(
|
|
1000
|
+
def export_to_html(
|
|
1001
|
+
self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
|
|
1002
|
+
) -> str:
|
|
1012
1003
|
"""Export the table as html."""
|
|
1004
|
+
if doc is None:
|
|
1005
|
+
warnings.warn(
|
|
1006
|
+
"The `doc` argument will be mandatory in a future version. "
|
|
1007
|
+
"It must be provided to include a caption.",
|
|
1008
|
+
DeprecationWarning,
|
|
1009
|
+
)
|
|
1010
|
+
|
|
1013
1011
|
body = ""
|
|
1014
1012
|
nrows = self.data.num_rows
|
|
1015
1013
|
ncols = self.data.num_cols
|
|
1016
1014
|
|
|
1017
1015
|
text = ""
|
|
1018
|
-
if add_caption and len(self.captions):
|
|
1016
|
+
if doc is not None and add_caption and len(self.captions):
|
|
1019
1017
|
text = self.caption_text(doc)
|
|
1020
1018
|
|
|
1021
1019
|
if len(self.data.table_cells) == 0:
|
|
@@ -1201,19 +1199,58 @@ class DoclingDocument(BaseModel):
|
|
|
1201
1199
|
"""DoclingDocument."""
|
|
1202
1200
|
|
|
1203
1201
|
_HTML_DEFAULT_HEAD: str = r"""<head>
|
|
1202
|
+
<link rel="icon" type="image/png"
|
|
1203
|
+
href="https://ds4sd.github.io/docling/assets/logo.png"/>
|
|
1204
1204
|
<meta charset="UTF-8">
|
|
1205
|
+
<title>
|
|
1206
|
+
Powered by Docling
|
|
1207
|
+
</title>
|
|
1205
1208
|
<style>
|
|
1209
|
+
html {
|
|
1210
|
+
background-color: LightGray;
|
|
1211
|
+
}
|
|
1212
|
+
body {
|
|
1213
|
+
margin: 0 auto;
|
|
1214
|
+
width:800px;
|
|
1215
|
+
padding: 30px;
|
|
1216
|
+
background-color: White;
|
|
1217
|
+
font-family: Arial, sans-serif;
|
|
1218
|
+
box-shadow: 10px 10px 10px grey;
|
|
1219
|
+
}
|
|
1220
|
+
figure{
|
|
1221
|
+
display: block;
|
|
1222
|
+
width: 100%;
|
|
1223
|
+
margin: 0px;
|
|
1224
|
+
margin-top: 10px;
|
|
1225
|
+
margin-bottom: 10px;
|
|
1226
|
+
}
|
|
1227
|
+
img {
|
|
1228
|
+
display: block;
|
|
1229
|
+
margin: auto;
|
|
1230
|
+
margin-top: 10px;
|
|
1231
|
+
margin-bottom: 10px;
|
|
1232
|
+
max-width: 640px;
|
|
1233
|
+
max-height: 640px;
|
|
1234
|
+
}
|
|
1206
1235
|
table {
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
border-
|
|
1210
|
-
|
|
1211
|
-
|
|
1236
|
+
min-width:500px;
|
|
1237
|
+
background-color: White;
|
|
1238
|
+
border-collapse: collapse;
|
|
1239
|
+
cell-padding: 5px;
|
|
1240
|
+
margin: auto;
|
|
1241
|
+
margin-top: 10px;
|
|
1242
|
+
margin-bottom: 10px;
|
|
1212
1243
|
}
|
|
1213
1244
|
th, td {
|
|
1214
1245
|
border: 1px solid black;
|
|
1215
|
-
|
|
1216
|
-
|
|
1246
|
+
padding: 8px;
|
|
1247
|
+
}
|
|
1248
|
+
th {
|
|
1249
|
+
font-weight: bold;
|
|
1250
|
+
}
|
|
1251
|
+
table tr:nth-child(even) td{
|
|
1252
|
+
background-color: LightGray;
|
|
1253
|
+
}
|
|
1217
1254
|
</style>
|
|
1218
1255
|
</head>"""
|
|
1219
1256
|
|
|
@@ -1723,6 +1760,20 @@ class DoclingDocument(BaseModel):
|
|
|
1723
1760
|
with open(filename, "w") as fw:
|
|
1724
1761
|
json.dump(out, fw, indent=indent)
|
|
1725
1762
|
|
|
1763
|
+
@classmethod
|
|
1764
|
+
def load_from_json(cls, filename: Path) -> "DoclingDocument":
|
|
1765
|
+
"""load_from_json.
|
|
1766
|
+
|
|
1767
|
+
:param filename: The filename to load a saved DoclingDocument from a .json.
|
|
1768
|
+
:type filename: Path
|
|
1769
|
+
|
|
1770
|
+
:returns: The loaded DoclingDocument.
|
|
1771
|
+
:rtype: DoclingDocument
|
|
1772
|
+
|
|
1773
|
+
"""
|
|
1774
|
+
with open(filename, "r") as f:
|
|
1775
|
+
return cls.model_validate(json.loads(f.read()))
|
|
1776
|
+
|
|
1726
1777
|
def save_as_yaml(
|
|
1727
1778
|
self,
|
|
1728
1779
|
filename: Path,
|
|
@@ -1815,26 +1866,28 @@ class DoclingDocument(BaseModel):
|
|
|
1815
1866
|
from_element and to_element; defaulting to the whole document.
|
|
1816
1867
|
|
|
1817
1868
|
:param delim: Delimiter to use when concatenating the various
|
|
1818
|
-
Markdown parts.
|
|
1819
|
-
:type delim: str
|
|
1869
|
+
Markdown parts. (Default value = "\n").
|
|
1870
|
+
:type delim: str = "\n"
|
|
1820
1871
|
:param from_element: Body slicing start index (inclusive).
|
|
1821
|
-
|
|
1822
|
-
:type from_element: int
|
|
1872
|
+
(Default value = 0).
|
|
1873
|
+
:type from_element: int = 0
|
|
1823
1874
|
:param to_element: Body slicing stop index
|
|
1824
|
-
(exclusive).
|
|
1825
|
-
:type to_element: int
|
|
1826
|
-
:param
|
|
1827
|
-
:
|
|
1828
|
-
:param
|
|
1829
|
-
|
|
1830
|
-
:
|
|
1831
|
-
:param
|
|
1832
|
-
|
|
1833
|
-
:
|
|
1834
|
-
:param
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
:param indent:
|
|
1875
|
+
(exclusive). (Default value = maxint).
|
|
1876
|
+
:type to_element: int = sys.maxsize
|
|
1877
|
+
:param labels: The set of document labels to include in the export.
|
|
1878
|
+
:type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS
|
|
1879
|
+
:param strict_text: bool: Whether to only include the text content
|
|
1880
|
+
of the document. (Default value = False).
|
|
1881
|
+
:type strict_text: bool = False
|
|
1882
|
+
:param image_placeholder: The placeholder to include to position
|
|
1883
|
+
images in the markdown. (Default value = "\<!-- image --\>").
|
|
1884
|
+
:type image_placeholder: str = "<!-- image -->"
|
|
1885
|
+
:param image_mode: The mode to use for including images in the
|
|
1886
|
+
markdown. (Default value = ImageRefMode.PLACEHOLDER).
|
|
1887
|
+
:type image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
|
|
1888
|
+
:param indent: The indent in spaces of the nested lists.
|
|
1889
|
+
(Default value = 4).
|
|
1890
|
+
:type indent: int = 4
|
|
1838
1891
|
:returns: The exported Markdown representation.
|
|
1839
1892
|
:rtype: str
|
|
1840
1893
|
"""
|
|
@@ -2037,7 +2090,7 @@ class DoclingDocument(BaseModel):
|
|
|
2037
2090
|
if artifacts_dir is None:
|
|
2038
2091
|
# Remove the extension and add '_pictures'
|
|
2039
2092
|
artifacts_dir = filename.with_suffix("")
|
|
2040
|
-
artifacts_dir = artifacts_dir.with_name(artifacts_dir.
|
|
2093
|
+
artifacts_dir = artifacts_dir.with_name(artifacts_dir.name + "_artifacts")
|
|
2041
2094
|
if artifacts_dir.is_absolute():
|
|
2042
2095
|
reference_path = None
|
|
2043
2096
|
else:
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Models for io."""
|
|
7
|
+
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DocumentStream(BaseModel):
|
|
14
|
+
"""Wrapper class for a bytes stream with a filename."""
|
|
15
|
+
|
|
16
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
17
|
+
|
|
18
|
+
name: str
|
|
19
|
+
stream: BytesIO
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""File-related utilities."""
|
|
7
|
+
|
|
8
|
+
import importlib
|
|
9
|
+
import tempfile
|
|
10
|
+
from io import BytesIO
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Dict, Optional, Union
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
16
|
+
from typing_extensions import deprecated
|
|
17
|
+
|
|
18
|
+
from docling_core.types.io import DocumentStream
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def resolve_remote_filename(
|
|
22
|
+
http_url: AnyHttpUrl,
|
|
23
|
+
response_headers: Dict[str, str],
|
|
24
|
+
fallback_filename="file",
|
|
25
|
+
) -> str:
|
|
26
|
+
"""Resolves the filename from a remote url and its response headers.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
source AnyHttpUrl: The source http url.
|
|
30
|
+
response_headers Dict: Headers received while fetching the remote file.
|
|
31
|
+
fallback_filename str: Filename to use in case none can be determined.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
str: The actual filename of the remote url.
|
|
35
|
+
"""
|
|
36
|
+
fname = None
|
|
37
|
+
# try to get filename from response header
|
|
38
|
+
if cont_disp := response_headers.get("Content-Disposition"):
|
|
39
|
+
for par in cont_disp.strip().split(";"):
|
|
40
|
+
# currently only handling directive "filename" (not "*filename")
|
|
41
|
+
if (split := par.split("=")) and split[0].strip() == "filename":
|
|
42
|
+
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
|
43
|
+
break
|
|
44
|
+
# otherwise, use name from URL:
|
|
45
|
+
if fname is None:
|
|
46
|
+
fname = Path(http_url.path or "").name or fallback_filename
|
|
47
|
+
|
|
48
|
+
return fname
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def resolve_source_to_stream(
|
|
52
|
+
source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
|
|
53
|
+
) -> DocumentStream:
|
|
54
|
+
"""Resolves the source (URL, path) of a file to a binary stream.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
|
|
58
|
+
headers (Dict | None): Optional set of headers to use for fetching
|
|
59
|
+
the remote URL.
|
|
60
|
+
|
|
61
|
+
Raises:
|
|
62
|
+
ValueError: If source is of unexpected type.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
DocumentStream: The resolved file loaded as a stream.
|
|
66
|
+
"""
|
|
67
|
+
try:
|
|
68
|
+
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
|
69
|
+
|
|
70
|
+
# make all header keys lower case
|
|
71
|
+
_headers = headers or {}
|
|
72
|
+
req_headers = {k.lower(): v for k, v in _headers.items()}
|
|
73
|
+
# add user-agent is not set
|
|
74
|
+
if "user-agent" not in req_headers:
|
|
75
|
+
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
|
|
76
|
+
req_headers["user-agent"] = agent_name
|
|
77
|
+
|
|
78
|
+
# fetch the page
|
|
79
|
+
res = requests.get(http_url, stream=True, headers=req_headers)
|
|
80
|
+
res.raise_for_status()
|
|
81
|
+
fname = resolve_remote_filename(http_url=http_url, response_headers=res.headers)
|
|
82
|
+
|
|
83
|
+
stream = BytesIO(res.content)
|
|
84
|
+
doc_stream = DocumentStream(name=fname, stream=stream)
|
|
85
|
+
except ValidationError:
|
|
86
|
+
try:
|
|
87
|
+
local_path = TypeAdapter(Path).validate_python(source)
|
|
88
|
+
stream = BytesIO(local_path.read_bytes())
|
|
89
|
+
doc_stream = DocumentStream(name=local_path.name, stream=stream)
|
|
90
|
+
except ValidationError:
|
|
91
|
+
raise ValueError(f"Unexpected source type encountered: {type(source)}")
|
|
92
|
+
return doc_stream
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _resolve_source_to_path(
|
|
96
|
+
source: Union[Path, AnyHttpUrl, str],
|
|
97
|
+
headers: Optional[Dict[str, str]] = None,
|
|
98
|
+
workdir: Optional[Path] = None,
|
|
99
|
+
) -> Path:
|
|
100
|
+
doc_stream = resolve_source_to_stream(source=source, headers=headers)
|
|
101
|
+
|
|
102
|
+
# use a temporary directory if not specified
|
|
103
|
+
if workdir is None:
|
|
104
|
+
workdir = Path(tempfile.mkdtemp())
|
|
105
|
+
|
|
106
|
+
# create the parent workdir if it doesn't exist
|
|
107
|
+
workdir.mkdir(exist_ok=True, parents=True)
|
|
108
|
+
|
|
109
|
+
# save result to a local file
|
|
110
|
+
local_path = workdir / doc_stream.name
|
|
111
|
+
with local_path.open("wb") as f:
|
|
112
|
+
f.write(doc_stream.stream.read())
|
|
113
|
+
|
|
114
|
+
return local_path
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def resolve_source_to_path(
|
|
118
|
+
source: Union[Path, AnyHttpUrl, str],
|
|
119
|
+
headers: Optional[Dict[str, str]] = None,
|
|
120
|
+
workdir: Optional[Path] = None,
|
|
121
|
+
) -> Path:
|
|
122
|
+
"""Resolves the source (URL, path) of a file to a local file path.
|
|
123
|
+
|
|
124
|
+
If a URL is provided, the content is first downloaded to a local file, located in
|
|
125
|
+
the provided workdir or in a temporary directory if no workdir provided.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
|
|
129
|
+
headers (Dict | None): Optional set of headers to use for fetching
|
|
130
|
+
the remote URL.
|
|
131
|
+
workdir (Path | None): If set, the work directory where the file will
|
|
132
|
+
be downloaded, otherwise a temp dir will be used.
|
|
133
|
+
|
|
134
|
+
Raises:
|
|
135
|
+
ValueError: If source is of unexpected type.
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
Path: The local file path.
|
|
139
|
+
"""
|
|
140
|
+
return _resolve_source_to_path(
|
|
141
|
+
source=source,
|
|
142
|
+
headers=headers,
|
|
143
|
+
workdir=workdir,
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@deprecated("Use `resolve_source_to_path()` or `resolve_source_to_stream()` instead")
|
|
148
|
+
def resolve_file_source(
|
|
149
|
+
source: Union[Path, AnyHttpUrl, str],
|
|
150
|
+
headers: Optional[Dict[str, str]] = None,
|
|
151
|
+
) -> Path:
|
|
152
|
+
"""Resolves the source (URL, path) of a file to a local file path.
|
|
153
|
+
|
|
154
|
+
If a URL is provided, the content is first downloaded to a temporary local file.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
|
|
158
|
+
headers (Dict | None): Optional set of headers to use for fetching
|
|
159
|
+
the remote URL.
|
|
160
|
+
|
|
161
|
+
Raises:
|
|
162
|
+
ValueError: If source is of unexpected type.
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
Path: The local file path.
|
|
166
|
+
"""
|
|
167
|
+
return _resolve_source_to_path(
|
|
168
|
+
source=source,
|
|
169
|
+
headers=headers,
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def relative_path(src: Path, target: Path) -> Path:
|
|
174
|
+
"""Compute the relative path from `src` to `target`.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
src (str | Path): The source directory or file path (must be absolute).
|
|
178
|
+
target (str | Path): The target directory or file path (must be absolute).
|
|
179
|
+
|
|
180
|
+
Returns:
|
|
181
|
+
Path: The relative path from `src` to `target`.
|
|
182
|
+
|
|
183
|
+
Raises:
|
|
184
|
+
ValueError: If either `src` or `target` is not an absolute path.
|
|
185
|
+
"""
|
|
186
|
+
src = Path(src).resolve()
|
|
187
|
+
target = Path(target).resolve()
|
|
188
|
+
|
|
189
|
+
# Ensure both paths are absolute
|
|
190
|
+
if not src.is_absolute():
|
|
191
|
+
raise ValueError(f"The source path must be absolute: {src}")
|
|
192
|
+
if not target.is_absolute():
|
|
193
|
+
raise ValueError(f"The target path must be absolute: {target}")
|
|
194
|
+
|
|
195
|
+
# Find the common ancestor
|
|
196
|
+
common_parts = []
|
|
197
|
+
for src_part, target_part in zip(src.parts, target.parts):
|
|
198
|
+
if src_part == target_part:
|
|
199
|
+
common_parts.append(src_part)
|
|
200
|
+
else:
|
|
201
|
+
break
|
|
202
|
+
|
|
203
|
+
# Determine the path to go up from src to the common ancestor
|
|
204
|
+
up_segments = [".."] * (len(src.parts) - len(common_parts))
|
|
205
|
+
|
|
206
|
+
# Add the path from the common ancestor to the target
|
|
207
|
+
down_segments = target.parts[len(common_parts) :]
|
|
208
|
+
|
|
209
|
+
# Combine and return the result
|
|
210
|
+
return Path(*up_segments, *down_segments)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.5.0"
|
|
3
|
+
version = "2.6.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -54,6 +54,7 @@ tabulate = "^0.9.0"
|
|
|
54
54
|
pandas = "^2.1.4"
|
|
55
55
|
pillow = "^10.3.0"
|
|
56
56
|
pyyaml = ">=5.1,<7.0.0"
|
|
57
|
+
typing-extensions = "^4.12.2"
|
|
57
58
|
|
|
58
59
|
[tool.poetry.group.dev.dependencies]
|
|
59
60
|
black = "^24.4.2"
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
5
|
-
|
|
6
|
-
"""File-related utilities."""
|
|
7
|
-
|
|
8
|
-
import importlib
|
|
9
|
-
import tempfile
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
from typing import Dict, Optional, Union
|
|
12
|
-
|
|
13
|
-
import requests
|
|
14
|
-
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def resolve_file_source(
|
|
18
|
-
source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
|
|
19
|
-
) -> Path:
|
|
20
|
-
"""Resolves the source (URL, path) of a file to a local file path.
|
|
21
|
-
|
|
22
|
-
If a URL is provided, the content is first downloaded to a temporary local file.
|
|
23
|
-
|
|
24
|
-
Args:
|
|
25
|
-
source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
|
|
26
|
-
|
|
27
|
-
Raises:
|
|
28
|
-
ValueError: If source is of unexpected type.
|
|
29
|
-
|
|
30
|
-
Returns:
|
|
31
|
-
Path: The local file path.
|
|
32
|
-
"""
|
|
33
|
-
try:
|
|
34
|
-
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
|
35
|
-
|
|
36
|
-
# make all header keys lower case
|
|
37
|
-
_headers = headers or {}
|
|
38
|
-
req_headers = {k.lower(): v for k, v in _headers.items()}
|
|
39
|
-
# add user-agent is not set
|
|
40
|
-
if "user-agent" not in req_headers:
|
|
41
|
-
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
|
|
42
|
-
req_headers["user-agent"] = agent_name
|
|
43
|
-
|
|
44
|
-
# fetch the page
|
|
45
|
-
res = requests.get(http_url, stream=True, headers=req_headers)
|
|
46
|
-
res.raise_for_status()
|
|
47
|
-
fname = None
|
|
48
|
-
# try to get filename from response header
|
|
49
|
-
if cont_disp := res.headers.get("Content-Disposition"):
|
|
50
|
-
for par in cont_disp.strip().split(";"):
|
|
51
|
-
# currently only handling directive "filename" (not "*filename")
|
|
52
|
-
if (split := par.split("=")) and split[0].strip() == "filename":
|
|
53
|
-
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
|
54
|
-
break
|
|
55
|
-
# otherwise, use name from URL:
|
|
56
|
-
if fname is None:
|
|
57
|
-
fname = Path(http_url.path or "").name or "file"
|
|
58
|
-
local_path = Path(tempfile.mkdtemp()) / fname
|
|
59
|
-
with open(local_path, "wb") as f:
|
|
60
|
-
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
|
61
|
-
f.write(chunk)
|
|
62
|
-
except ValidationError:
|
|
63
|
-
try:
|
|
64
|
-
local_path = TypeAdapter(Path).validate_python(source)
|
|
65
|
-
except ValidationError:
|
|
66
|
-
raise ValueError(f"Unexpected source type encountered: {type(source)}")
|
|
67
|
-
return local_path
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def relative_path(src: Path, target: Path) -> Path:
|
|
71
|
-
"""Compute the relative path from `src` to `target`.
|
|
72
|
-
|
|
73
|
-
Args:
|
|
74
|
-
src (str | Path): The source directory or file path (must be absolute).
|
|
75
|
-
target (str | Path): The target directory or file path (must be absolute).
|
|
76
|
-
|
|
77
|
-
Returns:
|
|
78
|
-
Path: The relative path from `src` to `target`.
|
|
79
|
-
|
|
80
|
-
Raises:
|
|
81
|
-
ValueError: If either `src` or `target` is not an absolute path.
|
|
82
|
-
"""
|
|
83
|
-
src = Path(src).resolve()
|
|
84
|
-
target = Path(target).resolve()
|
|
85
|
-
|
|
86
|
-
# Ensure both paths are absolute
|
|
87
|
-
if not src.is_absolute():
|
|
88
|
-
raise ValueError(f"The source path must be absolute: {src}")
|
|
89
|
-
if not target.is_absolute():
|
|
90
|
-
raise ValueError(f"The target path must be absolute: {target}")
|
|
91
|
-
|
|
92
|
-
# Find the common ancestor
|
|
93
|
-
common_parts = []
|
|
94
|
-
for src_part, target_part in zip(src.parts, target.parts):
|
|
95
|
-
if src_part == target_part:
|
|
96
|
-
common_parts.append(src_part)
|
|
97
|
-
else:
|
|
98
|
-
break
|
|
99
|
-
|
|
100
|
-
# Determine the path to go up from src to the common ancestor
|
|
101
|
-
up_segments = [".."] * (len(src.parts) - len(common_parts))
|
|
102
|
-
|
|
103
|
-
# Add the path from the common ancestor to the target
|
|
104
|
-
down_segments = target.parts[len(common_parts) :]
|
|
105
|
-
|
|
106
|
-
# Combine and return the result
|
|
107
|
-
return Path(*up_segments, *down_segments)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.5.0 → docling_core-2.6.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.5.0 → docling_core-2.6.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.5.0 → docling_core-2.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|