thds.core 1.31.20250213162956__py3-none-any.whl → 1.32.20250218201534__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.core might be problematic. Click here for more details.
- thds/core/cm.py +29 -0
- thds/core/files.py +2 -2
- thds/core/logical_root.py +36 -0
- thds/core/meta.json +2 -2
- thds/core/source/__init__.py +9 -0
- thds/core/source/_construct.py +79 -0
- thds/core/source/_download.py +84 -0
- thds/core/source/serde.py +105 -0
- thds/core/source/src.py +86 -0
- thds/core/source/tree.py +106 -0
- thds/core/source_serde.py +2 -104
- {thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/METADATA +1 -1
- {thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/RECORD +16 -9
- thds/core/source.py +0 -238
- {thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/WHEEL +0 -0
- {thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/entry_points.txt +0 -0
- {thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/top_level.txt +0 -0
thds/core/cm.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""A keep-alive wrapper for context managers. Let's say you've got a thread pool executor
|
|
2
|
+
that you've created, and you want to be able to pass it to multiple users that expect to
|
|
3
|
+
'enter' the thread pool themselves, using a `with` statement. But you don't want the
|
|
4
|
+
threads to be destroyed after the first use; you want to open the context yourself, but
|
|
5
|
+
still pass the expected context manager to the users. This is a way to do that.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import contextlib
|
|
9
|
+
import typing as ty
|
|
10
|
+
|
|
11
|
+
T = ty.TypeVar("T")


class _AlreadyEnteredContext(ty.ContextManager[T]):
    """A context manager whose enter/exit are both no-ops.

    It hands back a value that was already produced by entering some other
    context manager, so consumers may 'enter' it any number of times without
    triggering the real setup or teardown.
    """

    def __init__(self, entered_context: T):
        self.entered_context = entered_context

    def __enter__(self) -> T:
        # nothing to set up - the real context was entered elsewhere
        return self.entered_context

    def __exit__(self, exc_type, exc_value, traceback):  # type: ignore
        # nothing to tear down either; the owner of the real context does that
        pass


@contextlib.contextmanager
def keep_context(context_manager: ty.ContextManager[T]) -> ty.Iterator[ty.ContextManager[T]]:
    """Enter `context_manager` once and yield a reusable stand-in for it.

    The stand-in can be entered (and 'exited') repeatedly by consumers that
    expect a `with` statement, without closing the underlying resource; the
    resource is released only when this generator's own context ends.
    """
    with context_manager as entered_context:
        yield _AlreadyEnteredContext(entered_context)
|
thds/core/files.py
CHANGED
|
@@ -19,7 +19,7 @@ FILE_SCHEME = "file://"
|
|
|
19
19
|
logger = getLogger(__name__)
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
def set_read_only(fpath: StrOrPath):
|
|
22
|
+
def set_read_only(fpath: StrOrPath) -> None:
|
|
23
23
|
# thank you https://stackoverflow.com/a/51262451
|
|
24
24
|
logger.debug("Setting '%s' to read-only", fpath)
|
|
25
25
|
perms = stat.S_IMODE(os.lstat(fpath).st_mode)
|
|
@@ -87,7 +87,7 @@ def set_file_limit(n: int):
|
|
|
87
87
|
assert resource.getrlimit(resource.RLIMIT_NOFILE) == (n, n)
|
|
88
88
|
|
|
89
89
|
|
|
90
|
-
def bump_limits():
|
|
90
|
+
def bump_limits() -> None:
|
|
91
91
|
"""It was common to have to do this manually on our macs. Now that is no longer required."""
|
|
92
92
|
set_file_limit(OPEN_FILES_LIMIT())
|
|
93
93
|
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import typing as ty
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def find_common_prefix(uris: ty.Iterable[str]) -> ty.List[str]:
    """Return the leading '/'-separated components shared by all URIs.

    Returns an empty list for empty input. Raises ValueError if the URIs
    share no leading component at all.
    """
    # Materialize up front: callers (e.g. SourceTree) pass one-shot iterators
    # such as map objects, and we want a readable repr in the error message
    # rather than an exhausted-iterator repr.
    all_uris = list(uris)
    uri_parts_list = [uri.split("/") for uri in all_uris]
    if not uri_parts_list:
        return []

    reference = uri_parts_list[0]

    for i, part in enumerate(reference):
        for uri_parts in uri_parts_list[1:]:
            if i >= len(uri_parts) or uri_parts[i] != part:
                if i == 0:
                    raise ValueError(f"Paths have no common prefix: {all_uris}")
                return reference[:i]
    return reference  # the whole first URI must be the common prefix


def find(paths: ty.Iterable[str], higher_logical_root: str = "") -> str:
    """Find the logical root (a '/'-joined string) of a set of paths/URIs.

    Without `higher_logical_root`, this is simply the lowest common prefix.
    With it, we locate that component sequence inside the common prefix and
    return everything up to and including the first matched component - i.e.
    the top-level directory corresponding to `higher_logical_root`, which may
    itself contain multiple components that all needed to match.

    Raises ValueError if `higher_logical_root` does not appear in the common
    prefix (or if the paths have no common prefix at all).
    """
    common = find_common_prefix(paths)
    if not higher_logical_root:
        return "/".join(common)  # lowest common root

    root_parts = higher_logical_root.split("/")

    # Look for the sequence of parts within the common prefix.
    for i in range(len(common) - len(root_parts) + 1):
        if common[i : i + len(root_parts)] == root_parts:
            return "/".join(common[: i + 1])

    raise ValueError(f"Higher root '{higher_logical_root}' not found")
|
thds/core/meta.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
|
-
"git_commit": "
|
|
2
|
+
"git_commit": "6aa6700b782d76b65b65256eaa49bd70e8217015",
|
|
3
3
|
"git_branch": "main",
|
|
4
4
|
"git_is_clean": true,
|
|
5
|
-
"pyproject_version": "1.
|
|
5
|
+
"pyproject_version": "1.32.20250218201534",
|
|
6
6
|
"thds_user": "runner",
|
|
7
7
|
"misc": {}
|
|
8
8
|
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Wrap openable, read-only data that is either locally-present or downloadable,
|
|
2
|
+
|
|
3
|
+
yet will not be downloaded (if non-local) until it is actually opened or unwrapped.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from . import serde, tree # noqa: F401
|
|
7
|
+
from ._construct import from_file, from_uri, path_from_uri, register_from_uri_handler # noqa: F401
|
|
8
|
+
from ._download import Downloader, register_download_handler # noqa: F401
|
|
9
|
+
from .src import Source # noqa: F401
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import typing as ty
|
|
2
|
+
from functools import partial
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ..files import is_file_uri, path_from_uri, to_uri
|
|
6
|
+
from ..hashing import Hash
|
|
7
|
+
from ..types import StrOrPath
|
|
8
|
+
from . import _download
|
|
9
|
+
from .src import Source
|
|
10
|
+
|
|
11
|
+
# Creation from local Files or from remote URIs
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def from_file(filename: StrOrPath, hash: ty.Optional[Hash] = None, uri: str = "") -> Source:
    """Create a read-only Source from a local file that already exists.

    If `uri` is passed, the local file is still read and hashed, but the
    resulting Source carries the explicitly-provided URI. NO UPLOAD IS
    PERFORMED - it is the caller's responsibility to ensure the file has
    actually been uploaded to that URI.

    Raises FileNotFoundError if the local file does not exist, and
    SourceHashMismatchError if `hash` is given and does not match the file.
    """
    if isinstance(filename, str):
        path = path_from_uri(filename)
    else:
        path = filename
    assert isinstance(path, Path)
    if not path.exists():
        raise FileNotFoundError(path)

    verified_hash = _download._check_hash(hash, path)
    if uri:
        src = from_uri(uri, verified_hash)
    else:
        src = Source(to_uri(path), verified_hash)
    src._set_cached_path(path)  # internally, it's okay to hack around immutability.
    return src
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class FromUri(ty.Protocol):
    def __call__(self, hash: ty.Optional[Hash]) -> Source:
        """A closure over a URI that builds a Source for that URI.

        Implementations may use the Hash to fail fast when the resulting
        Source could never match the expectation, but doing so is optional:
        the hash is embedded in the Source either way and (when non-nil) is
        validated at the time the source data is actually accessed.
        """


class FromUriHandler(ty.Protocol):
    def __call__(self, uri: str) -> ty.Optional[FromUri]:
        """Return a FromUri closure for this URI if this handler knows how to
        handle it; return None otherwise.
        """
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def register_from_uri_handler(key: str, handler: FromUriHandler) -> None:
    """If a library wants to customize how Sources are created from URIs that it handles,
    it can register a handler here.
    """
    # key is not currently used for anything other than avoiding
    # having duplicates registered for whatever reason.
    _FROM_URI_HANDLERS[key] = handler


_FROM_URI_HANDLERS: ty.Dict[str, FromUriHandler] = dict()
# local files are handled out of the box: the partial closes over the local
# path, and calling it with a hash defers to from_file.
register_from_uri_handler(
    "local_file", lambda uri: partial(from_file, path_from_uri(uri)) if is_file_uri(uri) else None
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def from_uri(uri: str, hash: ty.Optional[Hash] = None) -> Source:
    """Create a read-only Source from a URI.

    The data should already exist at this remote URI, although Source itself
    can make no guarantee that it always represents real data - only that any
    data it does represent is read-only.

    A URI-handling library may benefit from registering a more specific
    implementation of this function, if it can determine a Hash for the blob
    behind the URI without downloading it.
    """
    for handler in _FROM_URI_HANDLERS.values():
        from_uri_ = handler(uri)
        if from_uri_ is not None:
            return from_uri_(hash)
    return Source(uri=uri, hash=hash)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Wrap openable, read-only data that is either locally-present or downloadable,
|
|
2
|
+
|
|
3
|
+
yet will not be downloaded (if non-local) until it is actually opened or unwrapped.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import typing as ty
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .. import log
|
|
10
|
+
from ..files import is_file_uri, path_from_uri
|
|
11
|
+
from ..hash_cache import filehash
|
|
12
|
+
from ..hashing import Hash
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Downloader(ty.Protocol):
    def __call__(self, hash: ty.Optional[Hash]) -> Path:
        """A closure over a URI that downloads a file locally and returns its path.

        The file may be placed anywhere, provided it remains readable until
        the program exits.

        If the URI points to a missing file, this MUST raise whatever
        Exception the underlying implementation chooses - it MUST NOT return
        a Path to a non-existent file.

        The Hash may be used to short-circuit a download whose result could
        not match the expectation; the Downloader need not re-verify the
        downloaded file afterwards, since the Source object does that by
        default.
        """


class DownloadHandler(ty.Protocol):
    def __call__(self, uri: str) -> ty.Optional[Downloader]:
        """Return a Downloader closure for this URI if this handler knows how
        to handle it; return None otherwise.
        """
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _LocalFileHandler(uri: str) -> ty.Optional[Downloader]:
    """Handle file:// URIs: nothing to transfer, only existence/hash checks."""
    if not is_file_uri(uri):
        return None

    def download_file(hash: ty.Optional[Hash]) -> Path:
        local = path_from_uri(uri)
        if not local.exists():
            raise FileNotFoundError(local)
        if hash:
            # verify eagerly since there is no actual download step to do it
            _check_hash(hash, local)
        return local

    return download_file
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def register_download_handler(key: str, handler: DownloadHandler) -> None:
    """Register a handler that can turn URIs it recognizes into Downloaders.

    Mirrors `register_from_uri_handler` in `_construct`.
    """
    # key is not currently used for anything other than avoiding
    # having duplicates registered for whatever reason.
    _DOWNLOAD_HANDLERS[key] = handler


_DOWNLOAD_HANDLERS: ty.Dict[str, DownloadHandler] = dict()
register_download_handler("local_file", _LocalFileHandler)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _get_download_handler(uri: str) -> Downloader:
    """Return the first registered Downloader that claims this URI.

    Raises ValueError when no registered handler recognizes the URI.
    """
    for handler in _DOWNLOAD_HANDLERS.values():
        downloader = handler(uri)
        if downloader is not None:
            return downloader
    raise ValueError(f"No SourcePath download handler for uri: {uri}")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class SourceHashMismatchError(ValueError):
    """Raised when a file's computed hash differs from the hash its Source expects."""


def _check_hash(expected_hash: ty.Optional[Hash], path: Path) -> Hash:
    """Hash `path` and verify it against `expected_hash` when one is given.

    Uses the expected hash's algorithm when supplied, falling back to sha256.
    Returns the computed Hash; raises SourceHashMismatchError on mismatch.
    """
    algo = expected_hash.algo if expected_hash else "sha256"
    with log.logger_context(hash_for=f"source-{algo}"):
        computed_hash = filehash(algo, path)
    if expected_hash and expected_hash != computed_hash:
        raise SourceHashMismatchError(
            f"{expected_hash.algo} mismatch for {path};"
            f" got {computed_hash.bytes!r}, expected {expected_hash.bytes!r}"
        )
    return computed_hash
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# this should later get promoted somewhere, probably
|
|
2
|
+
import json
|
|
3
|
+
import typing as ty
|
|
4
|
+
from functools import partial
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from thds.core import files, hashing, log, types
|
|
8
|
+
|
|
9
|
+
from . import _construct
|
|
10
|
+
from .src import Source
|
|
11
|
+
|
|
12
|
+
_SHA256_B64 = "sha256b64"
_MD5_B64 = "md5b64"

logger = log.getLogger(__name__)


def _from_sha256b64(d: dict) -> ty.Optional[hashing.Hash]:
    """Parse a sha256 Hash from a serialized Source dict, if present."""
    # use the shared key constant for the membership test as well as the
    # lookup - the previous duplicated literal was a drift hazard.
    if _SHA256_B64 in d:
        return hashing.Hash(algo="sha256", bytes=hashing.db64(d[_SHA256_B64]))
    return None


def _from_md5b64(d: dict) -> ty.Optional[hashing.Hash]:
    """Parse an md5 Hash from a serialized Source dict, if present."""
    if _MD5_B64 in d:
        return hashing.Hash(algo="md5", bytes=hashing.db64(d[_MD5_B64]))
    return None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# A HashParser inspects a deserialized Source dict and extracts a Hash, if it can.
HashParser = ty.Callable[[dict], ty.Optional[hashing.Hash]]
_BASE_PARSERS = (_from_sha256b64, _from_md5b64)


def base_parsers() -> ty.Tuple[HashParser, ...]:
    """The hash parsers supported out of the box (base64-encoded sha256 and md5)."""
    return _BASE_PARSERS
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def from_json(json_source: str, hash_parsers: ty.Collection[HashParser] = base_parsers()) -> Source:
    """Deserialize a Source from its canonical JSON representation.

    The first parser that recognizes a hash key in the payload wins; when
    none match, the resulting Source carries no hash.
    """
    d = json.loads(json_source)
    parsed_hash = next(filter(None, (p(d) for p in hash_parsers)), None)
    return _construct.from_uri(uri=d["uri"], hash=parsed_hash)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _generic_hash_serializer(
    algo: str, stringify_hash: ty.Callable[[bytes], str], keyname: str, hash: hashing.Hash
) -> ty.Optional[dict]:
    """Render `hash` as {keyname: stringified-bytes} iff its algorithm matches `algo`."""
    if hash.algo != algo:
        return None
    return {keyname: stringify_hash(hash.bytes)}


_to_sha256b64 = partial(_generic_hash_serializer, "sha256", hashing.b64, _SHA256_B64)
_to_md5b64 = partial(_generic_hash_serializer, "md5", hashing.b64, _MD5_B64)

# A HashSerializer renders a Hash as a single-key dict, or None when it does
# not handle that Hash's algorithm.
HashSerializer = ty.Callable[[hashing.Hash], ty.Optional[dict]]
_BASE_HASH_SERIALIZERS: ty.Tuple[HashSerializer, ...] = (_to_md5b64, _to_sha256b64)  # type: ignore


def base_hash_serializers() -> ty.Tuple[HashSerializer, ...]:
    """The hash serializers supported out of the box (md5 tried first, then sha256)."""
    return _BASE_HASH_SERIALIZERS
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def to_json(
    source: Source, hash_serializers: ty.Collection[HashSerializer] = base_hash_serializers()
) -> str:
    """Serialize a Source to its canonical JSON form: the uri plus an optional hash key."""
    hash_dict: dict = dict()
    if source.hash:
        # first serializer that handles this hash's algorithm wins
        hash_dict = next(filter(None, (ser(source.hash) for ser in hash_serializers)), None) or dict()
    return json.dumps(dict(uri=source.uri, **hash_dict))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def from_unknown_user_path(path: types.StrOrPath, desired_uri: str) -> Source:
    """Load a Source directly from a user-provided Path of unknown content.

    The file _might_ be a serialized (from_json) Source, but it might equally
    be a raw data file that needs to be wrapped via from_file - in which case
    `desired_uri` specifies where it should notionally live 'remotely' if such
    a thing becomes necessary.

    This is a _reasonable_ (but not guaranteed!) heuristic for telling the two
    apart. An application may need something more robust if its raw data is
    itself likely to be a JSON blob containing the key `uri`, for instance.
    """
    with open(path) as readable:
        try:
            # a serialized Source is tiny, so 4096 bytes is ample - and it
            # also keeps us from slurping an entire large raw file.
            return from_json(readable.read(4096))
        except (json.JSONDecodeError, UnicodeDecodeError):
            return _construct.from_file(path, uri=desired_uri)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def write_to_json_file(source: Source, local_file: Path) -> bool:
    """Write the canonical JSON serialization of the Source to a file.

    Returns True when the file was (re)written; False when it already held the
    identical serialization, in which case it is left untouched.
    """
    local_file.parent.mkdir(parents=True, exist_ok=True)
    new_source = to_json(source) + "\n"
    previous_source = local_file.read_text() if local_file.exists() else None
    if new_source == previous_source:
        return False
    with files.atomic_text_writer(local_file) as f:
        logger.info(f"Writing {source} to {local_file}")
        f.write(new_source)
    return True
|
thds/core/source/src.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import typing as ty
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .. import hashing
|
|
7
|
+
from . import _download
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class Source(os.PathLike):
    """A consistent in-memory representation for an abstract, **read-only**
    source of data that may not be present locally when an application starts.

    Source implements `os.PathLike` (`__fspath__`), so a transparent
    `open(src)` works and it is often a drop-in replacement for Path or str
    filenames. When an actual Path object is required, call `path()` - but
    prefer to defer that call to the point of actual use.

    'Wrapping' read-only data in these objects unifies the code that unwraps
    and uses them, which should make it easier to support different execution
    environments and sources of data. A Source could be a file on disk, but
    it could also be a file in ADLS.

    Libraries building on this one may also use this stable, consistent
    representation to find optimization opportunities, letting different
    underlying data sources fulfill the request based on environmental
    context - e.g. transparently transforming a local-path-based Source into
    one representing a remote file, without changing the semantics of the
    object as observed by the code.

    The Hash is part of the interface partly so that interacting libraries
    can treat it as a canonical 'name' for the data when one is available,
    and partly so that - for a few compute cycles - we gain a layer of
    consistency checking: since Sources represent read-only data, the Hash is
    a meaningful and persistent marker of data identity.

    Do not call the constructor in application code; use `from_file` or
    `from_uri` instead.
    """

    uri: str
    hash: ty.Optional[hashing.Hash] = None
    # equality and hashing derive only from the fields above - the object's
    # _identity_ - never from the caching machinery below.

    @property
    def cached_path(self) -> ty.Optional[Path]:
        """Public way to check whether the file is already present locally.

        Purely an optimization: neither its existence nor its value takes
        part in equality or hashing for this class.
        """
        return getattr(self, "__cached_path", None)

    def _set_cached_path(self, lpath: ty.Optional[Path]):
        """Protected interface for setting a cached Path, since the attribute
        is not available via the constructor.
        """
        # sidesteps dataclass(frozen=True); see
        # https://noklam.github.io/blog/posts/2022-04-22-python-dataclass-partiala-immutable.html
        super().__setattr__("__cached_path", lpath)

    def path(self) -> Path:
        """Resolve this Source to a local file path.

        The resulting data is meant to be read-only; make a copy before
        mutating it.

        When the data is not already local this incurs a one-time download.
        If the Source carries a Hash, the downloaded file is validated
        against it and a mismatch raises SourceHashMismatchError.
        """
        local = self.cached_path
        if local is None or not local.exists():
            # path() used to verify the hash itself, but since the hash is
            # handed to the downloader, the downloader decides how best to
            # verify its own download - no point duplicating its effort here.
            local = _download._get_download_handler(self.uri)(self.hash)
            self._set_cached_path(local)

        assert local and local.exists()
        return local

    def __fspath__(self) -> str:
        return os.fspath(self.path())
|
thds/core/source/tree.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import concurrent.futures
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import typing as ty
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .. import cm, link, logical_root, parallel, thunks, types
|
|
9
|
+
from .src import Source
|
|
10
|
+
|
|
11
|
+
_MAX_PARALLELISM = 90  # cap on concurrent downloads/copies


def _logical_tree_replication_operations(
    local_paths: ty.Iterable[Path], logical_local_root: Path, dest_dir: Path
) -> ty.Tuple[Path, ty.List[ty.Tuple[Path, Path]]]:
    """Pure function that plans the copy operations needed to replicate a tree.

    Returns (logical_dest, list of (src, dest) pairs), where logical_dest is
    the destination directory named after the logical root, and each dest
    preserves its source's position relative to that root.
    """
    logical_dest = dest_dir / logical_local_root.name
    operations = []
    for src in local_paths:
        operations.append((src, logical_dest / src.relative_to(logical_local_root)))
    return logical_dest, operations
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def replicate_logical_tree(
    local_paths: ty.Iterable[Path],
    logical_local_root: Path,
    dest_dir: Path,
    copy: ty.Callable[[Path, Path], ty.Any] = link.cheap_copy,
    executor_cm: ty.Optional[ty.ContextManager[concurrent.futures.Executor]] = None,
) -> Path:
    """Replicate only the specified files from the logical root into dest_dir.

    The destination subtree (dest_dir / <root name>) is removed first, so
    only the listed files end up present. Returns the path to the logical
    root in its new location.
    """
    logical_dest, operations = _logical_tree_replication_operations(
        local_paths, logical_local_root, dest_dir
    )

    top_level_of_logical_dest_dir = dest_dir / logical_local_root.name
    shutil.rmtree(top_level_of_logical_dest_dir, ignore_errors=True)

    def copy_to(src: Path, dest: Path) -> None:
        # parent dirs may not exist - we just removed the whole subtree
        dest.parent.mkdir(parents=True, exist_ok=True)
        copy(src, dest)

    copies = parallel.yield_all(
        ((src, thunks.thunking(copy_to)(src, dest)) for src, dest in operations),
        executor_cm=executor_cm,
    )
    for _ in parallel.failfast(copies):
        pass  # drain; failfast raises on the first error
    return top_level_of_logical_dest_dir
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class SourceTree(os.PathLike):
    """A fixed set of sources (with hashes where available), plus the
    (optional) logical root of the tree, so that they can be 'unwrapped' as a
    local directory structure.
    """

    sources: ty.List[Source]
    higher_logical_root: str = ""
    # Rather than the 'lowest common prefix' of a set of source URIs, a
    # consuming system may sometimes want a 'higher' root represented. When
    # specified, we find the lowest common prefix _above_ that.

    def path(self, dest_dir: ty.Optional[types.StrOrPath] = None) -> Path:
        """Return a local path to a directory corresponding to the logical root.

        This incurs a download of _all_ sources explicitly represented by the
        list.

        To _ensure_ that _only_ the listed sources are present in the
        directory (rather than whatever else an implementation-specific cache
        may hold), pass a Path to a directory you are willing to have
        emptied, and the files will be copied into it.
        """
        with cm.keep_context(
            concurrent.futures.ThreadPoolExecutor(max_workers=_MAX_PARALLELISM)
        ) as thread_pool:
            # src.path is a thunk that downloads the data if not already
            # present locally; Source allows registration of download
            # handlers by URI scheme.
            downloads = parallel.yield_all(
                ((src, src.path) for src in self.sources),
                executor_cm=thread_pool,
            )
            local_paths = [local_path for _, local_path in parallel.failfast(downloads)]
            local_logical_root = Path(
                logical_root.find(map(str, local_paths), self.higher_logical_root)
            )
            assert local_logical_root.is_dir()
            if not dest_dir:
                return local_logical_root

            return replicate_logical_tree(
                local_paths, local_logical_root, Path(dest_dir).resolve(), executor_cm=thread_pool
            )

    def __fspath__(self) -> str:  # implement the os.PathLike protocol
        return str(self.path())
|
thds/core/source_serde.py
CHANGED
|
@@ -1,104 +1,2 @@
|
|
|
1
|
-
#
|
|
2
|
-
import
|
|
3
|
-
import typing as ty
|
|
4
|
-
from functools import partial
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
|
|
7
|
-
from thds.core import files, hashing, log, source, types
|
|
8
|
-
|
|
9
|
-
_SHA256_B64 = "sha256b64"
|
|
10
|
-
_MD5_B64 = "md5b64"
|
|
11
|
-
|
|
12
|
-
logger = log.getLogger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def _from_sha256b64(d: dict) -> ty.Optional[hashing.Hash]:
|
|
16
|
-
if "sha256b64" in d:
|
|
17
|
-
return hashing.Hash(algo="sha256", bytes=hashing.db64(d[_SHA256_B64]))
|
|
18
|
-
return None
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def _from_md5b64(d: dict) -> ty.Optional[hashing.Hash]:
|
|
22
|
-
if "md5b64" in d:
|
|
23
|
-
return hashing.Hash(algo="md5", bytes=hashing.db64(d[_MD5_B64]))
|
|
24
|
-
return None
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
HashParser = ty.Callable[[dict], ty.Optional[hashing.Hash]]
|
|
28
|
-
_BASE_PARSERS = (_from_sha256b64, _from_md5b64)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def base_parsers() -> ty.Tuple[HashParser, ...]:
|
|
32
|
-
return _BASE_PARSERS
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def from_json(
|
|
36
|
-
json_source: str, hash_parsers: ty.Collection[HashParser] = base_parsers()
|
|
37
|
-
) -> source.Source:
|
|
38
|
-
d = json.loads(json_source)
|
|
39
|
-
return source.from_uri(
|
|
40
|
-
uri=d["uri"],
|
|
41
|
-
hash=next(filter(None, (p(d) for p in hash_parsers)), None),
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def _generic_hash_serializer(
|
|
46
|
-
algo: str, stringify_hash: ty.Callable[[bytes], str], keyname: str, hash: hashing.Hash
|
|
47
|
-
) -> ty.Optional[dict]:
|
|
48
|
-
if hash.algo == algo:
|
|
49
|
-
return {keyname: stringify_hash(hash.bytes)}
|
|
50
|
-
return None
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
_to_sha256b64 = partial(_generic_hash_serializer, "sha256", hashing.b64, _SHA256_B64)
|
|
54
|
-
_to_md5b64 = partial(_generic_hash_serializer, "md5", hashing.b64, _MD5_B64)
|
|
55
|
-
|
|
56
|
-
HashSerializer = ty.Callable[[hashing.Hash], ty.Optional[dict]]
|
|
57
|
-
_BASE_HASH_SERIALIZERS: ty.Tuple[HashSerializer, ...] = (_to_md5b64, _to_sha256b64) # type: ignore
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def base_hash_serializers() -> ty.Tuple[HashSerializer, ...]:
|
|
61
|
-
return _BASE_HASH_SERIALIZERS
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def to_json(
|
|
65
|
-
source: source.Source, hash_serializers: ty.Collection[HashSerializer] = base_hash_serializers()
|
|
66
|
-
) -> str:
|
|
67
|
-
hash_dict = (
|
|
68
|
-
next(filter(None, (ser(source.hash) for ser in hash_serializers if source.hash)), None)
|
|
69
|
-
) or dict()
|
|
70
|
-
return json.dumps(dict(uri=source.uri, **hash_dict))
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def from_unknown_user_path(path: types.StrOrPath, desired_uri: str) -> source.Source:
|
|
74
|
-
"""Sometimes you may want to load a Source directly from a Path provided by a user.
|
|
75
|
-
|
|
76
|
-
It _might_ represent something loadable as a from_json Source, but it might just be a
|
|
77
|
-
raw file that needs to be loaded with from_file!
|
|
78
|
-
|
|
79
|
-
This is a _reasonable_ (but not guaranteed!) way of trying to ascertain which one it
|
|
80
|
-
is, and specifying where it should live 'remotely' if such a thing becomes
|
|
81
|
-
necessary.
|
|
82
|
-
|
|
83
|
-
Your application might need to implement something more robust if the
|
|
84
|
-
actual underlying data is likely to be a JSON blob containing the key `uri`, for
|
|
85
|
-
instance.
|
|
86
|
-
"""
|
|
87
|
-
with open(path) as readable:
|
|
88
|
-
try:
|
|
89
|
-
return from_json(readable.read(4096))
|
|
90
|
-
except (json.JSONDecodeError, UnicodeDecodeError):
|
|
91
|
-
return source.from_file(path, uri=desired_uri)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def write_to_json_file(source: source.Source, local_file: Path) -> bool:
|
|
95
|
-
"""Write the canonical JSON serialization of the Source to a file."""
|
|
96
|
-
local_file.parent.mkdir(parents=True, exist_ok=True)
|
|
97
|
-
previous_source = local_file.read_text() if local_file.exists() else None
|
|
98
|
-
new_source = to_json(source) + "\n"
|
|
99
|
-
if new_source != previous_source:
|
|
100
|
-
with files.atomic_text_writer(local_file) as f:
|
|
101
|
-
logger.info(f"Writing {source} to {local_file}")
|
|
102
|
-
f.write(new_source)
|
|
103
|
-
return True
|
|
104
|
-
return False
|
|
1
|
+
# deprecated alias for source.serde
|
|
2
|
+
from .source.serde import from_json, from_unknown_user_path, to_json, write_to_json_file # noqa: F401
|
|
@@ -2,13 +2,14 @@ thds/core/__init__.py,sha256=imKpmnrBV0_7-1d1Pc2yR5jxbvOrmIplLm7Ig_eK1OU,934
|
|
|
2
2
|
thds/core/ansi_esc.py,sha256=QZ3CptZbX4N_hyP2IgqfTbNt9tBPaqy7ReTMQIzGbrc,870
|
|
3
3
|
thds/core/cache.py,sha256=nL0oAyZrhPqyBBLevnOWSWVoEBrftaG3aE6Qq6tvmAA,7153
|
|
4
4
|
thds/core/calgitver.py,sha256=HklIz-SczK92Vm2rXtTSDiVxAcxUW_GPVCRRGt4BmBA,2324
|
|
5
|
+
thds/core/cm.py,sha256=WZB8eQU0DaBYj9s97nc3PuCtai9guovfyiQH68zhLzY,1086
|
|
5
6
|
thds/core/concurrency.py,sha256=NQunF_tJ_z8cfVyhzkTPlb-nZrgu-vIk9_3XffgscKQ,3520
|
|
6
7
|
thds/core/config.py,sha256=N-WVpPDrfTSFKz0m7WrqZPBdd17dycpDx9nhbATkf3c,9092
|
|
7
8
|
thds/core/decos.py,sha256=VpFTKTArXepICxN4U8C8J6Z5KDq-yVjFZQzqs2jeVAk,1341
|
|
8
9
|
thds/core/dict_utils.py,sha256=MAVkGJg4KQN1UGBLEKuPdQucZaXg_jJakujQ-GUrYzw,6471
|
|
9
10
|
thds/core/env.py,sha256=M36CYkPZ5AUf_-n8EqjsMGwWOzaKEn0KgRwnqUK7jS4,1094
|
|
10
11
|
thds/core/exit_after.py,sha256=0lz63nz2NTiIdyBDYyRa9bQShxQKe7eISy8VhXeW4HU,3485
|
|
11
|
-
thds/core/files.py,sha256=
|
|
12
|
+
thds/core/files.py,sha256=35vhbaDv4OkL_n1PCM-ki54708aibFMrnURpth_5UsA,4556
|
|
12
13
|
thds/core/fretry.py,sha256=Tui2q6vXV6c7mjTa1czLrXiugHUEwQp-sZdiwXfxvmM,3829
|
|
13
14
|
thds/core/generators.py,sha256=rcdFpPj0NMJWSaSZTnBfTeZxTTORNB633Lng-BW1284,1939
|
|
14
15
|
thds/core/git.py,sha256=I6kaEvwcvVxCLYHhTTfnHle-GkmgOR9_fHs03QxgBfI,2792
|
|
@@ -21,8 +22,9 @@ thds/core/inspect.py,sha256=vCxKqw8XG2W1cuj0MwjdXhe9TLQrGdjRraS6UEYsbf8,1955
|
|
|
21
22
|
thds/core/iterators.py,sha256=d3iTQDR0gCW1nMRmknQeodR_4THzR9Ajmp8F8KCCFgg,208
|
|
22
23
|
thds/core/lazy.py,sha256=e1WvG4LsbEydV0igEr_Vl1cq05zlQNIE8MFYT90yglE,3289
|
|
23
24
|
thds/core/link.py,sha256=kmFJIFvEZc16-7S7IGvtTpzwl3VuvFl3yPlE6WJJ03w,5404
|
|
25
|
+
thds/core/logical_root.py,sha256=gWkIYRv9kNQfzbpxJaYiwNXVz1neZ2NvnvProtOn9d8,1399
|
|
24
26
|
thds/core/merge_args.py,sha256=7oj7dtO1-XVkfTM3aBlq3QlZbo8tb6X7E3EVIR-60t8,5781
|
|
25
|
-
thds/core/meta.json,sha256=
|
|
27
|
+
thds/core/meta.json,sha256=yYe9d8_WiKFL5-fGrUD1hBo8tCJtbY-wet_cGdgcy_A,196
|
|
26
28
|
thds/core/meta.py,sha256=IPLAKrH06HooPMNf5FeqJvUcM-JljTGXddrAQ5oAX8E,16896
|
|
27
29
|
thds/core/parallel.py,sha256=HXAn9aIYqNE5rnRN5ypxR6CUucdfzE5T5rJ_MUv-pFk,7590
|
|
28
30
|
thds/core/pickle_visit.py,sha256=QNMWIi5buvk2zsvx1-D-FKL7tkrFUFDs387vxgGebgU,833
|
|
@@ -32,8 +34,7 @@ thds/core/protocols.py,sha256=4na2EeWUDWfLn5-SxfMmKegDSndJ5z-vwMhDavhCpEM,409
|
|
|
32
34
|
thds/core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
35
|
thds/core/scaling.py,sha256=f7CtdgK0sN6nroTq5hLAkG8xwbWhbCZUULstSKjoxO0,1615
|
|
34
36
|
thds/core/scope.py,sha256=iPRhS-lIe-axDctqxBtEPeF0PM_w-0tRS-9kPweUGBY,7205
|
|
35
|
-
thds/core/
|
|
36
|
-
thds/core/source_serde.py,sha256=TqW4MTrXQ49JJaXrkmFcymSBIWvBlkxO2JOPNQap_F4,3523
|
|
37
|
+
thds/core/source_serde.py,sha256=X4c7LiT3VidejqtTel9YB6dWGB3x-ct39KF9E50Nbx4,139
|
|
37
38
|
thds/core/stack_context.py,sha256=17lPOuYWclUpZ-VXRkPgI4WbiMzq7_ZY6Kj1QK_1oNo,1332
|
|
38
39
|
thds/core/thunks.py,sha256=p1OvMBJ4VGMsD8BVA7zwPeAp0L3y_nxVozBF2E78t3M,1053
|
|
39
40
|
thds/core/timer.py,sha256=1FfcQ4-Gp6WQFXR0GKeT_8jwtamEfnTukdSbDKTAJVM,5432
|
|
@@ -45,6 +46,12 @@ thds/core/log/json_formatter.py,sha256=C5bRsSbAqaQqfTm88jc3mYe3vwKZZLAxET8s7_u7a
|
|
|
45
46
|
thds/core/log/kw_formatter.py,sha256=9-MVOd2r5NEkYNne9qWyFMeR5lac3w7mjHXsDa681i0,3379
|
|
46
47
|
thds/core/log/kw_logger.py,sha256=CyZVPnkUMtrUL2Lyk261AIEPmoP-buf_suFAhQlU1io,4063
|
|
47
48
|
thds/core/log/logfmt.py,sha256=qS6BbdlOZPRnxmHenVL3uK43OQ30NJUnz92S6d_Yh2A,10360
|
|
49
|
+
thds/core/source/__init__.py,sha256=RiaUHNunoaw4XJUrwR5vJzSS6HGxOUKUONR_ipX5654,424
|
|
50
|
+
thds/core/source/_construct.py,sha256=klN6-fSJrsbbUhp92wzhJcF73h_PKKJItNLC__vwlIs,3122
|
|
51
|
+
thds/core/source/_download.py,sha256=pUhkphHdB7y4ZpxZZ6ITIS5giXMHuRf420yYAJwx6aE,2924
|
|
52
|
+
thds/core/source/serde.py,sha256=wXCfuv_Dv3QvJJr-uebGmTrfhCU_1a8VX3VJnXhVHfU,3539
|
|
53
|
+
thds/core/source/src.py,sha256=A1PSR5vANLwnUWLsFNVLkkeUdaidzRAzq8vri_a5w9E,4141
|
|
54
|
+
thds/core/source/tree.py,sha256=vjAqnQXGE0XiI0WvlLyXGqEAZbyjq6XmdUeWAR0HI4M,4144
|
|
48
55
|
thds/core/sqlite/__init__.py,sha256=tDMzuO76qTtckJHldPQ6nPZ6kcvhhoJrVuuW42JtaSQ,606
|
|
49
56
|
thds/core/sqlite/connect.py,sha256=l4QaSAI8RjP7Qh2FjmJ3EwRgfGf65Z3-LjtC9ocHM_U,977
|
|
50
57
|
thds/core/sqlite/copy.py,sha256=y3IRQTBrWDfKuVIfW7fYuEgwRCRKHjN0rxVFkIb9VrQ,1155
|
|
@@ -60,8 +67,8 @@ thds/core/sqlite/structured.py,sha256=swCbDoyVT6cE7Kl79Wh_rg5Z1-yrUDJbiVJF4bjset
|
|
|
60
67
|
thds/core/sqlite/types.py,sha256=oUkfoKRYNGDPZRk29s09rc9ha3SCk2SKr_K6WKebBFs,1308
|
|
61
68
|
thds/core/sqlite/upsert.py,sha256=BmKK6fsGVedt43iY-Lp7dnAu8aJ1e9CYlPVEQR2pMj4,5827
|
|
62
69
|
thds/core/sqlite/write.py,sha256=z0219vDkQDCnsV0WLvsj94keItr7H4j7Y_evbcoBrWU,3458
|
|
63
|
-
thds.core-1.
|
|
64
|
-
thds.core-1.
|
|
65
|
-
thds.core-1.
|
|
66
|
-
thds.core-1.
|
|
67
|
-
thds.core-1.
|
|
70
|
+
thds.core-1.32.20250218201534.dist-info/METADATA,sha256=fBdWYnrnIKHNioHwRHhFjWY_tUm-atCCLoHJgdoSqds,2123
|
|
71
|
+
thds.core-1.32.20250218201534.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
72
|
+
thds.core-1.32.20250218201534.dist-info/entry_points.txt,sha256=bOCOVhKZv7azF3FvaWX6uxE6yrjK6FcjqhtxXvLiFY8,161
|
|
73
|
+
thds.core-1.32.20250218201534.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
|
|
74
|
+
thds.core-1.32.20250218201534.dist-info/RECORD,,
|
thds/core/source.py
DELETED
|
@@ -1,238 +0,0 @@
|
|
|
1
|
-
"""Wrap openable, read-only data that is either locally-present or downloadable,
|
|
2
|
-
|
|
3
|
-
yet will not be downloaded (if non-local) until it is actually opened or unwrapped.
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import os
|
|
7
|
-
import typing as ty
|
|
8
|
-
from dataclasses import dataclass
|
|
9
|
-
from functools import partial
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
|
|
12
|
-
from . import log
|
|
13
|
-
from .files import is_file_uri, path_from_uri, to_uri
|
|
14
|
-
from .hash_cache import filehash
|
|
15
|
-
from .hashing import Hash
|
|
16
|
-
from .types import StrOrPath
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class Downloader(ty.Protocol):
|
|
20
|
-
def __call__(self, hash: ty.Optional[Hash]) -> Path:
|
|
21
|
-
"""Closure over a URI that downloads a file to a local path and returns the path.
|
|
22
|
-
The file may be placed anywhere as long as the file will be readable until the
|
|
23
|
-
program exits.
|
|
24
|
-
|
|
25
|
-
If the URI points to a missing file, this MUST raise any Exception that the
|
|
26
|
-
underlying implementation desires. It MUST NOT return a Path pointing to a
|
|
27
|
-
non-existent file.
|
|
28
|
-
|
|
29
|
-
The Hash may be used to short-circuit a download that would result in downloading
|
|
30
|
-
a file that does not match the expected hash, but the Downloader need not verify
|
|
31
|
-
the Hash of the file downloaded after the fact, as that will be performed by
|
|
32
|
-
default by the Source object.
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class DownloadHandler(ty.Protocol):
|
|
37
|
-
def __call__(self, uri: str) -> ty.Optional[Downloader]:
|
|
38
|
-
"""Returns a Downloader containing the URI if this URI can be handled. Returns
|
|
39
|
-
None if this URI cannot be handled.
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def _LocalFileHandler(uri: str) -> ty.Optional[Downloader]:
|
|
44
|
-
if not is_file_uri(uri):
|
|
45
|
-
return None
|
|
46
|
-
|
|
47
|
-
def download_file(hash: ty.Optional[Hash]) -> Path:
|
|
48
|
-
lpath = path_from_uri(uri)
|
|
49
|
-
if not lpath.exists():
|
|
50
|
-
raise FileNotFoundError(lpath)
|
|
51
|
-
if hash:
|
|
52
|
-
_check_hash(hash, lpath)
|
|
53
|
-
return lpath
|
|
54
|
-
|
|
55
|
-
return download_file
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def register_download_handler(key: str, handler: DownloadHandler):
|
|
59
|
-
# key is not currently used for anything other than avoiding
|
|
60
|
-
# having duplicates registered for whatever reason.
|
|
61
|
-
_DOWNLOAD_HANDLERS[key] = handler
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
_DOWNLOAD_HANDLERS: ty.Dict[str, DownloadHandler] = dict()
|
|
65
|
-
register_download_handler("local_file", _LocalFileHandler)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def _get_download_handler(uri: str) -> Downloader:
|
|
69
|
-
for handler in _DOWNLOAD_HANDLERS.values():
|
|
70
|
-
if downloader := handler(uri):
|
|
71
|
-
return downloader
|
|
72
|
-
raise ValueError(f"No SourcePath download handler for uri: {uri}")
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
class SourceHashMismatchError(ValueError):
|
|
76
|
-
pass
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _check_hash(expected_hash: ty.Optional[Hash], path: Path) -> Hash:
|
|
80
|
-
hash_algo = expected_hash.algo if expected_hash else "sha256"
|
|
81
|
-
with log.logger_context(hash_for=f"source-{hash_algo}"):
|
|
82
|
-
computed_hash = filehash(hash_algo, path)
|
|
83
|
-
if expected_hash and expected_hash != computed_hash:
|
|
84
|
-
raise SourceHashMismatchError(
|
|
85
|
-
f"{expected_hash.algo} mismatch for {path};"
|
|
86
|
-
f" got {computed_hash.bytes!r}, expected {expected_hash.bytes!r}"
|
|
87
|
-
)
|
|
88
|
-
return computed_hash
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
@dataclass(frozen=True)
|
|
92
|
-
class Source(os.PathLike):
|
|
93
|
-
"""Source is meant to be a consistent in-memory representation for an abstract,
|
|
94
|
-
**read-only** source of data that may not be present locally when an application
|
|
95
|
-
starts.
|
|
96
|
-
|
|
97
|
-
A Source uses `os.PathLike` (`__fspath__`) to support transparent `open(src)` calls,
|
|
98
|
-
so in many cases it will be a drop-in replacement for Path or str filenames. If you
|
|
99
|
-
need an actual Path object, you can call `path()` to get one, but you should prefer to
|
|
100
|
-
defer this until the actual location of use.
|
|
101
|
-
|
|
102
|
-
By 'wrapping' read-only data in these objects, we can unify the code around how we
|
|
103
|
-
unwrap and use them, which should allow us to more easily support different execution
|
|
104
|
-
environments and sources of data.
|
|
105
|
-
|
|
106
|
-
For instance, a Source could be a file on disk, but it could also be a file in
|
|
107
|
-
ADLS.
|
|
108
|
-
|
|
109
|
-
Furthermore, libraries which build on top of this one may use this representation to
|
|
110
|
-
identify opportunities for optimization, by representing the Source in a stable
|
|
111
|
-
and consistent format that allows different underlying data sources to fulfill the
|
|
112
|
-
request for the data based on environmental context. A library could choose to
|
|
113
|
-
transparently transform a local-path-based Source into a Source representing a
|
|
114
|
-
remote file, without changing the semantics of the object as observed by the code.
|
|
115
|
-
|
|
116
|
-
One reason a Hash is part of the interface is so that libraries interacting with the
|
|
117
|
-
object can use the hash as a canonical 'name' for the data, if one is available.
|
|
118
|
-
|
|
119
|
-
Another reason is that we can add a layer of consistency checking to data we're
|
|
120
|
-
working with, at the cost of a few compute cycles. Since Sources are meant to represent
|
|
121
|
-
read-only data, the Hash is a meaningful and persistent marker of data identity.
|
|
122
|
-
|
|
123
|
-
Do not call its constructor in application code. Use `from_file` or `from_uri` instead.
|
|
124
|
-
"""
|
|
125
|
-
|
|
126
|
-
uri: str
|
|
127
|
-
hash: ty.Optional[Hash] = None
|
|
128
|
-
# hash and equality are based only on the _identity_ of the object,
|
|
129
|
-
# not on the other properties that provide some caching functionality.
|
|
130
|
-
|
|
131
|
-
@property
|
|
132
|
-
def cached_path(self) -> ty.Optional[Path]:
|
|
133
|
-
"""This is part of the public interface as far as checking to see whether a file
|
|
134
|
-
is already present locally, but its existence and value is not part of equality or
|
|
135
|
-
the hash for this class - it exists purely as an optimization.
|
|
136
|
-
"""
|
|
137
|
-
return getattr(self, "__cached_path", None)
|
|
138
|
-
|
|
139
|
-
def _set_cached_path(self, lpath: ty.Optional[Path]):
|
|
140
|
-
"""protected interface for setting a cached Path since the attribute is not
|
|
141
|
-
available via the constructor.
|
|
142
|
-
"""
|
|
143
|
-
super().__setattr__("__cached_path", lpath) # this works around dataclass.frozen.
|
|
144
|
-
# https://noklam.github.io/blog/posts/2022-04-22-python-dataclass-partiala-immutable.html
|
|
145
|
-
|
|
146
|
-
def path(self) -> Path:
|
|
147
|
-
"""Any Source can be turned into a local file path.
|
|
148
|
-
|
|
149
|
-
Remember that the resulting data is meant to be read-only. If you want to mutate
|
|
150
|
-
the data, you should first make a copy.
|
|
151
|
-
|
|
152
|
-
If not already present locally, this will incur a one-time download. Then, if the
|
|
153
|
-
Source has a Hash, the Hash will be validated against the downloaded file, and a
|
|
154
|
-
failure will raise SourceHashMismatchError.
|
|
155
|
-
"""
|
|
156
|
-
if self.cached_path is None or not self.cached_path.exists():
|
|
157
|
-
lpath = _get_download_handler(self.uri)(self.hash)
|
|
158
|
-
# path() used to be responsible for checking the hash, but since we pass it to the downloader,
|
|
159
|
-
# it really makes more sense to allow the downloader to decide how to verify its own download,
|
|
160
|
-
# and we don't want to duplicate any effort that it may have already put in.
|
|
161
|
-
self._set_cached_path(lpath)
|
|
162
|
-
|
|
163
|
-
assert self.cached_path and self.cached_path.exists()
|
|
164
|
-
return self.cached_path
|
|
165
|
-
|
|
166
|
-
def __fspath__(self) -> str:
|
|
167
|
-
return os.fspath(self.path())
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
# Creation from local Files or from remote URIs
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
def from_file(filename: StrOrPath, hash: ty.Optional[Hash] = None, uri: str = "") -> Source:
|
|
174
|
-
"""Create a read-only Source from a local file that already exists.
|
|
175
|
-
|
|
176
|
-
If URI is passed, the local file will be read and hashed, but the final URI in the
|
|
177
|
-
Source will be the one provided explicitly. NO UPLOAD IS PERFORMED. It is your
|
|
178
|
-
responsibility to ensure that your file has been uploaded to the URI you provide.
|
|
179
|
-
"""
|
|
180
|
-
path = path_from_uri(filename) if isinstance(filename, str) else filename
|
|
181
|
-
assert isinstance(path, Path)
|
|
182
|
-
if not path.exists():
|
|
183
|
-
raise FileNotFoundError(path)
|
|
184
|
-
|
|
185
|
-
if uri:
|
|
186
|
-
src = from_uri(uri, _check_hash(hash, path))
|
|
187
|
-
else:
|
|
188
|
-
src = Source(to_uri(path), _check_hash(hash, path))
|
|
189
|
-
src._set_cached_path(path) # internally, it's okay to hack around immutability.
|
|
190
|
-
return src
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
class FromUri(ty.Protocol):
|
|
194
|
-
def __call__(self, hash: ty.Optional[Hash]) -> Source:
|
|
195
|
-
"""Closure over a URI that creates a Source from a URI.
|
|
196
|
-
|
|
197
|
-
The Hash may be used to short-circuit creation that would result in creating
|
|
198
|
-
a Source that cannot match the expected Hash, but this is not required,
|
|
199
|
-
and the hash will be included in the Source object regardless, and will
|
|
200
|
-
be validated (if non-nil) at the time of source data access.
|
|
201
|
-
"""
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
class FromUriHandler(ty.Protocol):
|
|
205
|
-
def __call__(self, uri: str) -> ty.Optional[FromUri]:
|
|
206
|
-
"""Returns a FromUri object containing the URI if this URI can be handled. Returns
|
|
207
|
-
None if this URI cannot be handled.
|
|
208
|
-
"""
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
def register_from_uri_handler(key: str, handler: FromUriHandler):
|
|
212
|
-
"""If a library wants to customize how Sources are created from URIs that it handles,
|
|
213
|
-
it can register a handler here.
|
|
214
|
-
"""
|
|
215
|
-
# key is not currently used for anything other than avoiding
|
|
216
|
-
# having duplicates registered for whatever reason.
|
|
217
|
-
_FROM_URI_HANDLERS[key] = handler
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
_FROM_URI_HANDLERS: ty.Dict[str, FromUriHandler] = dict()
|
|
221
|
-
register_from_uri_handler(
|
|
222
|
-
"local_file", lambda uri: partial(from_file, path_from_uri(uri)) if is_file_uri(uri) else None
|
|
223
|
-
)
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
def from_uri(uri: str, hash: ty.Optional[Hash] = None) -> Source:
|
|
227
|
-
"""Create a read-only Source from a URI. The data should already exist at this remote
|
|
228
|
-
URI, although Source itself can make no guarantee that it always represents real data
|
|
229
|
-
- only that any data it does represent is read-only.
|
|
230
|
-
|
|
231
|
-
It may be advantageous for a URI-handling library to register a more specific
|
|
232
|
-
implementation of this function, if it is capable of determining a Hash for the blob
|
|
233
|
-
represented by the URI without downloading the blob.
|
|
234
|
-
"""
|
|
235
|
-
for handler in _FROM_URI_HANDLERS.values():
|
|
236
|
-
if from_uri_ := handler(uri):
|
|
237
|
-
return from_uri_(hash)
|
|
238
|
-
return Source(uri=uri, hash=hash)
|
|
File without changes
|
{thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{thds.core-1.31.20250213162956.dist-info → thds.core-1.32.20250218201534.dist-info}/top_level.txt
RENAMED
|
File without changes
|