thds.core 0.0.1__py3-none-any.whl → 1.31.20250123022540__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.core might be problematic.
- thds/core/__init__.py +48 -0
- thds/core/ansi_esc.py +46 -0
- thds/core/cache.py +201 -0
- thds/core/calgitver.py +82 -0
- thds/core/concurrency.py +100 -0
- thds/core/config.py +250 -0
- thds/core/decos.py +55 -0
- thds/core/dict_utils.py +188 -0
- thds/core/env.py +40 -0
- thds/core/exit_after.py +121 -0
- thds/core/files.py +125 -0
- thds/core/fretry.py +115 -0
- thds/core/generators.py +56 -0
- thds/core/git.py +81 -0
- thds/core/hash_cache.py +86 -0
- thds/core/hashing.py +106 -0
- thds/core/home.py +15 -0
- thds/core/hostname.py +10 -0
- thds/core/imports.py +17 -0
- thds/core/inspect.py +58 -0
- thds/core/iterators.py +9 -0
- thds/core/lazy.py +83 -0
- thds/core/link.py +153 -0
- thds/core/log/__init__.py +29 -0
- thds/core/log/basic_config.py +171 -0
- thds/core/log/json_formatter.py +43 -0
- thds/core/log/kw_formatter.py +84 -0
- thds/core/log/kw_logger.py +93 -0
- thds/core/log/logfmt.py +302 -0
- thds/core/merge_args.py +168 -0
- thds/core/meta.json +8 -0
- thds/core/meta.py +518 -0
- thds/core/parallel.py +200 -0
- thds/core/pickle_visit.py +24 -0
- thds/core/prof.py +276 -0
- thds/core/progress.py +112 -0
- thds/core/protocols.py +17 -0
- thds/core/py.typed +0 -0
- thds/core/scaling.py +39 -0
- thds/core/scope.py +199 -0
- thds/core/source.py +238 -0
- thds/core/source_serde.py +104 -0
- thds/core/sqlite/__init__.py +21 -0
- thds/core/sqlite/connect.py +33 -0
- thds/core/sqlite/copy.py +35 -0
- thds/core/sqlite/ddl.py +4 -0
- thds/core/sqlite/functions.py +63 -0
- thds/core/sqlite/index.py +22 -0
- thds/core/sqlite/insert_utils.py +23 -0
- thds/core/sqlite/merge.py +84 -0
- thds/core/sqlite/meta.py +190 -0
- thds/core/sqlite/read.py +66 -0
- thds/core/sqlite/sqlmap.py +179 -0
- thds/core/sqlite/structured.py +138 -0
- thds/core/sqlite/types.py +64 -0
- thds/core/sqlite/upsert.py +139 -0
- thds/core/sqlite/write.py +99 -0
- thds/core/stack_context.py +41 -0
- thds/core/thunks.py +40 -0
- thds/core/timer.py +214 -0
- thds/core/tmp.py +85 -0
- thds/core/types.py +4 -0
- thds.core-1.31.20250123022540.dist-info/METADATA +68 -0
- thds.core-1.31.20250123022540.dist-info/RECORD +67 -0
- {thds.core-0.0.1.dist-info → thds.core-1.31.20250123022540.dist-info}/WHEEL +1 -1
- thds.core-1.31.20250123022540.dist-info/entry_points.txt +4 -0
- thds.core-1.31.20250123022540.dist-info/top_level.txt +1 -0
- thds.core-0.0.1.dist-info/METADATA +0 -8
- thds.core-0.0.1.dist-info/RECORD +0 -4
- thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/files.py
ADDED
@@ -0,0 +1,125 @@
"""Various assorted file-related utilities."""

import hashlib
import os
import resource
import shutil
import stat
import typing as ty
from contextlib import contextmanager
from io import BufferedWriter, TextIOWrapper
from pathlib import Path

from . import config, hashing
from .log import getLogger
from .tmp import temppath_same_fs
from .types import StrOrPath

FILE_SCHEME = "file://"
logger = getLogger(__name__)


def set_read_only(fpath: StrOrPath):
    # thank you https://stackoverflow.com/a/51262451
    logger.debug("Setting '%s' to read-only", fpath)
    perms = stat.S_IMODE(os.lstat(fpath).st_mode)
    ro_mask = 0o777 ^ (stat.S_IWRITE | stat.S_IWGRP | stat.S_IWOTH)
    os.chmod(fpath, perms & ro_mask)


def remove_file_scheme(uri: str) -> str:
    """Does not require the file scheme to exist, but removes it if it's there."""
    return uri[len(FILE_SCHEME) :] if uri.startswith(FILE_SCHEME) else uri


def path_from_uri(uri: str) -> Path:
    str_path = remove_file_scheme(uri)
    if not str_path:
        raise ValueError('Cannot convert an empty string to a Path. Did you mean to use "."?')
    return Path(str_path)


def to_uri(path: Path) -> str:
    return FILE_SCHEME + os.fspath(path.resolve())


def is_file_uri(uri: str) -> bool:
    return uri.startswith(FILE_SCHEME)


@contextmanager
def atomic_write_path(destination: StrOrPath) -> ty.Iterator[Path]:
    """Shorthand context manager for doing an atomic write (i.e., write to a temporary file,
    then atomically move that temporary file to your final destination.

    You must open and then close the file within the provided context. Unclosed files
    will likely result in data loss or other bugs.
    """
    destpath = path_from_uri(destination) if isinstance(destination, str) else Path(destination)
    with temppath_same_fs(destpath) as temp_writable_path:
        yield temp_writable_path
        destpath.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(temp_writable_path), destpath)


@contextmanager
def atomic_binary_writer(destination: StrOrPath) -> ty.Iterator[BufferedWriter]:
    """Even shorter shorthand for writing binary data to a file, atomically."""
    with atomic_write_path(destination) as temp_writable_path:
        with open(temp_writable_path, "wb") as f:
            yield f


@contextmanager
def atomic_text_writer(destination: StrOrPath) -> ty.Iterator[TextIOWrapper]:
    """Even shorter shorthand for writing text data to a file, atomically."""
    with atomic_write_path(destination) as temp_writable_path:
        with open(temp_writable_path, "w") as f:
            yield f


OPEN_FILES_LIMIT = config.item("limit_open", 10000)


def set_file_limit(n: int):
    """Works like calling `ulimit -Sn <N>` on a Mac."""
    resource.setrlimit(resource.RLIMIT_NOFILE, (n, n))
    assert resource.getrlimit(resource.RLIMIT_NOFILE) == (n, n)


def bump_limits():
    """It was common to have to do this manually on our macs. Now that is no longer required."""
    set_file_limit(OPEN_FILES_LIMIT())


def shorten_filename(maybe_too_long_name: StrOrPath, max_len: int = 255, retain_last: int = 30) -> str:
    """Shortens a filename, using a deterministic and probabilistically-unique (hash-based) algorithm.

    The filename is only changed if it exceeds the provided max_len limit.

    The limit defaults to 255 _bytes_ since that is what many filesystems have
    generally supported. https://en.wikipedia.org/wiki/Comparison_of_file_systems#Limits

    We intentionally take our 'bite' out of the middle of the filename, so that the file extension is preserved
    and so that the first part of the path also remains human-readable.
    """
    # p for Path, s for str, b for bytes - too many things flying around to keep track of without this.
    s_maybe_too_long_name = Path(maybe_too_long_name).name
    b_filename = s_maybe_too_long_name.encode()

    if len(b_filename) <= max_len:
        # no need to mess with anything - it will 'fit' inside the root path already.
        return s_maybe_too_long_name

    b_md5_of_filename = (
        b"-md5-" + hashing.hash_using(b_filename, hashlib.md5()).hexdigest().encode() + b"-"
    )
    b_last_n = b_filename[-retain_last:]
    b_first_n = b_filename[: max_len - len(b_md5_of_filename) - len(b_last_n)]
    b_modified_filename = b_first_n + b_md5_of_filename + b_last_n
    assert len(b_modified_filename) <= max_len, (
        b_modified_filename,
        len(b_modified_filename),
        max_len,
    )
    return b_modified_filename.decode()
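A minimal usage sketch for the atomic-write and filename helpers above, assuming thds.core is installed; the destination filename is illustrative, not taken from the package:

from thds.core import files

# The destination only appears once the context exits cleanly; partial writes never land there.
with files.atomic_text_writer("example-report.txt") as f:
    f.write("all-or-nothing contents\n")

# shorten_filename only rewrites names longer than max_len bytes, keeping the tail (and extension) readable.
short = files.shorten_filename("x" * 300 + ".csv")
assert len(short.encode()) <= 255 and short.endswith(".csv")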
thds/core/fretry.py
ADDED
@@ -0,0 +1,115 @@
"""A more composable retry decorator."""

import random
import time
import typing as ty
from functools import wraps
from logging import getLogger
from timeit import default_timer

F = ty.TypeVar("F", bound=ty.Callable)


IsRetryable = ty.Callable[[Exception], bool]
RetryStrategy = ty.Iterable[IsRetryable]
RetryStrategyFactory = ty.Callable[[], RetryStrategy]


def expo(
    *, retries: int, delay: float = 1.0, backoff: int = 2, jitter: bool = True
) -> ty.Callable[[], ty.Iterator[float]]:
    """End iteration after yielding 'retries' times.

    The first retry is immediate (i.e. 0). Subsequent retries will follow the schedule
    established by the exponential backoff algorithm. The default algorithm is 1, 3, 7,
    15, etc., but also adds jitter.

    If you want infinite exponential values, pass a negative number for 'retries'.
    """

    def expo_() -> ty.Iterator[float]:
        count = 0
        accum_jitter = 0.0
        while retries < 0 or count < retries:
            expo_delay = (backoff**count * delay) - delay  # first retry is immediate
            if jitter:
                jitter_delay = random.uniform(0.5, 1.5) * expo_delay
                yield jitter_delay + accum_jitter
                accum_jitter = expo_delay - jitter_delay
            else:
                yield expo_delay
            count += 1

    return expo_


def sleep(
    mk_seconds_iter: ty.Callable[[], ty.Iterable[float]],
    sleeper: ty.Callable[[float], ty.Any] = time.sleep,
) -> ty.Callable[[], ty.Iterator[str]]:
    """A common base strategy for separating retries by sleeps.

    Yield once prior to the first sleep, and once before each sleep.
    In other words, the total number of yields is the length of the input iterable (if it is finite).
    """

    def sleep_() -> ty.Iterator[str]:
        start = default_timer()

        so_far = 0.0
        for i, secs in enumerate(mk_seconds_iter(), start=1):
            yield f"attempt {i} after {so_far:.2f}s"
            so_far = default_timer() - start
            sleeper(secs)

    return sleep_


def retry(retry_strategy_factory: RetryStrategyFactory) -> ty.Callable[[F], F]:
    """Uses your retry strategy every time an exception is raised.
    Your iterable can therefore provide different handling for each
    incrementing error, as well as configurable delays between errors,
    etc.

    If the retry_strategy iterator itself ends (or is empty to begin
    with), the function will be called one final time.
    """

    def _retry_decorator(func: F) -> F:
        @wraps(func)
        def retry_wrapper(*args, **kwargs):
            for i, is_retryable in enumerate(retry_strategy_factory(), start=1):
                try:
                    return func(*args, **kwargs)
                except Exception as ex:
                    if not is_retryable(ex):
                        raise ex
                    getLogger(__name__).info("Retry #%d for %s due to exception %s", i, func, ex)
            # one final retry that, if it fails, will not get caught and retried.
            return func(*args, **kwargs)

        return ty.cast(F, retry_wrapper)

    return _retry_decorator


def retry_regular(
    is_retryable: IsRetryable,
    intervals_factory: ty.Callable[[], ty.Iterable[ty.Any]],
) -> ty.Callable[[F], F]:
    return retry(lambda: (is_retryable for _ in intervals_factory()))


def retry_sleep(
    is_retryable: IsRetryable,
    seconds_iter: ty.Callable[[], ty.Iterable[float]],
) -> ty.Callable[[F], F]:
    """E.g. retry_sleep(expo(retries=5)) to get max 6 calls to the function."""
    return retry_regular(is_retryable, sleep(seconds_iter))


def is_exc(*exc_types: ty.Type[Exception]) -> IsRetryable:
    def _is_exc_retryable(exc: Exception) -> bool:
        return isinstance(exc, exc_types)

    return _is_exc_retryable
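A usage sketch composing the pieces above into a decorated function, following the retry_sleep docstring; the function and exception type are illustrative, not part of the package:

from thds.core.fretry import expo, is_exc, retry_sleep

@retry_sleep(is_exc(ConnectionError), expo(retries=5))
def flaky_fetch(url: str) -> bytes:
    # up to 6 total calls: the original attempt plus 5 retries, separated by jittered exponential sleeps
    ...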
thds/core/generators.py
ADDED
@@ -0,0 +1,56 @@
"""Import this module by its name, so that references to things within it are qualified by
the word 'generators', e.g. generators.sender()
"""
import contextlib
import typing as ty

T = ty.TypeVar("T")
R = ty.TypeVar("R")
GEN = ty.TypeVar("GEN", bound=ty.Generator)


class return_wrapper(contextlib.AbstractContextManager, ty.Generic[GEN, R]):
    """Allows you to wrap a generator that accepts and/or yields values,
    but this will prime the generator and also close it at the end and fetch
    its return value.

    This will be somewhat easier in 3.13 with the new `gen.close()` behavior.
    https://discuss.python.org/t/let-generator-close-return-stopiteration-value/24786
    """

    def __init__(self, gen: GEN):
        self.gen = gen

    def __enter__(self) -> GEN:
        next(self.gen)  # prime the generator
        return self.gen

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is not None:
            # TODO confirm that this is the correct behavior
            self.gen.throw(exc_type, exc_value, traceback)

        try:
            self.gen.throw(GeneratorExit)
            # equivalent to gen.close() but also gives us StopIteration.value
        except StopIteration as e:
            self._return_value = e.value

    @property
    def return_value(self) -> R:
        """Only available after the context manager has exited."""
        return self._return_value


def iterator_sender(gen: ty.Generator[ty.Any, T, R], iterator: ty.Iterable[T]) -> R:
    """This encapsulates the send/close behavior we want in general. See
    https://discuss.python.org/t/let-generator-close-return-stopiteration-value/24786
    for how a simple `gen.close()` will do this in 3.13.
    """

    gen_wrapper: return_wrapper[ty.Generator, R] = return_wrapper(gen)  # type: ignore[arg-type]
    with gen_wrapper:
        for i in iterator:
            gen.send(i)

    return gen_wrapper.return_value
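A small sketch of the pattern iterator_sender enables: a consuming generator whose return value is surfaced after everything has been sent. The summer generator below is illustrative, not part of the package:

import typing as ty

from thds.core.generators import iterator_sender

def summer() -> ty.Generator[None, int, int]:
    total = 0
    try:
        while True:
            value = yield  # receives each value passed to gen.send()
            total += value
    except GeneratorExit:
        return total  # surfaced via StopIteration.value and returned by iterator_sender

assert iterator_sender(summer(), [1, 2, 3]) == 6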
thds/core/git.py
ADDED
@@ -0,0 +1,81 @@
# some basic git utilities.
#
# All of these will error if git is not available, or if the repo is not present. The
# caller is expected to catch subprocess.CalledProcessError as well as FileNotFoundError.
import os
import subprocess as sp
import typing as ty

from . import log

LOGGER = log.getLogger(__name__)
CALGITVER_NO_SECONDS_FORMAT = "%Y%m%d.%H%M"


NO_GIT = (sp.CalledProcessError, FileNotFoundError)
# FileNotFoundError can happen if git is not installed at all.


def _simple_run(s_or_l_cmd: ty.Union[str, ty.List[str]], env=None, cwd=None) -> str:
    kwargs = dict(text=True, shell=True, env=env, cwd=cwd)
    if isinstance(s_or_l_cmd, list):
        kwargs["shell"] = False
    return sp.check_output(s_or_l_cmd, **kwargs).rstrip("\n")


def get_repo_name() -> str:
    return _simple_run("git remote get-url origin").split("/")[-1].rstrip().split(".")[0]


def get_commit_hash() -> str:
    LOGGER.debug("`get_commit` reading from Git repo.")
    return _simple_run("git rev-parse --verify HEAD")


def is_clean() -> bool:
    LOGGER.debug("`is_clean` reading from Git repo.")
    # command will show changes (staged and unstaged) in the working tree since the last commit.
    # if there are none (i.e the repo is clean), an empty string will be printed
    # https://git-scm.com/docs/git-diff#Documentation/git-diff.txt-Variouswaystocheckyourworkingtree
    return "" == _simple_run("git diff HEAD")


def get_branch() -> str:
    LOGGER.debug("`get_branch` reading from Git repo.")
    return _simple_run("git branch --show-current")


def get_commit_datetime_and_hash(
    *file_patterns: str,
    cwd: ty.Optional[str] = None,
    date_format: str = CALGITVER_NO_SECONDS_FORMAT,
) -> ty.Tuple[str, str]:
    """Useful for making a CalGitVer from a file or set of matching files.

    If no file patterns were provided, it will return the commit datetime and hash of the
    most recent commit.
    """
    assert " " not in date_format, "date_format cannot contain spaces"
    dt, hash = (
        _simple_run(
            # the space between %cd and %h allows us to split on it
            f"git log -n 1 --date=format-local:{date_format} --format=format:'%cd %H' -- "
            + " ".join(file_patterns),
            env=dict(os.environ, TZ="UTC0"),
            cwd=cwd,
        )
        .strip("'")
        .split(" ")
    )
    return dt, hash


def get_merge_base(branch1: str = "", branch2: str = "main") -> str:
    return _simple_run(f"git merge-base {branch1 or get_branch()} {branch2}")


def get_commit_datetime_str(commit_hash: str, date_format: str = CALGITVER_NO_SECONDS_FORMAT) -> str:
    return _simple_run(
        f"git log -n 1 --date=format-local:{date_format} --format=format:'%cd' {commit_hash}",
        env=dict(os.environ, TZ="UTC0"),
    )
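A usage sketch, catching the NO_GIT tuple as the module's comments suggest; the file pattern is illustrative:

from thds.core import git

try:
    when, sha = git.get_commit_datetime_and_hash("pyproject.toml")
    print(f"last commit touching that file: {when} ({sha[:8]}) on branch {git.get_branch()}")
except git.NO_GIT:
    print("git is not installed, or this is not a git checkout")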
thds/core/hash_cache.py
ADDED
@@ -0,0 +1,86 @@
"""Sometimes, you just want to cache hashes. Specifically, hashes of files.

We cache these hashes as files themselves, and the default location is under the user's
home directory.

The name of the file is an implementation detail that includes the hash of the file path,
the directory it lives in is the hashlib name of the hash algorithm, and the contents of
the file are the raw bytes of the hash. However, none of these details is guaranteed to
remain stable over time, and the only stable interface is the `hash_file` and `filehash`
functions themselves.
"""

import hashlib
import os
from pathlib import Path
from typing import Any

from . import config, files
from .hashing import Hash, hash_using
from .home import HOMEDIR
from .log import getLogger
from .types import StrOrPath

CACHE_HASH_DIR = config.item("directory", HOMEDIR() / ".hash-cache", parse=Path)
_1GB = 1 * 2**30  # log if hashing a file larger than this, since it will be slow.


logger = getLogger(__name__)


def _filecachekey(path: Path, hashtype: str) -> Path:
    # the construction of our cache key here is somewhat arbitrary,
    # and the name substring is really just for debugging purposes.
    # however, the filesize is a useful bit of additional 'entropy'
    # that will help us avoid edge cases that might arise from race
    # conditions, and the approach must remain stable over time for
    # the cache to provide a meaningful advantage.
    path_str = str(path)
    path_hash = hash_using(path_str.encode(), hashlib.sha256()).hexdigest()
    # we use a compressed (hashed) version of the path because
    # filenames can get kind of long and we don't want to deal with
    # long filenames blowing up our system by being unwritable.
    return (
        CACHE_HASH_DIR()
        / hashtype
        / (path_str[-50:].replace("/", "|") + "-" + path_hash + "+" + str(path.stat().st_size))
    )


def _is_no_older_than(file: Path, other: Path) -> bool:
    """Returns True if `file` is no older than `other`. Both files must exist."""
    return file.stat().st_mtime >= other.stat().st_mtime


def hash_file(filepath: StrOrPath, hasher: Any) -> bytes:
    """Hashes a file with the given hashlib hasher. If we've already previously computed
    the given hash for the file and the file hasn't changed (according to filesystem
    mtime) since we stored that hash, we'll just return the cached hash.

    File must exist and respond positively to stat().
    """
    resolved_path = Path(filepath).resolve()
    cached_hash_path = _filecachekey(resolved_path, hasher.name)
    # now we can check to see if we have hash bytes for that file somewhere already.
    hash_cached = "hash-cached" if cached_hash_path.exists() else ""
    if hash_cached and _is_no_older_than(cached_hash_path, resolved_path):
        logger.debug("Reusing known hash for %s - cache key %s", resolved_path, cached_hash_path)
        return cached_hash_path.read_bytes()

    psize = resolved_path.stat().st_size
    if psize > _1GB:
        log_at_lvl = logger.warning if hash_cached else logger.info
        # I want to know how often we're finding 'outdated' hashes; those should be rare.
        log_at_lvl(f"Hashing {psize/_1GB:.2f} GB file at {resolved_path}{hash_cached}")

    hash_bytes = hash_using(resolved_path, hasher).digest()
    cached_hash_path.parent.mkdir(parents=True, exist_ok=True)
    with files.atomic_binary_writer(cached_hash_path) as f:
        f.write(hash_bytes)
    return hash_bytes


def filehash(algo: str, pathlike: os.PathLike) -> Hash:
    """Wraps a cached hash of a file in a core.hashing.Hash object, which carries the name
    of the hash algorithm used."""
    return Hash(algo, hash_file(pathlike, hashlib.new(algo)))
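A sketch of the stable interface described in the module docstring (hash_file and filehash); the data file path below is illustrative and must exist for the calls to succeed:

import hashlib

from thds.core import hash_cache

h = hash_cache.filehash("sha256", "data/big-input.parquet")
raw = hash_cache.hash_file("data/big-input.parquet", hashlib.sha256())  # second call hits the cache
assert h.algo == "sha256" and h.bytes == raw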
thds/core/hashing.py
ADDED
@@ -0,0 +1,106 @@
"""
https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
I have written this code too many times to write it again. Why isn't this in the stdlib?
"""
import base64
import contextlib
import io
import os
import threading
import typing as ty
from pathlib import Path

# Python threads don't allow for significant CPU parallelism, so
# allowing for more than a few of these per process is a recipe for
# getting nothing done.
_SEMAPHORE = threading.BoundedSemaphore(int(os.getenv("THDS_CORE_HASHING_PARALLELISM", 4)))
_CHUNK_SIZE = int(os.getenv("THDS_CORE_HASHING_CHUNK_SIZE", 65536))
# https://stackoverflow.com/questions/17731660/hashlib-optimal-size-of-chunks-to-be-used-in-md5-update
# this may not apply to us as the architecture is 32 bit, but it's at
# least a halfway decent guess and benchmarking this ourselves would
# be a massive waste of time.

T = ty.TypeVar("T")
SomehowReadable = ty.Union[ty.AnyStr, ty.IO[ty.AnyStr], Path]


def hash_readable_chunks(bytes_readable: ty.IO[bytes], hasher: T) -> T:
    """Return thing you can call .digest or .hexdigest on.

    E.g.:

    hash_readable_chunks(open(Path('foo/bar'), 'rb'), hashlib.sha256()).hexdigest()
    """
    with _SEMAPHORE:
        for chunk in iter(lambda: bytes_readable.read(_CHUNK_SIZE), b""):
            hasher.update(chunk)  # type: ignore
        return hasher


@contextlib.contextmanager
def attempt_readable(thing: SomehowReadable) -> ty.Iterator[ty.IO[bytes]]:
    """Best effort: make this object a bytes-readable."""
    if hasattr(thing, "read") and hasattr(thing, "seek"):
        try:
            yield thing  # type: ignore
            return
        finally:
            thing.seek(0)  # type: ignore
    elif isinstance(thing, bytes):
        yield io.BytesIO(thing)
        return
    with open(thing, "rb") as readable:  # type: ignore
        yield readable


def hash_using(data: SomehowReadable, hasher: T) -> T:
    """This is quite dynamic - but if your data object is not readable
    bytes and is not openable as bytes, you'll get a
    FileNotFoundError, or possibly a TypeError or other gremlin.

    Therefore, you may pass whatever you want unless it's an actual
    string - if you want your actual string hashed, you should encode
    it as actual bytes first.
    """
    with attempt_readable(data) as readable:
        return hash_readable_chunks(readable, hasher)


def hash_anything(data: SomehowReadable, hasher: T) -> ty.Optional[T]:
    try:
        return hash_using(data, hasher)
    except (FileNotFoundError, TypeError):
        # it's unlikely we can operate on this data?
        return None


def b64(digest: bytes) -> str:
    """The string representation commonly used by Azure utilities.

    We use it in cases where we want to represent the same hash that
    ADLS will have in UTF-8 string (instead of bytes) format.
    """
    return base64.b64encode(digest).decode()


def db64(s: str) -> bytes:
    """Shorthand for the inverse of b64."""
    return base64.b64decode(s)


def _repr_bytes(bs: bytes) -> str:
    return f"db64('{b64(bs)}')"


class Hash(ty.NamedTuple):
    """Algorithm name needs to match something supported by hashlib.

    A good choice would be sha256. Use md5 if you have to.
    """

    algo: str
    # valid algorithm names listed here: https://docs.python.org/3/library/hashlib.html#constructors
    bytes: bytes

    def __repr__(self) -> str:
        return f"Hash(algo='{self.algo}', bytes={_repr_bytes(self.bytes)})"
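A usage sketch for hash_using, b64, and Hash; per the docstring above, strings must be encoded to bytes first, and the file path is illustrative:

import hashlib
from pathlib import Path

from thds.core.hashing import Hash, b64, hash_using

hexdigest = hash_using(b"hello world", hashlib.sha256()).hexdigest()  # bytes are hashed in chunks
file_hash = Hash("md5", hash_using(Path("README.md"), hashlib.md5()).digest())  # Paths are opened as binary
print(hexdigest, b64(file_hash.bytes))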
thds/core/home.py
ADDED
@@ -0,0 +1,15 @@
import os
from pathlib import Path

from .config import item

# On our GitHub runners, we can't make hardlinks from /runner/home to where our stuff actually goes.
# This allows us to use 'a' home directory that is on the same filesystem.
_RUNNER_WORK = Path("/runner/_work")
if os.getenv("CI") and _RUNNER_WORK.exists() and _RUNNER_WORK.is_dir():
    __home = _RUNNER_WORK
else:
    __home = Path.home()


HOMEDIR = item("thds.core.homedir", parse=Path, default=__home)
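HOMEDIR is a config item; calling it yields the effective Path, as hash_cache.py above does. A brief sketch with an illustrative subdirectory name:

from thds.core.home import HOMEDIR

cache_root = HOMEDIR() / ".my-tool-cache"  # resolves under the runner work dir on CI, else the user's home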
thds/core/hostname.py
ADDED
thds/core/imports.py
ADDED
@@ -0,0 +1,17 @@
from importlib import import_module
from importlib.resources import Package

from .meta import get_base_package


def try_imports(*modules: str, module: Package = "", extra: str = "") -> None:
    try:
        for m in modules:
            import_module(m)
    except ImportError:
        if extra and module:
            raise ImportError(
                f"Install the '{extra}' extra for `{get_base_package(module)}` to use `{module}`."
            )
        else:
            raise ImportError(f"Install {list(modules)}{f' to use `{module}`.' if module else ''}")
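A sketch of guarding an optional dependency with try_imports; the module names and the extra below are illustrative, not part of this package:

from thds.core.imports import try_imports

# raises a helpful ImportError naming the 'analysis' extra if pandas is missing
try_imports("pandas", module="mypkg.analysis", extra="analysis")
import pandas  # only reached when the optional dependency is installed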
thds/core/inspect.py
ADDED
@@ -0,0 +1,58 @@
import inspect

import attrs


@attrs.frozen
class CallerInfo:
    module: str = ""
    klass: str = ""
    caller: str = ""
    line: int = 0


def get_caller_info(skip: int = 2) -> CallerInfo:
    # Credit: https://gist.github.com/lee-pai-long/d3004225e1847b84acb4fbba0c2aea91
    # I have made some small modifications to the code
    """Get the name of a caller in the format module.class.method.
    Copied from: https://gist.github.com/techtonik/2151727
    :arguments:
        - skip (integer): Specifies how many levels of stack
          to skip while getting caller name.
          skip=1 means "who calls me",
          skip=2 "who calls my caller" etc.
    :returns:
        - module (string): full dotted name of caller module.
        - klass (string): caller classname if one otherwise None.
        - caller (string): caller function or method (if a class exist).
        - line (int): the line of the call.
        - An empty string is returned if skipped levels exceed stack height.
    """
    stack = inspect.stack()
    start = 0 + skip
    if len(stack) < start + 1:
        raise RuntimeError(f"The stack has less than f{skip} + 1 frames in it.")
    parentframe = stack[start][0]

    # full dotted name of caller module
    module_info = inspect.getmodule(parentframe)
    module = module_info.__name__ if module_info else ""

    # class name
    klass = ""
    if "self" in parentframe.f_locals:
        klass = parentframe.f_locals["self"].__class__.__name__

    # method or function name
    caller = ""
    if parentframe.f_code.co_name != "<module>":  # top level usually
        caller = parentframe.f_code.co_name

    # call line
    line = parentframe.f_lineno

    # Remove reference to frame
    # See: https://docs.python.org/3/library/inspect.html#the-interpreter-stack
    del parentframe

    return CallerInfo(module=module, klass=klass, caller=caller, line=line)
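A sketch of the default skip=2 behavior: a helper reports the frame that called it. The functions below are illustrative, not part of the package:

from thds.core.inspect import get_caller_info

def who_called_me() -> str:
    info = get_caller_info()  # skip=2: the frame that called who_called_me
    return f"{info.module}.{info.caller} (line {info.line})"

def some_function() -> None:
    print(who_called_me())  # reports some_function as the caller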