thds.core 0.0.1-py3-none-any.whl → 1.31.20250123022540-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.core might be problematic.

Files changed (70)
  1. thds/core/__init__.py +48 -0
  2. thds/core/ansi_esc.py +46 -0
  3. thds/core/cache.py +201 -0
  4. thds/core/calgitver.py +82 -0
  5. thds/core/concurrency.py +100 -0
  6. thds/core/config.py +250 -0
  7. thds/core/decos.py +55 -0
  8. thds/core/dict_utils.py +188 -0
  9. thds/core/env.py +40 -0
  10. thds/core/exit_after.py +121 -0
  11. thds/core/files.py +125 -0
  12. thds/core/fretry.py +115 -0
  13. thds/core/generators.py +56 -0
  14. thds/core/git.py +81 -0
  15. thds/core/hash_cache.py +86 -0
  16. thds/core/hashing.py +106 -0
  17. thds/core/home.py +15 -0
  18. thds/core/hostname.py +10 -0
  19. thds/core/imports.py +17 -0
  20. thds/core/inspect.py +58 -0
  21. thds/core/iterators.py +9 -0
  22. thds/core/lazy.py +83 -0
  23. thds/core/link.py +153 -0
  24. thds/core/log/__init__.py +29 -0
  25. thds/core/log/basic_config.py +171 -0
  26. thds/core/log/json_formatter.py +43 -0
  27. thds/core/log/kw_formatter.py +84 -0
  28. thds/core/log/kw_logger.py +93 -0
  29. thds/core/log/logfmt.py +302 -0
  30. thds/core/merge_args.py +168 -0
  31. thds/core/meta.json +8 -0
  32. thds/core/meta.py +518 -0
  33. thds/core/parallel.py +200 -0
  34. thds/core/pickle_visit.py +24 -0
  35. thds/core/prof.py +276 -0
  36. thds/core/progress.py +112 -0
  37. thds/core/protocols.py +17 -0
  38. thds/core/py.typed +0 -0
  39. thds/core/scaling.py +39 -0
  40. thds/core/scope.py +199 -0
  41. thds/core/source.py +238 -0
  42. thds/core/source_serde.py +104 -0
  43. thds/core/sqlite/__init__.py +21 -0
  44. thds/core/sqlite/connect.py +33 -0
  45. thds/core/sqlite/copy.py +35 -0
  46. thds/core/sqlite/ddl.py +4 -0
  47. thds/core/sqlite/functions.py +63 -0
  48. thds/core/sqlite/index.py +22 -0
  49. thds/core/sqlite/insert_utils.py +23 -0
  50. thds/core/sqlite/merge.py +84 -0
  51. thds/core/sqlite/meta.py +190 -0
  52. thds/core/sqlite/read.py +66 -0
  53. thds/core/sqlite/sqlmap.py +179 -0
  54. thds/core/sqlite/structured.py +138 -0
  55. thds/core/sqlite/types.py +64 -0
  56. thds/core/sqlite/upsert.py +139 -0
  57. thds/core/sqlite/write.py +99 -0
  58. thds/core/stack_context.py +41 -0
  59. thds/core/thunks.py +40 -0
  60. thds/core/timer.py +214 -0
  61. thds/core/tmp.py +85 -0
  62. thds/core/types.py +4 -0
  63. thds.core-1.31.20250123022540.dist-info/METADATA +68 -0
  64. thds.core-1.31.20250123022540.dist-info/RECORD +67 -0
  65. {thds.core-0.0.1.dist-info → thds.core-1.31.20250123022540.dist-info}/WHEEL +1 -1
  66. thds.core-1.31.20250123022540.dist-info/entry_points.txt +4 -0
  67. thds.core-1.31.20250123022540.dist-info/top_level.txt +1 -0
  68. thds.core-0.0.1.dist-info/METADATA +0 -8
  69. thds.core-0.0.1.dist-info/RECORD +0 -4
  70. thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/files.py ADDED
@@ -0,0 +1,125 @@
+ """Various assorted file-related utilities."""
+
+ import hashlib
+ import os
+ import resource
+ import shutil
+ import stat
+ import typing as ty
+ from contextlib import contextmanager
+ from io import BufferedWriter, TextIOWrapper
+ from pathlib import Path
+
+ from . import config, hashing
+ from .log import getLogger
+ from .tmp import temppath_same_fs
+ from .types import StrOrPath
+
+ FILE_SCHEME = "file://"
+ logger = getLogger(__name__)
+
+
+ def set_read_only(fpath: StrOrPath):
+     # thank you https://stackoverflow.com/a/51262451
+     logger.debug("Setting '%s' to read-only", fpath)
+     perms = stat.S_IMODE(os.lstat(fpath).st_mode)
+     ro_mask = 0o777 ^ (stat.S_IWRITE | stat.S_IWGRP | stat.S_IWOTH)
+     os.chmod(fpath, perms & ro_mask)
+
+
+ def remove_file_scheme(uri: str) -> str:
+     """Does not require the file scheme to exist, but removes it if it's there."""
+     return uri[len(FILE_SCHEME) :] if uri.startswith(FILE_SCHEME) else uri
+
+
+ def path_from_uri(uri: str) -> Path:
+     str_path = remove_file_scheme(uri)
+     if not str_path:
+         raise ValueError('Cannot convert an empty string to a Path. Did you mean to use "."?')
+     return Path(str_path)
+
+
+ def to_uri(path: Path) -> str:
+     return FILE_SCHEME + os.fspath(path.resolve())
+
+
+ def is_file_uri(uri: str) -> bool:
+     return uri.startswith(FILE_SCHEME)
+
+
+ @contextmanager
+ def atomic_write_path(destination: StrOrPath) -> ty.Iterator[Path]:
+     """Shorthand context manager for doing an atomic write (i.e., write to a temporary file,
+     then atomically move that temporary file to your final destination).
+
+     You must open and then close the file within the provided context. Unclosed files
+     will likely result in data loss or other bugs.
+     """
+     destpath = path_from_uri(destination) if isinstance(destination, str) else Path(destination)
+     with temppath_same_fs(destpath) as temp_writable_path:
+         yield temp_writable_path
+         destpath.parent.mkdir(parents=True, exist_ok=True)
+         shutil.move(str(temp_writable_path), destpath)
+
+
+ @contextmanager
+ def atomic_binary_writer(destination: StrOrPath) -> ty.Iterator[BufferedWriter]:
+     """Even shorter shorthand for writing binary data to a file, atomically."""
+     with atomic_write_path(destination) as temp_writable_path:
+         with open(temp_writable_path, "wb") as f:
+             yield f
+
+
+ @contextmanager
+ def atomic_text_writer(destination: StrOrPath) -> ty.Iterator[TextIOWrapper]:
+     """Even shorter shorthand for writing text data to a file, atomically."""
+     with atomic_write_path(destination) as temp_writable_path:
+         with open(temp_writable_path, "w") as f:
+             yield f
+
+
+ OPEN_FILES_LIMIT = config.item("limit_open", 10000)
+
+
+ def set_file_limit(n: int):
+     """Works like calling `ulimit -Sn <N>` on a Mac."""
+     resource.setrlimit(resource.RLIMIT_NOFILE, (n, n))
+     assert resource.getrlimit(resource.RLIMIT_NOFILE) == (n, n)
+
+
+ def bump_limits():
+     """It was common to have to do this manually on our macs. Now that is no longer required."""
+     set_file_limit(OPEN_FILES_LIMIT())
+
+
+ def shorten_filename(maybe_too_long_name: StrOrPath, max_len: int = 255, retain_last: int = 30) -> str:
+     """Shortens a filename, using a deterministic and probabilistically-unique (hash-based) algorithm.
+
+     The filename is only changed if it exceeds the provided max_len limit.
+
+     The limit defaults to 255 _bytes_ since that is what many filesystems have
+     generally supported. https://en.wikipedia.org/wiki/Comparison_of_file_systems#Limits
+
+     We intentionally take our 'bite' out of the middle of the filename, so that the file extension
+     is preserved and the first part of the name also remains human-readable.
+     """
+     # p for Path, s for str, b for bytes - too many things flying around to keep track of without this.
+     s_maybe_too_long_name = Path(maybe_too_long_name).name
+     b_filename = s_maybe_too_long_name.encode()
+
+     if len(b_filename) <= max_len:
+         # no need to mess with anything - it will 'fit' inside the root path already.
+         return s_maybe_too_long_name
+
+     b_md5_of_filename = (
+         b"-md5-" + hashing.hash_using(b_filename, hashlib.md5()).hexdigest().encode() + b"-"
+     )
+     b_last_n = b_filename[-retain_last:]
+     b_first_n = b_filename[: max_len - len(b_md5_of_filename) - len(b_last_n)]
+     b_modified_filename = b_first_n + b_md5_of_filename + b_last_n
+     assert len(b_modified_filename) <= max_len, (
+         b_modified_filename,
+         len(b_modified_filename),
+         max_len,
+     )
+     return b_modified_filename.decode()
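
Usage sketch (illustrative, not part of the diff): the atomic writers above stage output in a temporary file on the same filesystem and move it into place only when the context exits cleanly; the path and content below are made up.

    from thds.core import files

    # the destination only appears once the context exits, so readers
    # never observe a partially written file
    with files.atomic_text_writer("out/report.txt") as f:
        f.write("staged in a temp file, then moved into place atomically\n")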
thds/core/fretry.py ADDED
@@ -0,0 +1,115 @@
+ """A more composable retry decorator."""
+
+ import random
+ import time
+ import typing as ty
+ from functools import wraps
+ from logging import getLogger
+ from timeit import default_timer
+
+ F = ty.TypeVar("F", bound=ty.Callable)
+
+
+ IsRetryable = ty.Callable[[Exception], bool]
+ RetryStrategy = ty.Iterable[IsRetryable]
+ RetryStrategyFactory = ty.Callable[[], RetryStrategy]
+
+
+ def expo(
+     *, retries: int, delay: float = 1.0, backoff: int = 2, jitter: bool = True
+ ) -> ty.Callable[[], ty.Iterator[float]]:
+     """End iteration after yielding 'retries' times.
+
+     The first retry is immediate (i.e. 0). Subsequent retries will follow the schedule
+     established by the exponential backoff algorithm. The default schedule is 1, 3, 7,
+     15, etc., but also adds jitter.
+
+     If you want infinite exponential values, pass a negative number for 'retries'.
+     """
+
+     def expo_() -> ty.Iterator[float]:
+         count = 0
+         accum_jitter = 0.0
+         while retries < 0 or count < retries:
+             expo_delay = (backoff**count * delay) - delay  # first retry is immediate
+             if jitter:
+                 jitter_delay = random.uniform(0.5, 1.5) * expo_delay
+                 yield jitter_delay + accum_jitter
+                 accum_jitter = expo_delay - jitter_delay
+             else:
+                 yield expo_delay
+             count += 1
+
+     return expo_
+
+
+ def sleep(
+     mk_seconds_iter: ty.Callable[[], ty.Iterable[float]],
+     sleeper: ty.Callable[[float], ty.Any] = time.sleep,
+ ) -> ty.Callable[[], ty.Iterator[str]]:
+     """A common base strategy for separating retries by sleeps.
+
+     Yields once before each sleep, so the total number of yields is the length of the
+     input iterable (if it is finite).
+     """
+
+     def sleep_() -> ty.Iterator[str]:
+         start = default_timer()
+
+         so_far = 0.0
+         for i, secs in enumerate(mk_seconds_iter(), start=1):
+             yield f"attempt {i} after {so_far:.2f}s"
+             so_far = default_timer() - start
+             sleeper(secs)
+
+     return sleep_
+
+
+ def retry(retry_strategy_factory: RetryStrategyFactory) -> ty.Callable[[F], F]:
+     """Uses your retry strategy every time an exception is raised.
+
+     Your iterable can therefore provide different handling for each
+     incrementing error, as well as configurable delays between errors,
+     etc.
+
+     If the retry_strategy iterator itself ends (or is empty to begin
+     with), the function will be called one final time.
+     """
+
+     def _retry_decorator(func: F) -> F:
+         @wraps(func)
+         def retry_wrapper(*args, **kwargs):
+             for i, is_retryable in enumerate(retry_strategy_factory(), start=1):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as ex:
+                     if not is_retryable(ex):
+                         raise ex
+                     getLogger(__name__).info("Retry #%d for %s due to exception %s", i, func, ex)
+             # one final attempt that, if it fails, will not get caught and retried.
+             return func(*args, **kwargs)
+
+         return ty.cast(F, retry_wrapper)
+
+     return _retry_decorator
+
+
+ def retry_regular(
+     is_retryable: IsRetryable,
+     intervals_factory: ty.Callable[[], ty.Iterable[ty.Any]],
+ ) -> ty.Callable[[F], F]:
+     return retry(lambda: (is_retryable for _ in intervals_factory()))
+
+
+ def retry_sleep(
+     is_retryable: IsRetryable,
+     seconds_iter: ty.Callable[[], ty.Iterable[float]],
+ ) -> ty.Callable[[F], F]:
+     """E.g. retry_sleep(is_retryable, expo(retries=5)) to get max 6 calls to the function."""
+     return retry_regular(is_retryable, sleep(seconds_iter))
+
+
+ def is_exc(*exc_types: ty.Type[Exception]) -> IsRetryable:
+     def _is_exc_retryable(exc: Exception) -> bool:
+         return isinstance(exc, exc_types)
+
+     return _is_exc_retryable
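
Usage sketch (illustrative): the pieces above compose into a retry decorator; `ConnectionError` stands in for whatever exception is actually worth retrying in real code.

    from thds.core import fretry

    @fretry.retry_sleep(fretry.is_exc(ConnectionError), fretry.expo(retries=5))
    def fetch() -> bytes:
        ...  # at most 6 total calls, separated by jittered exponential sleeps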
thds/core/generators.py ADDED
@@ -0,0 +1,56 @@
+ """Import this module by its name, so that references to things within it are qualified by
+ the word 'generators', e.g. generators.sender()
+ """
+ import contextlib
+ import typing as ty
+
+ T = ty.TypeVar("T")
+ R = ty.TypeVar("R")
+ GEN = ty.TypeVar("GEN", bound=ty.Generator)
+
+
+ class return_wrapper(contextlib.AbstractContextManager, ty.Generic[GEN, R]):
+     """Allows you to wrap a generator that accepts and/or yields values;
+     this will prime the generator, and also close it at the end and fetch
+     its return value.
+
+     This will be somewhat easier in 3.13 with the new `gen.close()` behavior.
+     https://discuss.python.org/t/let-generator-close-return-stopiteration-value/24786
+     """
+
+     def __init__(self, gen: GEN):
+         self.gen = gen
+
+     def __enter__(self) -> GEN:
+         next(self.gen)  # prime the generator
+         return self.gen
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         if exc_type is not None:
+             # TODO confirm that this is the correct behavior
+             self.gen.throw(exc_type, exc_value, traceback)
+
+         try:
+             self.gen.throw(GeneratorExit)
+             # equivalent to gen.close() but also gives us StopIteration.value
+         except StopIteration as e:
+             self._return_value = e.value
+
+     @property
+     def return_value(self) -> R:
+         """Only available after the context manager has exited."""
+         return self._return_value
+
+
+ def iterator_sender(gen: ty.Generator[ty.Any, T, R], iterator: ty.Iterable[T]) -> R:
+     """This encapsulates the send/close behavior we want in general. See
+     https://discuss.python.org/t/let-generator-close-return-stopiteration-value/24786
+     for how a simple `gen.close()` will do this in 3.13.
+     """
+
+     gen_wrapper: return_wrapper[ty.Generator, R] = return_wrapper(gen)  # type: ignore[arg-type]
+     with gen_wrapper:
+         for i in iterator:
+             gen.send(i)
+
+     return gen_wrapper.return_value
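
Usage sketch (illustrative): `iterator_sender` primes a value-accepting generator, feeds it an iterable, and recovers the generator's return value; the `summing` generator below is made up for the example.

    import typing as ty
    from thds.core import generators

    def summing() -> ty.Generator[None, int, int]:
        total = 0
        try:
            while True:
                total += yield  # receives each sent value
        except GeneratorExit:
            return total  # surfaced via StopIteration.value on close

    assert generators.iterator_sender(summing(), [1, 2, 3]) == 6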
thds/core/git.py ADDED
@@ -0,0 +1,81 @@
+ # some basic git utilities.
+ #
+ # All of these will error if git is not available, or if the repo is not present. The
+ # caller is expected to catch subprocess.CalledProcessError as well as FileNotFoundError.
+ import os
+ import subprocess as sp
+ import typing as ty
+
+ from . import log
+
+ LOGGER = log.getLogger(__name__)
+ CALGITVER_NO_SECONDS_FORMAT = "%Y%m%d.%H%M"
+
+
+ NO_GIT = (sp.CalledProcessError, FileNotFoundError)
+ # FileNotFoundError can happen if git is not installed at all.
+
+
+ def _simple_run(s_or_l_cmd: ty.Union[str, ty.List[str]], env=None, cwd=None) -> str:
+     kwargs = dict(text=True, shell=True, env=env, cwd=cwd)
+     if isinstance(s_or_l_cmd, list):
+         kwargs["shell"] = False
+     return sp.check_output(s_or_l_cmd, **kwargs).rstrip("\n")
+
+
+ def get_repo_name() -> str:
+     return _simple_run("git remote get-url origin").split("/")[-1].rstrip().split(".")[0]
+
+
+ def get_commit_hash() -> str:
+     LOGGER.debug("`get_commit` reading from Git repo.")
+     return _simple_run("git rev-parse --verify HEAD")
+
+
+ def is_clean() -> bool:
+     LOGGER.debug("`is_clean` reading from Git repo.")
+     # this command shows changes (staged and unstaged) in the working tree since the last commit;
+     # if there are none (i.e. the repo is clean), an empty string is printed.
+     # https://git-scm.com/docs/git-diff#Documentation/git-diff.txt-Variouswaystocheckyourworkingtree
+     return "" == _simple_run("git diff HEAD")
+
+
+ def get_branch() -> str:
+     LOGGER.debug("`get_branch` reading from Git repo.")
+     return _simple_run("git branch --show-current")
+
+
+ def get_commit_datetime_and_hash(
+     *file_patterns: str,
+     cwd: ty.Optional[str] = None,
+     date_format: str = CALGITVER_NO_SECONDS_FORMAT,
+ ) -> ty.Tuple[str, str]:
+     """Useful for making a CalGitVer from a file or set of matching files.
+
+     If no file patterns are provided, returns the commit datetime and hash of the
+     most recent commit.
+     """
+     assert " " not in date_format, "date_format cannot contain spaces"
+     dt, hash = (
+         _simple_run(
+             # the space between %cd and %H allows us to split on it
+             f"git log -n 1 --date=format-local:{date_format} --format=format:'%cd %H' -- "
+             + " ".join(file_patterns),
+             env=dict(os.environ, TZ="UTC0"),
+             cwd=cwd,
+         )
+         .strip("'")
+         .split(" ")
+     )
+     return dt, hash
+
+
+ def get_merge_base(branch1: str = "", branch2: str = "main") -> str:
+     return _simple_run(f"git merge-base {branch1 or get_branch()} {branch2}")
+
+
+ def get_commit_datetime_str(commit_hash: str, date_format: str = CALGITVER_NO_SECONDS_FORMAT) -> str:
+     return _simple_run(
+         f"git log -n 1 --date=format-local:{date_format} --format=format:'%cd' {commit_hash}",
+         env=dict(os.environ, TZ="UTC0"),
+     )
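
Usage sketch (illustrative): every helper shells out to git, so callers are expected to catch `NO_GIT`; the file pattern below is made up.

    from thds.core import git

    try:
        dt, commit = git.get_commit_datetime_and_hash("pyproject.toml")
        print(f"{dt}-{commit[:8]}")  # a CalGitVer-style identifier
    except git.NO_GIT:
        print("git unavailable or not inside a repository")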
thds/core/hash_cache.py ADDED
@@ -0,0 +1,86 @@
+ """Sometimes, you just want to cache hashes. Specifically, hashes of files.
+
+ We cache these hashes as files themselves, and the default location is under the user's
+ home directory.
+
+ The name of each cache file is an implementation detail that includes the hash of the file path;
+ the directory it lives in is the hashlib name of the hash algorithm; and the contents of
+ the file are the raw bytes of the hash. However, none of these details is guaranteed to
+ remain stable over time, and the only stable interface is the `hash_file` and `filehash`
+ functions themselves.
+ """
+
+ import hashlib
+ import os
+ from pathlib import Path
+ from typing import Any
+
+ from . import config, files
+ from .hashing import Hash, hash_using
+ from .home import HOMEDIR
+ from .log import getLogger
+ from .types import StrOrPath
+
+ CACHE_HASH_DIR = config.item("directory", HOMEDIR() / ".hash-cache", parse=Path)
+ _1GB = 1 * 2**30  # log if hashing a file larger than this, since it will be slow.
+
+
+ logger = getLogger(__name__)
+
+
+ def _filecachekey(path: Path, hashtype: str) -> Path:
+     # the construction of our cache key here is somewhat arbitrary,
+     # and the name substring is really just for debugging purposes.
+     # however, the filesize is a useful bit of additional 'entropy'
+     # that will help us avoid edge cases that might arise from race
+     # conditions, and the approach must remain stable over time for
+     # the cache to provide a meaningful advantage.
+     path_str = str(path)
+     path_hash = hash_using(path_str.encode(), hashlib.sha256()).hexdigest()
+     # we use a compressed (hashed) version of the path because
+     # filenames can get kind of long and we don't want to deal with
+     # long filenames blowing up our system by being unwritable.
+     return (
+         CACHE_HASH_DIR()
+         / hashtype
+         / (path_str[-50:].replace("/", "|") + "-" + path_hash + "+" + str(path.stat().st_size))
+     )
+
+
+ def _is_no_older_than(file: Path, other: Path) -> bool:
+     """Returns True if `file` is no older than `other`. Both files must exist."""
+     return file.stat().st_mtime >= other.stat().st_mtime
+
+
+ def hash_file(filepath: StrOrPath, hasher: Any) -> bytes:
+     """Hashes a file with the given hashlib hasher. If we've already previously computed
+     the given hash for the file and the file hasn't changed (according to filesystem
+     mtime) since we stored that hash, we'll just return the cached hash.
+
+     File must exist and respond positively to stat().
+     """
+     resolved_path = Path(filepath).resolve()
+     cached_hash_path = _filecachekey(resolved_path, hasher.name)
+     # now we can check to see if we have hash bytes for that file somewhere already.
+     hash_cached = "hash-cached" if cached_hash_path.exists() else ""
+     if hash_cached and _is_no_older_than(cached_hash_path, resolved_path):
+         logger.debug("Reusing known hash for %s - cache key %s", resolved_path, cached_hash_path)
+         return cached_hash_path.read_bytes()
+
+     psize = resolved_path.stat().st_size
+     if psize > _1GB:
+         log_at_lvl = logger.warning if hash_cached else logger.info
+         # I want to know how often we're finding 'outdated' hashes; those should be rare.
+         log_at_lvl(f"Hashing {psize/_1GB:.2f} GB file at {resolved_path}{hash_cached}")
+
+     hash_bytes = hash_using(resolved_path, hasher).digest()
+     cached_hash_path.parent.mkdir(parents=True, exist_ok=True)
+     with files.atomic_binary_writer(cached_hash_path) as f:
+         f.write(hash_bytes)
+     return hash_bytes
+
+
+ def filehash(algo: str, pathlike: os.PathLike) -> Hash:
+     """Wraps a cached hash of a file in a core.hashing.Hash object, which carries the name
+     of the hash algorithm used."""
+     return Hash(algo, hash_file(pathlike, hashlib.new(algo)))
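
Usage sketch (illustrative): the first call hashes the file and stores the digest under the cache directory; later calls with an unchanged mtime reuse the stored bytes. The path below is made up.

    from pathlib import Path
    from thds.core import hash_cache

    h = hash_cache.filehash("sha256", Path("data/large-input.parquet"))
    print(h.algo, h.bytes.hex())  # Hash carries the algorithm name and raw digest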
thds/core/hashing.py ADDED
@@ -0,0 +1,106 @@
+ """
+ https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
+ I have written this code too many times to write it again. Why isn't this in the stdlib?
+ """
+ import base64
+ import contextlib
+ import io
+ import os
+ import threading
+ import typing as ty
+ from pathlib import Path
+
+ # Python threads don't allow for significant CPU parallelism, so
+ # allowing for more than a few of these per process is a recipe for
+ # getting nothing done.
+ _SEMAPHORE = threading.BoundedSemaphore(int(os.getenv("THDS_CORE_HASHING_PARALLELISM", 4)))
+ _CHUNK_SIZE = int(os.getenv("THDS_CORE_HASHING_CHUNK_SIZE", 65536))
+ # https://stackoverflow.com/questions/17731660/hashlib-optimal-size-of-chunks-to-be-used-in-md5-update
+ # this may not apply to us as the architecture is 32 bit, but it's at
+ # least a halfway decent guess and benchmarking this ourselves would
+ # be a massive waste of time.
+
+ T = ty.TypeVar("T")
+ SomehowReadable = ty.Union[ty.AnyStr, ty.IO[ty.AnyStr], Path]
+
+
+ def hash_readable_chunks(bytes_readable: ty.IO[bytes], hasher: T) -> T:
+     """Return thing you can call .digest or .hexdigest on.
+
+     E.g.:
+
+     hash_readable_chunks(open(Path('foo/bar'), 'rb'), hashlib.sha256()).hexdigest()
+     """
+     with _SEMAPHORE:
+         for chunk in iter(lambda: bytes_readable.read(_CHUNK_SIZE), b""):
+             hasher.update(chunk)  # type: ignore
+         return hasher
+
+
+ @contextlib.contextmanager
+ def attempt_readable(thing: SomehowReadable) -> ty.Iterator[ty.IO[bytes]]:
+     """Best effort: make this object a bytes-readable."""
+     if hasattr(thing, "read") and hasattr(thing, "seek"):
+         try:
+             yield thing  # type: ignore
+             return
+         finally:
+             thing.seek(0)  # type: ignore
+     elif isinstance(thing, bytes):
+         yield io.BytesIO(thing)
+         return
+     with open(thing, "rb") as readable:  # type: ignore
+         yield readable
+
+
+ def hash_using(data: SomehowReadable, hasher: T) -> T:
+     """This is quite dynamic - but if your data object is not readable
+     bytes and is not openable as bytes, you'll get a
+     FileNotFoundError, or possibly a TypeError or other gremlin.
+
+     Therefore, you may pass whatever you want unless it's an actual
+     string - if you want your actual string hashed, you should encode
+     it as actual bytes first.
+     """
+     with attempt_readable(data) as readable:
+         return hash_readable_chunks(readable, hasher)
+
+
+ def hash_anything(data: SomehowReadable, hasher: T) -> ty.Optional[T]:
+     try:
+         return hash_using(data, hasher)
+     except (FileNotFoundError, TypeError):
+         # it's unlikely we can operate on this data?
+         return None
+
+
+ def b64(digest: bytes) -> str:
+     """The string representation commonly used by Azure utilities.
+
+     We use it in cases where we want to represent the same hash that
+     ADLS will have in UTF-8 string (instead of bytes) format.
+     """
+     return base64.b64encode(digest).decode()
+
+
+ def db64(s: str) -> bytes:
+     """Shorthand for the inverse of b64."""
+     return base64.b64decode(s)
+
+
+ def _repr_bytes(bs: bytes) -> str:
+     return f"db64('{b64(bs)}')"
+
+
+ class Hash(ty.NamedTuple):
+     """Algorithm name needs to match something supported by hashlib.
+
+     A good choice would be sha256. Use md5 if you have to.
+     """
+
+     algo: str
+     # valid algorithm names listed here: https://docs.python.org/3/library/hashlib.html#constructors
+     bytes: bytes
+
+     def __repr__(self) -> str:
+         return f"Hash(algo='{self.algo}', bytes={_repr_bytes(self.bytes)})"
thds/core/home.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ from pathlib import Path
+
+ from .config import item
+
+ # On our GitHub runners, we can't make hardlinks from /runner/home to where our stuff actually goes.
+ # This allows us to use 'a' home directory that is on the same filesystem.
+ _RUNNER_WORK = Path("/runner/_work")
+ if os.getenv("CI") and _RUNNER_WORK.exists() and _RUNNER_WORK.is_dir():
+     __home = _RUNNER_WORK
+ else:
+     __home = Path.home()
+
+
+ HOMEDIR = item("thds.core.homedir", parse=Path, default=__home)
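
Usage sketch (illustrative): `HOMEDIR` is a `config.item`, so calling it yields the current value, which respects the CI-runner override above; the subdirectory name is made up.

    from thds.core.home import HOMEDIR

    cache_root = HOMEDIR() / ".my-cache"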
thds/core/hostname.py ADDED
@@ -0,0 +1,10 @@
+ import socket
+
+
+ def friendly() -> str:
+     hn = socket.gethostname()
+     if hn.endswith(".local"):
+         hn = hn[: -len(".local")]
+     if hn.startswith("MBP-"):
+         hn = hn[len("MBP-") :]
+     return hn
thds/core/imports.py ADDED
@@ -0,0 +1,17 @@
+ from importlib import import_module
+ from importlib.resources import Package
+
+ from .meta import get_base_package
+
+
+ def try_imports(*modules: str, module: Package = "", extra: str = "") -> None:
+     try:
+         for m in modules:
+             import_module(m)
+     except ImportError:
+         if extra and module:
+             raise ImportError(
+                 f"Install the '{extra}' extra for `{get_base_package(module)}` to use `{module}`."
+             )
+         else:
+             raise ImportError(f"Install {list(modules)}{f' to use `{module}`.' if module else ''}")
thds/core/inspect.py ADDED
@@ -0,0 +1,58 @@
+ import inspect
+
+ import attrs
+
+
+ @attrs.frozen
+ class CallerInfo:
+     module: str = ""
+     klass: str = ""
+     caller: str = ""
+     line: int = 0
+
+
+ def get_caller_info(skip: int = 2) -> CallerInfo:
+     # Credit: https://gist.github.com/lee-pai-long/d3004225e1847b84acb4fbba0c2aea91
+     # I have made some small modifications to the code
+     """Get the name of a caller in the format module.class.method.
+     Copied from: https://gist.github.com/techtonik/2151727
+     :arguments:
+         - skip (integer): Specifies how many levels of stack
+           to skip while getting caller name.
+           skip=1 means "who calls me",
+           skip=2 "who calls my caller" etc.
+     :returns:
+         - module (string): full dotted name of caller module.
+         - klass (string): caller classname if one, otherwise None.
+         - caller (string): caller function or method (if a class exists).
+         - line (int): the line of the call.
+         - An empty string is returned if skipped levels exceed stack height.
+     """
+     stack = inspect.stack()
+     start = 0 + skip
+     if len(stack) < start + 1:
+         raise RuntimeError(f"The stack has fewer than {skip} + 1 frames in it.")
+     parentframe = stack[start][0]
+
+     # full dotted name of caller module
+     module_info = inspect.getmodule(parentframe)
+     module = module_info.__name__ if module_info else ""
+
+     # class name
+     klass = ""
+     if "self" in parentframe.f_locals:
+         klass = parentframe.f_locals["self"].__class__.__name__
+
+     # method or function name
+     caller = ""
+     if parentframe.f_code.co_name != "<module>":  # top level usually
+         caller = parentframe.f_code.co_name
+
+     # call line
+     line = parentframe.f_lineno
+
+     # Remove reference to frame
+     # See: https://docs.python.org/3/library/inspect.html#the-interpreter-stack
+     del parentframe
+
+     return CallerInfo(module=module, klass=klass, caller=caller, line=line)
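
Usage sketch (illustrative): with the default `skip=2`, a helper function reports on the frame that invoked it; the helper below is made up.

    from thds.core.inspect import get_caller_info

    def describe_my_caller() -> str:
        info = get_caller_info()  # skip=2 -> the frame that called this helper
        return f"{info.module}:{info.line} in {info.caller or '<module>'}"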
thds/core/iterators.py ADDED
@@ -0,0 +1,9 @@
+ import typing as ty
+
+ T = ty.TypeVar("T")
+
+
+ def null_safe_iter(it: ty.Optional[ty.Iterable[T]]) -> ty.Iterator[T]:
+     """Iterate the iterable if it is not None."""
+     if it is not None:
+         yield from it
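
Usage sketch (illustrative): `null_safe_iter` removes the need for a separate None check.

    from thds.core.iterators import null_safe_iter

    def total(nums=None):
        return sum(null_safe_iter(nums))  # total(None) == 0, total([1, 2, 3]) == 6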