thds.core 0.0.1__py3-none-any.whl → 1.31.20250123022540__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.core might be problematic. Click here for more details.

Files changed (70) hide show
  1. thds/core/__init__.py +48 -0
  2. thds/core/ansi_esc.py +46 -0
  3. thds/core/cache.py +201 -0
  4. thds/core/calgitver.py +82 -0
  5. thds/core/concurrency.py +100 -0
  6. thds/core/config.py +250 -0
  7. thds/core/decos.py +55 -0
  8. thds/core/dict_utils.py +188 -0
  9. thds/core/env.py +40 -0
  10. thds/core/exit_after.py +121 -0
  11. thds/core/files.py +125 -0
  12. thds/core/fretry.py +115 -0
  13. thds/core/generators.py +56 -0
  14. thds/core/git.py +81 -0
  15. thds/core/hash_cache.py +86 -0
  16. thds/core/hashing.py +106 -0
  17. thds/core/home.py +15 -0
  18. thds/core/hostname.py +10 -0
  19. thds/core/imports.py +17 -0
  20. thds/core/inspect.py +58 -0
  21. thds/core/iterators.py +9 -0
  22. thds/core/lazy.py +83 -0
  23. thds/core/link.py +153 -0
  24. thds/core/log/__init__.py +29 -0
  25. thds/core/log/basic_config.py +171 -0
  26. thds/core/log/json_formatter.py +43 -0
  27. thds/core/log/kw_formatter.py +84 -0
  28. thds/core/log/kw_logger.py +93 -0
  29. thds/core/log/logfmt.py +302 -0
  30. thds/core/merge_args.py +168 -0
  31. thds/core/meta.json +8 -0
  32. thds/core/meta.py +518 -0
  33. thds/core/parallel.py +200 -0
  34. thds/core/pickle_visit.py +24 -0
  35. thds/core/prof.py +276 -0
  36. thds/core/progress.py +112 -0
  37. thds/core/protocols.py +17 -0
  38. thds/core/py.typed +0 -0
  39. thds/core/scaling.py +39 -0
  40. thds/core/scope.py +199 -0
  41. thds/core/source.py +238 -0
  42. thds/core/source_serde.py +104 -0
  43. thds/core/sqlite/__init__.py +21 -0
  44. thds/core/sqlite/connect.py +33 -0
  45. thds/core/sqlite/copy.py +35 -0
  46. thds/core/sqlite/ddl.py +4 -0
  47. thds/core/sqlite/functions.py +63 -0
  48. thds/core/sqlite/index.py +22 -0
  49. thds/core/sqlite/insert_utils.py +23 -0
  50. thds/core/sqlite/merge.py +84 -0
  51. thds/core/sqlite/meta.py +190 -0
  52. thds/core/sqlite/read.py +66 -0
  53. thds/core/sqlite/sqlmap.py +179 -0
  54. thds/core/sqlite/structured.py +138 -0
  55. thds/core/sqlite/types.py +64 -0
  56. thds/core/sqlite/upsert.py +139 -0
  57. thds/core/sqlite/write.py +99 -0
  58. thds/core/stack_context.py +41 -0
  59. thds/core/thunks.py +40 -0
  60. thds/core/timer.py +214 -0
  61. thds/core/tmp.py +85 -0
  62. thds/core/types.py +4 -0
  63. thds.core-1.31.20250123022540.dist-info/METADATA +68 -0
  64. thds.core-1.31.20250123022540.dist-info/RECORD +67 -0
  65. {thds.core-0.0.1.dist-info → thds.core-1.31.20250123022540.dist-info}/WHEEL +1 -1
  66. thds.core-1.31.20250123022540.dist-info/entry_points.txt +4 -0
  67. thds.core-1.31.20250123022540.dist-info/top_level.txt +1 -0
  68. thds.core-0.0.1.dist-info/METADATA +0 -8
  69. thds.core-0.0.1.dist-info/RECORD +0 -4
  70. thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/meta.py ADDED
@@ -0,0 +1,518 @@
1
+ import importlib
2
+ import json
3
+ import os
4
+ import re
5
+ import typing as ty
6
+ from datetime import datetime, timezone
7
+ from functools import lru_cache
8
+ from getpass import getuser
9
+ from importlib.metadata import PackageNotFoundError, version
10
+ from importlib.resources import Package, open_text
11
+ from pathlib import Path
12
+ from types import MappingProxyType
13
+
14
+ import attrs
15
+ from cattrs import Converter
16
+
17
+ from . import calgitver, git
18
+ from .log import getLogger
19
+ from .types import StrOrPath
20
+
21
# Accepted values for the `layout` argument of `write_metadata`.
LayoutType = ty.Literal["flat", "src"]
# Output flavours accepted by `format_name`.
NameFormatType = ty.Literal["git", "docker", "hive"]

# strftime/strptime pattern for full YYYYMMDDHHMMSS timestamps.
TIMESTAMP_FORMAT = "%Y%m%d%H%M%S"

# Each *_EXCLUSION_REGEX matches runs of characters that get replaced by the
# paired *_SUB_CHARACTER: the docker/hive pairs in `format_name`, the version
# pair in `extract_timestamp`.
DOCKER_EXCLUSION_REGEX = r"[^\w\-.]+"
DOCKER_SUB_CHARACTER = "-"
HIVE_EXCLUSION_REGEX = r"[\W]+"
HIVE_SUB_CHARACTER = "_"
VERSION_EXCLUSION_REGEX = r"[^\d.]+"
VERSION_SUB_CHARACTER = ""

# DEPLOYING, GIT_COMMIT, GIT_IS_CLEAN, GIT_IS_DIRTY, GIT_BRANCH and THDS_USER are
# the names of environment variables that this module reads.
# NOTE(review): CI_TIMESTAMP, CI_USER and MAIN are not referenced by the code
# visible in this module - presumably they are consumed elsewhere; confirm.
CI_TIMESTAMP = "CI_TIMESTAMP"
CI_USER = "runner"
DEPLOYING = "DEPLOYING"
GIT_COMMIT = "GIT_COMMIT"
GIT_IS_CLEAN = "GIT_IS_CLEAN"
GIT_IS_DIRTY = "GIT_IS_DIRTY"
GIT_BRANCH = "GIT_BRANCH"
MAIN = "main"
THDS_USER = "THDS_USER"

# Name of the metadata file that `write_metadata` writes and `read_metadata` reads.
META_FILE = "meta.json"

LOGGER = getLogger(__name__)
46
+
47
+
48
def format_name(name: str, format: NameFormatType = "git") -> str:
    """Rewrite `name` for the requested naming scheme.

    - "git": `name` is returned untouched.
    - "docker": every run matched by DOCKER_EXCLUSION_REGEX becomes DOCKER_SUB_CHARACTER.
    - "hive": every run matched by HIVE_EXCLUSION_REGEX becomes HIVE_SUB_CHARACTER,
      and the result is lower-cased.

    Raises ValueError for any other `format`.
    """
    if format == "git":
        return name
    if format == "docker":
        return re.sub(DOCKER_EXCLUSION_REGEX, DOCKER_SUB_CHARACTER, name)
    if format == "hive":
        hive_safe = re.sub(HIVE_EXCLUSION_REGEX, HIVE_SUB_CHARACTER, name)
        return hive_safe.lower()

    supported = ty.get_args(NameFormatType)
    raise ValueError(f"'{format}' is not a supported `format`. Supported formats: {supported}")
59
+
60
+
61
@ty.overload
def get_timestamp() -> str:
    ...  # pragma: no cover


@ty.overload
def get_timestamp(as_datetime: ty.Literal[True]) -> datetime:
    ...  # pragma: no cover


@ty.overload
def get_timestamp(as_datetime: ty.Literal[False]) -> str:
    ...  # pragma: no cover


def get_timestamp(as_datetime: bool = False):
    """Return the current UTC time.

    With `as_datetime` true the timezone-aware datetime itself is returned;
    otherwise it is rendered as a string using TIMESTAMP_FORMAT.
    """
    now_utc = datetime.now(timezone.utc)
    if as_datetime:
        return now_utc
    return now_utc.strftime(TIMESTAMP_FORMAT)
79
+
80
+
81
def make_calgitver() -> str:
    """Build a CalGitVer string, tolerating non-deterministic datetimes for dirty repos.

    This is an older version of calgitver; see calgitver.calgitver for docs on
    what the format means.
    """
    repo_is_clean = is_clean()
    if repo_is_clean:
        # the 'deterministic' datetime is only attempted for a clean repo: a dirty
        # repo isn't deterministic anyway, so an up-to-date timestamp is preferred.
        try:
            return calgitver.calgitver()
        except git.NO_GIT:
            pass

    date_part = datetime.now(tz=timezone.utc).strftime(git.CALGITVER_NO_SECONDS_FORMAT)
    hash_part = get_commit()[: calgitver.SHORT_HASH]
    dirty_part = "" if repo_is_clean else "dirty"
    # an empty trailing component leaves a dangling "-", which is stripped off.
    return "-".join((date_part, hash_part, dirty_part)).rstrip("-")
101
+
102
+
103
def print_calgitver():
    """Print the string produced by `make_calgitver` to stdout."""
    calgitver_string = make_calgitver()
    print(calgitver_string)
105
+
106
+
107
@ty.overload
def extract_timestamp(version: str) -> str:
    """Returns timestamp in full YYYYMMDDHHMMSS format even if the input was a CalGitVer string with no seconds."""


@ty.overload
def extract_timestamp(version: str, as_datetime: ty.Literal[True]) -> datetime:
    ...  # pragma: no cover


@ty.overload
def extract_timestamp(version: str, as_datetime: ty.Literal[False]) -> str:
    ...  # pragma: no cover


def extract_timestamp(version: str, as_datetime: bool = False):
    """Pull the timestamp out of a CalGitVer or SemCalVer version string.

    Returns a UTC-tagged datetime when `as_datetime` is true, otherwise a
    string in TIMESTAMP_FORMAT.

    Raises ValueError (quoting the `version` exactly as it was passed in) when
    neither format yields a parseable timestamp.
    """

    def to_result(dt: datetime):
        return dt.replace(tzinfo=timezone.utc) if as_datetime else dt.strftime(TIMESTAMP_FORMAT)

    # This is intended to be general-purpose and therefore a bit heuristic.
    # We attempt to parse the version as CalGitVer first, since it is a
    # narrow format. Failing that, we'll try SemCalVer.
    if calgitver.parse_calgitver(version):
        try:
            # only the first 13 characters are handed to strptime - presumably the
            # width of the no-seconds date portion; confirm against the format.
            return to_result(datetime.strptime(version[:13], git.CALGITVER_NO_SECONDS_FORMAT))
        except ValueError:
            pass

    # Strip into a separate name so the error below still shows the caller's input
    # rather than the digits-and-dots remainder.
    numeric_version = re.sub(VERSION_EXCLUSION_REGEX, VERSION_SUB_CHARACTER, version)
    components = numeric_version.split(".")
    if len(components) >= 3:
        try:
            # in SemCalVer the third dotted component carries the timestamp.
            return to_result(datetime.strptime(components[2], TIMESTAMP_FORMAT))
        except ValueError:
            pass

    raise ValueError(
        f"`version`: {version} is not a timestamp-containing version string (SemCalVer or CalGitVer)."
    )
146
+
147
+
148
def norm_name(pkg: str) -> str:
    """Swap every "." in `pkg` for "_".

    Apparently poetry creates slightly different dist-info
    directories and METADATA files than p-i-p-e-n-v did.
    """
    return "_".join(pkg.split("."))
153
+
154
+
155
def _get_pkg_root_filename(pkg: Package) -> str:
    """Return the file a package/module is (or would be) loaded from, or "" if unknown.

    A module object is answered from its `__file__`; a dotted name is resolved
    with `importlib.util.find_spec` and answered from the spec's `origin`.
    """
    if not isinstance(pkg, str):
        # some module objects (e.g. built-ins such as `sys`) have no `__file__` at all.
        return getattr(pkg, "__file__", None) or ""

    # `import importlib` alone does not guarantee that the `util` submodule is loaded.
    import importlib.util

    try:
        pkg_spec = importlib.util.find_spec(pkg)
    except (ModuleNotFoundError, ValueError):
        # ModuleNotFoundError: a parent package of a dotted name cannot be imported.
        # ValueError: the module is already in sys.modules but its `__spec__` is None
        # (as for `__main__` when run as a script).
        return ""
    if pkg_spec is None:
        return ""
    return pkg_spec.origin or ""
163
+
164
+
165
@lru_cache(None)
def get_version(pkg: Package, orig: str = "") -> str:
    """Find a version string for `pkg`, trying progressively weaker sources.

    Order: a pyproject.toml found above the package's root file, installed
    distribution metadata (normalized name, then the name as given), the same
    lookup on the parent package name, a `meta.json` pyproject_version, and
    finally the CALGITVER / GIT_COMMIT environment variables. Returns "" (after
    a warning) when nothing is found.

    `orig` carries the originally requested name through the upward recursion
    so that metadata lookup and log messages refer to it.
    """
    # first try direct lookup from the pyproject.toml, if one can be found,
    # because poetry frequently has outdated info in the venv it creates.
    root_file = _get_pkg_root_filename(pkg)
    if root_file:
        toml_version = find_pyproject_toml_version(Path(root_file), str(pkg))
        if toml_version:
            return toml_version

    for dist_name in (norm_name(str(pkg)), str(pkg)):
        try:
            return version(dist_name)
        except PackageNotFoundError:
            continue

    requested = orig or pkg
    name_parts = pkg.split(".")
    if len(name_parts) > 1:
        # 'recurse' upward, assuming that the package name is overly-specified
        return get_version(".".join(name_parts[:-1]), requested)

    # Check to see if there's a meta.json file hanging around, and if so,
    # see if it contains a pyproject_version.
    metadata = read_metadata(requested)
    if metadata and metadata.pyproject_version:
        return metadata.pyproject_version

    for env_var in ("CALGITVER", "GIT_COMMIT"):
        env_var_version = os.getenv(env_var)
        if not env_var_version:
            continue
        log = LOGGER.debug if env_var == "CALGITVER" else LOGGER.info
        log(f"Using {env_var} {env_var_version} as fallback version for {requested}")
        return env_var_version

    LOGGER.warning("Could not find a version for `%s`. Package not found.", requested)
    return ""
201
+
202
+
203
# NOTE: the docstring below doubles as the runtime error message -
# `get_base_package` raises `NoBasePackageFromMain(NoBasePackageFromMain.__doc__)` -
# so any edit to it changes what users see.
class NoBasePackageFromMain(ValueError):
    """
    `get_base_package` needs a 'real' package or module name, not '__main__',
    in order to discover a meaningful name for the package.
    You may be using a dynamic library from within __main__ that calls get_base_package,
    and inside __main__, Python doesn't let us do any nice introspection on a module's name.
    So, please call that code from a module that isn't what was passed to `python -m` -
    that is, split your code into a minimal __main__.py and a separate module.
    """
212
+
213
+
214
@lru_cache(None)
def get_base_package(pkg: Package) -> str:
    """Return the longest prefix of `pkg` that names an installed distribution.

    The name is tried normalized (see `norm_name`) and as given; on failure the
    last dotted component is dropped and the search repeats. Returns "" (after
    a warning) when no prefix matches.

    Raises NoBasePackageFromMain when `pkg` is "__main__".
    """
    pkg_name = str(pkg)
    if pkg_name == "__main__":
        raise NoBasePackageFromMain(NoBasePackageFromMain.__doc__)

    for dist_name in (norm_name(pkg_name), pkg_name):
        try:
            version(dist_name)
        except PackageNotFoundError:
            continue
        return pkg_name

    name_parts = pkg.split(".")
    if len(name_parts) > 1:
        return get_base_package(".".join(name_parts[:-1]))

    LOGGER.warning("Could not find the base package for `%s`. Package not found.", pkg)
    return ""
233
+
234
+
235
def get_repo_name() -> str:
    """Return the repo name reported by `git.get_repo_name`, or "" when git is unavailable."""
    try:
        repo_name = git.get_repo_name()
    except git.NO_GIT:
        LOGGER.debug("`get_repo_name` found no repo name.")
        repo_name = ""
    return repo_name
241
+
242
+
243
def get_commit(pkg: Package = "") -> str:  # should really be named get_commit_hash
    """Return the git commit hash from the first source that has one.

    Sources, in order: the GIT_COMMIT environment variable, the local git
    repo, then (only when `pkg` is given) the package's non-empty metadata.
    Returns "" after a warning when none of them yields a commit.
    """
    env_commit = os.environ.get(GIT_COMMIT)
    if env_commit is not None:
        LOGGER.debug("`get_commit` reading from env var.")
        return env_commit

    try:
        return git.get_commit_hash()
    except git.NO_GIT:
        pass

    if pkg:
        LOGGER.debug("`get_commit` reading from metadata.")
        metadata = read_metadata(pkg)
        if not metadata.is_empty:
            return metadata.git_commit

    LOGGER.warning("`get_commit` found no commit.")
    return ""
265
+
266
+
267
def is_clean(pkg: Package = "") -> bool:
    """Report whether the source tree is clean (no uncommitted changes).

    Sources, in order: the GIT_IS_CLEAN environment variable, the GIT_IS_DIRTY
    environment variable (inverted), the local git repo, then (only when `pkg`
    is given) the package's non-empty metadata. Falls back to False - i.e.
    assumes dirty - after a warning.

    Environment values are read as flags: empty/whitespace-only and the usual
    negative spellings ("0", "false", "no", "off", "n", "f", any case) mean
    false; every other value means true.
    """
    if GIT_IS_CLEAN in os.environ:
        LOGGER.debug("`is_clean` reading from env var.")
        return _env_flag_is_set(os.environ[GIT_IS_CLEAN])

    if GIT_IS_DIRTY in os.environ:
        # compatibility with docker-tools/build_push
        LOGGER.debug("`is_clean` reading from env var.")
        return not _env_flag_is_set(os.environ[GIT_IS_DIRTY])

    try:
        return git.is_clean()
    except git.NO_GIT:
        pass

    try:
        if pkg:
            LOGGER.debug("`is_clean` reading from metadata.")
            metadata = read_metadata(pkg)
            if metadata.is_empty:
                raise EmptyMetadataException
            return metadata.git_is_clean
    except EmptyMetadataException:
        pass

    LOGGER.warning("`is_clean` found no cleanliness - assume dirty.")
    return False


def _env_flag_is_set(value: str) -> bool:
    """Interpret an environment-variable string as a boolean flag.

    A plain `bool(value)` would treat "0" or "false" as true; here those
    explicit negatives (and empty/whitespace-only strings) are false, while any
    other non-empty value stays true.
    """
    return value.strip().lower() not in {"", "0", "false", "no", "off", "n", "f"}
294
+
295
+
296
def get_branch(pkg: Package = "", format: NameFormatType = "git") -> str:
    """Return the git branch name, rewritten by `format_name` for `format`.

    Sources, in order: the GIT_BRANCH environment variable, the local git
    repo, then (only when `pkg` is given) a non-empty `git_branch` in the
    package's metadata. An empty name (after a warning) is formatted when none
    of them yields a branch.
    """

    def _raw_branch() -> str:
        env_branch = os.environ.get(GIT_BRANCH)
        if env_branch is not None:
            LOGGER.debug("`get_branch` reading from env var.")
            return env_branch

        try:
            return git.get_branch()
        except git.NO_GIT:
            pass

        if pkg:
            LOGGER.debug("`get_branch` reading from metadata.")
            metadata_branch = read_metadata(pkg).git_branch
            if metadata_branch:
                return metadata_branch

        LOGGER.warning("`get_branch` found no branch.")
        return ""

    return format_name(_raw_branch(), format)
321
+
322
+
323
def get_user(pkg: Package = "", format: NameFormatType = "git") -> str:
    """Return the user name, rewritten by `format_name` for `format`.

    Sources, in order: the THDS_USER environment variable, then (only when
    `pkg` is given) a non-empty `thds_user` in the package's metadata, and
    finally the system user from `getpass.getuser`.
    """

    def _raw_user() -> str:
        env_user = os.environ.get(THDS_USER)
        if env_user is not None:
            LOGGER.debug("`get_user` reading from env var.")
            return env_user

        if pkg:
            LOGGER.debug("`get_user` reading from metadata.")
            metadata_user = read_metadata(pkg).thds_user
            if metadata_user:
                return metadata_user

        LOGGER.debug("`get_user` found no user data - getting system user.")
        return getuser()

    return format_name(_raw_user(), format)
343
+
344
+
345
def is_deployed(pkg: Package) -> bool:
    """Return True when `read_metadata(pkg)` yields metadata that is not empty."""
    return not read_metadata(pkg).is_empty
348
+
349
+
350
def _hacky_get_pyproject_toml_version(pkg: Package, wdir: Path) -> str:
    """Scrape the `version = "..."` value out of `wdir/pyproject.toml`.

    Returns "" when the file does not exist or no line starts with a matching
    version assignment. A warning is logged (but the version is still
    returned) when no `name = "..."` entry in the file names the same project
    as `pkg`; names are compared after normalizing case and runs of "-", "_"
    and "." the way Python packaging does, so `thds_core`, `thds.core` and
    `thds-core` are all considered equal.
    """
    # it will be a good day when Python packages a toml reader by default.
    ppt = wdir / "pyproject.toml"
    if not ppt.exists():
        return ""

    # TOML documents are UTF-8 by specification; don't depend on the locale encoding.
    toml = ppt.read_text(encoding="utf-8")

    # check name for sanity - we don't want to pull a version
    # out of, say, the root project when that doesn't match our project name.
    # TODO: extract name and version more nicely.
    declared_names = {
        _normalize_project_name(found)
        for found in re.findall(r"name\s*=\s*[\"']([^\"']+)[\"']", toml)
    }
    if _normalize_project_name(str(pkg)) not in declared_names:
        LOGGER.warning(f"The package name in pyproject.toml does not match the package name ({pkg})")

    for line in toml.splitlines():
        if m := re.match(r"version\s*=\s*[\"'](?P<version>[a-zA-Z0-9.]+)[\"']", line):
            return m.group("version")
    return ""


def _normalize_project_name(name: str) -> str:
    """Normalize a project name for comparison: lower-case, runs of -_. become one "-"."""
    return re.sub(r"[-_.]+", "-", name).lower()
366
+
367
+
368
def find_pyproject_toml_version(starting_path: Path, pkg: Package) -> str:
    """Look upward from `starting_path` for a pyproject.toml that defines our version.

    Only really useful in a monorepo context. Each ancestor directory of
    `starting_path` is checked in turn, nearest first; the first one holding a
    pyproject.toml decides the result, and "" is returned when none does.
    """
    for directory in starting_path.parents:
        if not (directory / "pyproject.toml").exists():
            continue
        # the first one we find is the only one we'll try.
        # anything above that can't possibly be the appropriate
        # pyproject.toml.
        try:
            return _hacky_get_pyproject_toml_version(pkg, directory)
        except ValueError as ve:
            LOGGER.info(str(ve))
            return ""

    return ""
387
+
388
+
389
# Type of the free-form `misc` field of `Metadata`: string keys mapped to scalar values.
MiscType = ty.Mapping[str, ty.Union[str, int, float, bool]]
390
+
391
+
392
@attrs.frozen
class Metadata:
    """Immutable record of build/deploy metadata for a package.

    Every field defaults to a falsy value, so a default-constructed instance
    reports `is_empty`.
    """

    git_commit: str = ""
    git_branch: str = ""
    git_is_clean: bool = False
    # only present if the project defines `version` inside pyproject.toml
    pyproject_version: str = ""
    thds_user: str = ""
    misc: MiscType = attrs.field(factory=lambda: MappingProxyType({}))

    @property
    def docker_branch(self) -> str:
        """`git_branch` rewritten with the "docker" format of `format_name`."""
        return format_name(self.git_branch, "docker")

    @property
    def hive_branch(self) -> str:
        """`git_branch` rewritten with the "hive" format of `format_name`."""
        return format_name(self.git_branch, "hive")

    @property
    def docker_user(self) -> str:
        """`thds_user` rewritten with the "docker" format of `format_name`."""
        return format_name(self.thds_user, "docker")

    @property
    def hive_user(self) -> str:
        """`thds_user` rewritten with the "hive" format of `format_name`."""
        return format_name(self.thds_user, "hive")

    @property
    def is_empty(self) -> bool:
        """True when every attrs field of this instance is falsy."""
        return not any(getattr(self, field.name) for field in attrs.fields(Metadata))

    @property
    def git_is_dirty(self) -> bool:
        """Negation of `git_is_clean`."""
        return not self.git_is_clean
424
+
425
+
426
def _structure_metadata(raw: ty.Dict[str, ty.Any], _: ty.Any) -> Metadata:
    """Structure hook: build a `Metadata` from a plain dict, wrapping `misc` read-only.

    Works on a shallow copy so that the "misc" key is not popped out of the
    dict the caller handed to `meta_converter.structure`.
    """
    fields = raw.copy()
    misc = MappingProxyType(fields.pop("misc", {}))
    return Metadata(misc=misc, **fields)


# Converter used to turn `Metadata` into plain dicts and back (see
# `write_metadata` / `read_metadata`).
meta_converter = Converter(forbid_extra_keys=True)
meta_converter.register_structure_hook(Metadata, _structure_metadata)
430
+
431
+
432
class EmptyMetadataException(Exception):
    # Control-flow signal used within this module: raised when metadata obtained
    # from `read_metadata` holds no usable value, so the caller can fall through
    # to its next fallback.
    pass
434
+
435
+
436
def init_metadata(misc: ty.Optional[MiscType] = None, pyproject_toml_version: str = "") -> Metadata:
    """Assemble a `Metadata` from the current environment.

    Commit, branch, cleanliness and user come from `get_commit`, `get_branch`,
    `is_clean` and `get_user` (each called without a package); `misc` is
    wrapped in a read-only mapping, with an empty one used when `misc` is
    None or empty.
    """
    return Metadata(
        git_commit=get_commit(),
        git_branch=get_branch(),
        git_is_clean=is_clean(),
        pyproject_version=pyproject_toml_version,
        # get_user() yields THDS_USER when it is set and only then falls back to
        # getpass.getuser(); passing getuser() as an eager default would call it
        # (and let it fail) even when THDS_USER is set.
        thds_user=get_user(),
        misc=MappingProxyType(misc) if misc else MappingProxyType(dict()),
    )
445
+
446
+
447
def _sanitize_metadata_for_docker_tools(d: dict):
    """Blank out, in place, the metadata entries that change from commit to commit.

    We want our Docker builds to be able to take advantage of caching based on
    the contents of the sources copied over into them. If we embed a meta.json
    into each library where the commit hash changes every time a commit
    happens, then we've blown away our entire cache.

    The Docker builds already inject this metadata as environment variables
    after the source copies happen, so there's no need for us to embed it this
    way.
    """
    for volatile_key in ("git_commit", "git_branch", "git_is_clean"):
        d[volatile_key] = ""
    # NOTE(review): this stores the constant THDS_USER (the variable's name), not a
    # user name - presumably a deliberate placeholder; confirm.
    d["thds_user"] = THDS_USER
462
+
463
+
464
def write_metadata(
    pkg: str,
    *,
    misc: ty.Optional[MiscType] = None,
    namespace: str = "thds",
    layout: LayoutType = "src",
    wdir: ty.Optional[StrOrPath] = None,
    deploying: bool = False,
    for_docker_tools_build: bool = False,
) -> None:
    """Write a META_FILE for `pkg` into its source tree - but only when deploying.

    Nothing is written unless the DEPLOYING environment variable is non-empty
    or `deploying` is true. The file goes to
    `<wdir>/[src/]<namespace path>/<package path>/meta.json`, where `wdir`
    defaults to the current directory and the "src" segment is present only
    for the "src" layout. With `for_docker_tools_build`, the volatile fields
    are blanked first (see `_sanitize_metadata_for_docker_tools`).
    """
    base_dir = Path(wdir) if wdir else Path(".")
    if not (os.getenv(DEPLOYING) or deploying):
        return

    LOGGER.debug("Writing metadata.")
    metadata = init_metadata(
        misc=misc, pyproject_toml_version=_hacky_get_pyproject_toml_version(pkg, base_dir)
    )

    layout_dir = "src" if layout == "src" else ""
    namespace_dir = namespace.replace("-", "/").replace(".", "/")
    package_dir = pkg.replace("-", "_").replace(".", "/")
    target = base_dir / os.path.join(layout_dir, namespace_dir, package_dir, META_FILE)

    LOGGER.info(f"Writing metadata for {pkg} to {target}")
    with open(target, "w") as f:
        metadata_dict = meta_converter.unstructure(metadata)
        if for_docker_tools_build:
            _sanitize_metadata_for_docker_tools(metadata_dict)
        json.dump(metadata_dict, f, indent=2)
        f.write("\n")  # Add newline because Py JSON does not
495
+
496
+
497
@lru_cache(None)
def read_metadata(pkg: Package) -> Metadata:
    """Load the `Metadata` stored in the META_FILE packaged with `pkg`.

    When `pkg` has no readable META_FILE, the lookup is retried on its parent
    package name; an empty `Metadata()` is returned once there is no parent
    left to try.

    Raises ValueError when `pkg` is "__main__" or falsy.
    """
    LOGGER.debug("Reading metadata.")

    if pkg == "__main__":
        raise ValueError("`read_meta` expects a package or module name, not '__main__'.")

    if not pkg:
        raise ValueError(
            "`read_meta` is missing a package or module name. "
            "If using `__package__` make sure an __init__.py is present."
        )

    try:
        with open_text(pkg, META_FILE) as meta_file:
            raw = json.load(meta_file)
            return meta_converter.structure(raw, Metadata)
    # pkg=__name__ will raise a TypeError unless it is called in an __init__.py
    except (ModuleNotFoundError, FileNotFoundError, TypeError):
        parent, dot, _ = pkg.rpartition(".")
        if not dot:
            return Metadata()
        return read_metadata(parent)