thds.mops-3.6.20250219172032-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of thds.mops has been flagged as potentially problematic.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/k8s/namespace.py ADDED
@@ -0,0 +1,17 @@
+ from getpass import getuser
+
+
+ def parse_namespace(input_str: str) -> str:
+     # lowercase and replace all non-alphanumeric characters with dashes
+     return "".join(c if c.isalnum() else "-" for c in input_str.lower())
+
+
+ def user_namespace() -> str:
+     try:
+         return getuser()
+     except OSError:
+         return "CICD-Runner"
+
+
+ def main() -> None:
+     print(user_namespace())
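
A quick sketch of how these helpers behave (the example inputs are hypothetical; the expected outputs follow directly from the code above):

    from thds.mops.k8s.namespace import parse_namespace, user_namespace

    # lowercases and dash-replaces anything that is not alphanumeric
    assert parse_namespace("Peter.Gaultney") == "peter-gaultney"
    assert parse_namespace("CICD-Runner") == "cicd-runner"

    # user_namespace() falls back to "CICD-Runner" when getuser() raises OSError,
    # e.g. in a container with no passwd entry for the current UID.
    print(parse_namespace(user_namespace()))
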
thds/mops/k8s/node_selection.py ADDED
@@ -0,0 +1,58 @@
+ import typing as ty
+
+ from kubernetes import client
+ from typing_extensions import TypedDict
+
+
+ class ResourceDefinition(TypedDict, total=False):
+     """
+     This works for both limits and requests.
+
+     https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+     """
+
+     memory: str
+     """E.g., 10G"""
+     cpu: str
+     """E.g., 4.5, or 4500m (millicores)"""
+
+
+ class NodeNarrowing(TypedDict, total=False):
+     """This is a more transparent interface for selecting nodes that your job can run on.
+
+     You don't have to provide each key, but any key/value pair you provide must be the proper type.
+     """
+
+     resource_requests: ResourceDefinition
+     resource_limits: ResourceDefinition
+     # https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+     node_selector: ty.Mapping[str, str]
+     # https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
+     tolerations: ty.Sequence[client.V1Toleration]
+     # https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
+
+
+ def tolerates_spot() -> client.V1Toleration:
+     """Return our custom spot instance toleration configuration."""
+     return client.V1Toleration(
+         key="kubernetes.azure.com/scalesetpriority", value="spot", effect="NoSchedule"
+     )
+
+
+ def tolerates_gpu() -> client.V1Toleration:
+     """Apply this toleration to enable use of GPUs."""
+     return client.V1Toleration(key="dedicated", value="gpu", effect="NoSchedule")
+
+
+ def tolerates_64cpu() -> client.V1Toleration:
+     """These node pools often do not scale up well or quickly, so by
+     default they're disabled. If that changes in the future, or if you
+     are requesting more than 32 CPUs for your Pod, you should apply
+     this toleration.
+     """
+     return client.V1Toleration(key="dedicated", value="64cpu", effect="NoSchedule")
+
+
+ def require_gpu() -> NodeNarrowing:
+     """Merge this with any additional NodeNarrowing (e.g. resource_requests) to run on GPUs."""
+     return dict(node_selector={"instance-type": "gpu"}, tolerations=[tolerates_gpu()])
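
For instance, a caller might merge require_gpu() with explicit resource requests. This is a minimal sketch; the consumer of the resulting NodeNarrowing (e.g. the launch module) is assumed to accept it as keyword-style configuration:

    from thds.mops.k8s.node_selection import NodeNarrowing, require_gpu, tolerates_spot

    # NodeNarrowing is a total=False TypedDict, so a plain dict merge works;
    # later keys win, so put overrides last.
    narrowing: NodeNarrowing = {
        **require_gpu(),  # node_selector plus the GPU toleration
        "resource_requests": {"memory": "16G", "cpu": "4"},
        "resource_limits": {"memory": "16G"},
    }

    # Tolerations compose, e.g. to also allow scheduling onto spot nodes:
    narrowing["tolerations"] = [*narrowing.get("tolerations", []), tolerates_spot()]
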
thds/mops/k8s/retry.py ADDED
@@ -0,0 +1,75 @@
+ import time
+ import typing as ty
+ from functools import wraps
+
+ import urllib3.exceptions
+ from kubernetes import client
+
+ from . import auth, config
+ from ._shared import logger
+
+ F = ty.TypeVar("F", bound=ty.Callable)
+
+
+ # The first thing you should know about the Kubernetes SDK is that it
+ # is riddled with race conditions and timeouts and all kinds of
+ # horrible gremlins. This first function/decorator is an _ongoing_
+ # attempt to deal with the fallout from that. Hopefully from now on
+ # we'll be able to consolidate/maintain all of that logic in a single
+ # place:
+
+
+ _URLLIB_COMMON = (
+     urllib3.exceptions.ProtocolError,
+     urllib3.exceptions.MaxRetryError,
+ )
+
+
+ def k8s_sdk_retry(
+     get_retry_args_kwargs: ty.Optional[ty.Callable[[int], ty.Tuple[tuple, dict]]] = None,
+     should_retry: ty.Callable[[Exception], bool] = lambda _: False,
+     max_retries: int = 20,
+ ) -> ty.Callable[[F], F]:
+     """Handle the common cases - lets you decide about uncommon ones."""
+
+     def decorator(f: F) -> F:
+         @wraps(f)
+         def wrapper(*args, **kwargs):  # type: ignore
+             i = 0
+
+             def _raise_if_max(i: int) -> None:
+                 if i >= max_retries:
+                     logger.warning(f"Failing after {i} tries")
+                     raise
+
+             while True:
+                 try:
+                     return f(*args, **kwargs)
+                 except Exception as ex:
+                     # some shared behavior for all exceptions means we want a single except block
+                     _raise_if_max(i)
+                     if isinstance(ex, _URLLIB_COMMON):
+                         # these are extremely common and should always be retried
+                         logger.debug(
+                             "Encountered probable connection timeout - retrying",
+                             exc=str(ex),
+                         )
+                         # necessary b/c https://github.com/kubernetes-client/python/issues/1234
+                     elif isinstance(ex, client.exceptions.ApiException) and ex.reason == "Unauthorized":
+                         # this one is fairly common - who knows why their SDK can't handle this automatically.
+                         #
+                         # https://github.com/kubernetes-client/python/blob/release-18.0/kubernetes/client/exceptions.py?ts=4#L84
+                         logger.info(f"{ex} - retrying after auth failure")
+                         auth.load_config()
+                     elif not should_retry(ex):
+                         raise
+
+                     i += 1
+                     logger.info(f"Will retry after K8S error {str(ex)}; attempt {i}")
+                     time.sleep(config.k8s_monitor_delay())
+                     if get_retry_args_kwargs:
+                         args, kwargs = get_retry_args_kwargs(i)
+
+         return ty.cast(F, wrapper)
+
+     return decorator
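
A minimal usage sketch for the decorator above (the wrapped function and the should_retry predicate are hypothetical; only k8s_sdk_retry and the kubernetes client calls are real):

    from kubernetes import client
    from thds.mops.k8s.retry import k8s_sdk_retry

    def _retry_conflicts(ex: Exception) -> bool:
        # 409 Conflict can be transient when controllers race each other.
        return isinstance(ex, client.exceptions.ApiException) and ex.status == 409

    @k8s_sdk_retry(should_retry=_retry_conflicts, max_retries=5)
    def read_job_status(namespace: str, name: str) -> client.V1Job:
        # Unauthorized responses and common urllib3 connection errors are retried
        # automatically; everything else is retried only if _retry_conflicts agrees.
        return client.BatchV1Api().read_namespaced_job_status(name, namespace)
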
thds/mops/k8s/too_old_resource_version.py ADDED
@@ -0,0 +1,42 @@
+ import re
+ import typing as ty
+
+ from kubernetes import client
+
+ _TOO_OLD_RESOURCE_VERSION = re.compile(
+     r"Expired: too old resource version: (?P<old>\w+) \((?P<cur>\w+)\)"
+ )
+ # holy bananas I cannot believe how much K8s' SDK sucks. this is a
+ # standard exception with a known retry semantic that their watchers
+ # are apparently unable to handle on their own - I'm staring at their
+ # code right now and they don't even attempt to handle this.
+
+
+ class TooOldResourceVersion(ty.NamedTuple):
+     old: str
+     cur: str
+     spread: str  # only if the above are actually numbers
+
+
+ def parse_too_old_resource_version(
+     exc: Exception,
+ ) -> ty.Optional[TooOldResourceVersion]:
+     if not isinstance(exc, client.exceptions.ApiException):
+         return None
+     m = _TOO_OLD_RESOURCE_VERSION.match(exc.reason)
+     if m:
+         # this is a completely bonkers thing to have to do
+         # ourselves, but here we are. I can't find any
+         # documentation on why their SDK doesn't handle this
+         # themselves, and I don't even know why we haven't run
+         # into it before. Regardless, apparently we have to
+         # special-case a retry when there are enough old
+         # events on the server.
+         resource_version = m.group("cur")
+         old = m.group("old")
+         try:
+             spread = str(int(resource_version) - int(m.group("old")))
+         except ValueError:
+             spread = "unknown"
+         return TooOldResourceVersion(old, resource_version, spread)
+     return None
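
An illustrative check of the parser (the constructed ApiException simply mimics the error string the regex expects; watch.py below is the real consumer of this helper):

    from kubernetes import client
    from thds.mops.k8s.too_old_resource_version import parse_too_old_resource_version

    exc = client.exceptions.ApiException(
        status=410, reason="Expired: too old resource version: 100 (250)"
    )
    parsed = parse_too_old_resource_version(exc)
    assert parsed is not None
    # old, cur, and the numeric spread between them, all as strings
    assert (parsed.old, parsed.cur, parsed.spread) == ("100", "250", "150")
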
thds/mops/k8s/tools/krsync.py ADDED
@@ -0,0 +1,50 @@
+ """Uses rsync to copy files to/from a Kubernetes pod.
+
+ The remote pod must have rsync installed.
+
+ CLI wrapper with help text for the krsync.sh script, which is usable on its own.
+ Thank you, Karl Bunch, who provided the world with this elegant implementation.
+ https://serverfault.com/questions/741670/rsync-files-to-a-kubernetes-pod?newreg=22b5f958cdce4e6a9a1a7ce0fc88b546
+
+ When addressing the remote, you must specify a pod name, and
+ optionally a namespace preceded by '@', and then a colon, followed by
+ the path on the remote.
+
+ Examples:
+
+ krsync ~/my/local.txt pod1234:/root/local_2.txt
+ krsync ~/my/local pod1234:~/local_dir -rav  # recursively copies entire directory
+ krsync pod1234@my-namespace:/root/my.parquet your.parquet
+ krsync prod-udla-0@unified-directory:/var/data/labels.db ./labels.db --container prod-udla-db
+ """
+
+ import argparse
+ import importlib.resources
+ import os
+ import subprocess
+ import sys
+
+ with importlib.resources.path(__package__, "krsync.sh") as p:
+     krsync = str(p.resolve())
+
+
+ def main() -> int:
+     remote_path = "pod-name[@namespace]:/remote/path"
+     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
+     parser.add_argument("src", help=f"Either a local path or {remote_path}")
+     parser.add_argument("dest", help=f"Either a local path or {remote_path}")
+     parser.add_argument(
+         "--container",
+         "-c",
+         help="Container name - if not provided, will use the default container",
+         default="",
+     )
+     args, rsync_args = parser.parse_known_args()
+     return subprocess.run(
+         ["/bin/bash", krsync, args.src, args.dest, *rsync_args],
+         env=dict(os.environ, KRSYNC_CONTAINER=args.container or ""),
+     ).returncode
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
thds/mops/k8s/tools/krsync.sh ADDED
@@ -0,0 +1,22 @@
+ #!/bin/bash
+ # thank you, Karl Bunch:
+ # https://serverfault.com/questions/741670/rsync-files-to-a-kubernetes-pod?newreg=22b5f958cdce4e6a9a1a7ce0fc88b546
+ if [ -z "$KRSYNC_STARTED" ]; then
+     export KRSYNC_STARTED=true
+     exec rsync --blocking-io --rsh "$0" $@
+ fi
+
+ # Running as --rsh
+ namespace=''
+ pod=$1
+ shift
+
+ # If the user uses pod@namespace, rsync passes it to us as: {us} -l pod namespace ...
+ if [ "X$pod" = "X-l" ]; then
+     pod=$1
+     shift
+     namespace="-n $1"
+     shift
+ fi
+
+ exec kubectl $namespace exec -i $pod --container "${KRSYNC_CONTAINER}" -- "$@"
thds/mops/k8s/wait_job.py ADDED
@@ -0,0 +1,72 @@
+ """Wait for a Job to finish."""
+
+ import time
+ from datetime import timedelta
+ from timeit import default_timer
+
+ from thds.core import scope
+ from thds.core.log import logger_context
+
+ from .._utils.colorize import colorized
+ from . import config
+ from ._shared import logger
+ from .jobs import get_job, is_job_failed, is_job_succeeded
+
+ UNUSUAL = colorized(fg="white", bg="yellow")
+
+
+ def _max_no_job_wait() -> timedelta:
+     return timedelta(seconds=config.k8s_monitor_max_attempts() * config.k8s_monitor_delay())
+
+
+ @scope.bound
+ def wait_for_job(job_name: str, short_name: str = "") -> bool:
+     """Return True if Job completed, False if it failed.
+
+     May raise an exception if something truly unusual happened.
+
+     A _lot_ has gone into trying to make this robust against common
+     failure patterns. My apologies for the resulting shape of the
+     code. :/
+     """
+     scope.enter(logger_context(job=job_name))
+     log_name = f"Job {short_name}" if short_name else "Job"
+     logger.debug(f"Waiting for {log_name} to finish...")
+     start_time = default_timer()
+
+     def _wait_for_job() -> bool:
+         nonlocal start_time
+         found_at_least_once = False
+         while True:
+             time.sleep(0.5 if found_at_least_once else 10.0)
+             job = get_job(job_name)
+             if not job:
+                 if found_at_least_once:
+                     logger.warning(UNUSUAL(f"Known job {job_name} no longer exists - assuming success!"))
+                     return True
+                 max_wait_seconds = _max_no_job_wait().total_seconds()
+                 if default_timer() - start_time > max_wait_seconds:
+                     logger.error(
+                         UNUSUAL(
+                             f"Job {job_name} has not been seen for {max_wait_seconds:.1f} seconds"
+                             " - assuming failure!"
+                         )
+                     )
+                     return False
+
+                 logger.debug("%s not found... retrying.", job_name)
+                 continue
+
+             if is_job_succeeded(job):
+                 return True
+
+             if is_job_failed(job):
+                 logger.error(
+                     UNUSUAL(f"A Kubernetes Job is reporting an actual failed status: {job_name}")
+                 )
+                 return False
+
+             found_at_least_once = True
+             start_time = default_timer()  # restart timer since the job has been found.
+
+     return _wait_for_job()
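
How a caller might act on the boolean result (the caller shown here is hypothetical; the real consumer is presumably launch.py in this package):

    from thds.mops.k8s.wait_job import wait_for_job

    def run_and_check(job_name: str) -> None:
        # Blocks until the Job succeeds, fails, or disappears for long enough
        # to be presumed dead.
        if not wait_for_job(job_name, short_name="my-batch-step"):
            raise RuntimeError(f"Kubernetes Job did not complete: {job_name}")
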
thds/mops/k8s/warn_image_backoff.py ADDED
@@ -0,0 +1,63 @@
+ import threading
+ import typing as ty
+ from datetime import datetime, timezone
+
+ from kubernetes import client
+
+ from thds.core.log import getLogger
+
+ from .._utils.colorize import colorized
+ from . import config
+ from .watch import K8sList, OneShotLimiter, yield_objects_from_list
+
+ logger = getLogger(__name__)
+
+ OnCoreEvent = ty.Callable[[client.CoreV1Event], ty.Any]
+
+ YIKES = colorized(fg="black", bg="yellow")
+
+
+ def _emit_basic(event: client.CoreV1Event) -> None:
+     logger.error(YIKES(event.message))
+
+
+ def _warn_image_pull_backoff(namespace: str, on_backoff: OnCoreEvent = _emit_basic) -> None:
+     """Log scary errors when ImagePullBackoff is observed."""
+     start_dt = datetime.now(tz=timezone.utc)
+     for _ns, obj in yield_objects_from_list(
+         namespace,
+         lambda _, __: ty.cast(
+             # do NOT use client.EventsV1Api here - for some reason
+             # it does not return the right 'types' of events.
+             # why? who the heck knows? How much time did I spend
+             # trying to figure this out? Also who knows.
+             K8sList[client.CoreV1Event],
+             client.CoreV1Api().list_namespaced_event,
+         ),
+         object_type_hint="backoff-warnings",
+         field_selector="reason=BackOff",
+     ):
+         if None is obj.last_timestamp or obj.last_timestamp > start_dt:
+             on_backoff(obj)
+
+
+ _WARN_IMAGE_PULL_BACKOFF = OneShotLimiter()
+
+
+ def start_warn_image_pull_backoff_thread(
+     namespace: str = "", on_backoff: ty.Optional[OnCoreEvent] = None
+ ) -> None:
+     """Limit 1 thread per namespace per application.
+
+     You can pass an on_backoff callback to customize how each BackOff event is handled.
+     """
+     namespace = namespace or config.k8s_namespace()
+
+     _WARN_IMAGE_PULL_BACKOFF(
+         namespace,
+         lambda ns: threading.Thread(
+             target=_warn_image_pull_backoff,
+             args=(namespace, on_backoff or _emit_basic),
+             daemon=True,
+         ).start(),
+     )
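
A hypothetical custom on_backoff handler, e.g. to surface which object is failing to pull (attribute access follows the kubernetes CoreV1Event model):

    from kubernetes import client
    from thds.mops.k8s.warn_image_backoff import start_warn_image_pull_backoff_thread

    def _notify(event: client.CoreV1Event) -> None:
        obj = event.involved_object  # V1ObjectReference for the backing Pod
        print(f"ImagePullBackOff on {obj.kind}/{obj.name}: {event.message}")

    # At most one daemon thread per namespace, no matter how often this is called.
    start_warn_image_pull_backoff_thread(namespace="my-namespace", on_backoff=_notify)
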
thds/mops/k8s/watch.py ADDED
@@ -0,0 +1,266 @@
+ """K8s SDK watching is very unreliable for lots of reasons.
+
+ This is a general-purpose fix for using watchers in a thread reliably.
+ """
+
+ import threading
+ import time
+ import typing as ty
+
+ import urllib3
+ from kubernetes import client
+ from kubernetes import watch as k8s_watch
+
+ from thds.core import scope
+ from thds.core.log import getLogger, logger_context
+
+ from .._utils.colorize import colorized
+ from . import config
+ from .auth import load_config
+ from .too_old_resource_version import parse_too_old_resource_version
+
+ logger = getLogger(__name__)
+
+ T = ty.TypeVar("T")
+
+
+ class V1List(ty.Protocol[T]):
+     api_version: str
+     items: ty.List[T]
+     kind: str
+     metadata: client.models.V1ListMeta
+
+
+ class K8sList(ty.Protocol[T]):
+     def __call__(self, *args: ty.Any, namespace: str, **kwargs: ty.Any) -> V1List[T]:
+         ...
+
+
+ # If this does not return a K8sList API method, the loop will exit
+ GetListMethod = ty.Callable[[str, ty.Optional[Exception]], ty.Optional[K8sList[T]]]
+ # if this returns True, the loop will exit.
+ OnEvent = ty.Callable[[str, T], ty.Optional[bool]]
+
+
+ def yield_objects_from_list(
+     namespace: str,
+     get_list_method: GetListMethod[T],
+     server_timeout: int = 10,
+     object_type_hint: str = "items",
+     init: ty.Optional[ty.Callable[[], None]] = None,
+     **kwargs: ty.Any,
+ ) -> ty.Iterator[ty.Tuple[str, T]]:
+     ex = None
+     if init:
+         init()
+     while True:
+         try:
+             load_config()
+             list_method = get_list_method(namespace, ex)
+             if not list_method:
+                 logger.debug(f"No longer watching {object_type_hint} events in namespace: {namespace}")
+                 break
+             initial_list = list_method(namespace=namespace)
+             logger.debug(
+                 f"Listed {len(initial_list.items)} {object_type_hint} in namespace: {namespace}"
+             )
+             for object in initial_list.items:
+                 yield namespace, object
+
+             if initial_list.metadata._continue:
+                 logger.warning(
+                     f"We did not fetch the whole list of {object_type_hint} the first time..."
+                 )
+             for evt in k8s_watch.Watch().stream(
+                 list_method,
+                 namespace=namespace,
+                 resource_version=initial_list.metadata.resource_version,
+                 **kwargs,
+                 _request_timeout=(server_timeout, config.k8s_job_timeout_seconds()),
+             ):
+                 object = evt.get("object")
+                 if object:
+                     yield namespace, object
+                 # once we've received events, let the resource version
+                 # be managed automatically if possible.
+         except urllib3.exceptions.ProtocolError:
+             ex = None
+         except urllib3.exceptions.ReadTimeoutError:
+             ex = None
+         except Exception as e:
+             too_old = parse_too_old_resource_version(e)
+             if too_old:
+                 logger.debug(f"Immediately retrying {too_old}")
+             else:
+                 logger.exception(f"Unexpected exception while listing {object_type_hint}")
+             ex = e
+
+
+ def callback_events(on_event: OnEvent[T], event_yielder: ty.Iterable[ty.Tuple[str, T]]) -> None:
+     """Suitable for use with a daemon thread."""
+     for namespace, event in event_yielder:
+         should_exit = on_event(namespace, event)
+         if should_exit:
+             break
+
+
+ def _make_name(namespace: str, name: str) -> str:
+     return f"{namespace}/{name}"
+
+
+ def _default_get_name(obj: ty.Any) -> str:
+     return obj.metadata.name
+
+
+ def _default_get_namespace(obj: ty.Any) -> str:
+     return obj.metadata.namespace
+
+
+ STARTING = colorized(fg="white", bg="orange")
+
+
+ class OneShotLimiter:
+     """Do an action once per provided name. Does not wait for it to complete."""
+
+     def __init__(self) -> None:
+         self._lock = threading.RLock()
+         self._names: ty.Set[str] = set()
+
+     def __call__(self, name: str, shoot: ty.Callable[[str], ty.Any]) -> None:
+         """Shoot if the name has not already been shot."""
+         if name in self._names:
+             return
+         with self._lock:
+             if name in self._names:
+                 return
+             shoot(name)
+             self._names.add(name)
+
+
+ def is_stale(api_last_update_time: float, obj_last_seen_time: float) -> bool:
+     now = time.monotonic()
+     allowed_stale_seconds = config.k8s_watch_object_stale_seconds()
+     if (time_since_api_update := now - api_last_update_time) > allowed_stale_seconds:  # noqa: F841
+         # we haven't heard anything from the API in a while; probably
+         # the API is down. Ignore object staleness to avoid false positives.
+         return False
+
+     if not obj_last_seen_time:
+         return False  # false positives aren't worth it
+
+     return (time_since_obj_update := now - obj_last_seen_time) > allowed_stale_seconds  # noqa: F841
+
+
+ class WatchingObjectSource(ty.Generic[T]):
+     """Efficiently 'get' objects by reliably watching for changes to all such objects in a given namespace.
+
+     This is network-efficient for observing many different objects,
+     but not memory efficient if you really only need to fetch details
+     for a few objects.
+     """
+
+     def __init__(
+         self,
+         get_list_method: GetListMethod[T],
+         get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
+             ty.Callable[[T], str], _default_get_name
+         ),
+         backup_fetch: ty.Optional[ty.Callable[[str, str], T]] = None,
+         typename: str = "object",
+         starting: ty.Callable[[str], str] = STARTING,
+     ) -> None:
+         self.get_list_method = get_list_method
+         self.get_name = get_name
+         self.backup_fetch = backup_fetch
+         self.typename = typename
+         self._objs_by_name: ty.Dict[str, T] = dict()
+         # ^ is a possibly big/expensive local cache of the most recent
+         # state for all of the event type in the namespace. Don't use
+         # this class if you can't afford the memory overhead of
+         # observing everything in your namespace and keeping the last
+         # known copy of everything forever.
+         self._last_seen_time_by_name: ty.Dict[str, float] = dict()
+         self._last_api_update_time = 0.0
+         self._limiter = OneShotLimiter()
+
+     def _start_thread(self, namespace: str) -> None:
+         threading.Thread(
+             target=callback_events,
+             args=(
+                 self._add_object,
+                 yield_objects_from_list(
+                     namespace,
+                     self._get_list_method_on_restart,
+                     object_type_hint=self.typename + "s",
+                     init=lambda: logger.info(STARTING(f"Watching {self.typename}s in {namespace}")),
+                 ),
+             ),
+             daemon=True,
+         ).start()
+
+     def _add_object(self, namespace: str, obj: T) -> None:
+         """This is where we receive updates from the k8s API."""
+         self._last_api_update_time = time.monotonic()
+
+         if not obj:
+             logger.warning(f"Received null/empty {self.typename}")
+             return
+
+         name = _make_name(namespace, self.get_name(obj))
+         logger.debug(f"{self.typename} {name} updated")
+         self._last_seen_time_by_name[name] = time.monotonic()
+         self._objs_by_name[name] = obj
+
+     def _get_list_method_on_restart(
+         self, namespace: str, exc: ty.Optional[Exception]
+     ) -> ty.Optional[K8sList[T]]:
+         suffix = ""
+         if exc:
+             too_old = parse_too_old_resource_version(exc)
+             if not too_old:
+                 logger.exception(f"Not fatal, but sleeping before we retry {self.typename} scraping...")
+                 time.sleep(config.k8s_monitor_delay())
+             suffix = f" after {type(exc).__name__}: {exc}"
+         logger.info(f"Watching {self.typename}s in namespace: {namespace}{suffix}")
+         return self.get_list_method(namespace, exc)
+
+     def _is_stale(self, name: str) -> bool:
+         return is_stale(self._last_api_update_time, self._last_seen_time_by_name.get(name) or 0)
+
+     @scope.bound
+     def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
+         namespace = namespace or config.k8s_namespace()
+         name = _make_name(namespace, obj_name)
+         scope.enter(logger_context(name=obj_name, namespace=namespace))
+
+         # first try is looking in our local cache
+         if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
+             return obj
+
+         # second try is making sure the namespace watcher is running, sleeping, and then looking in the cache again.
+         # This is much more efficient than a manual fetch.
+         self._limiter(namespace, self._start_thread)
+         time.sleep(config.k8s_monitor_delay())
+         if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
+             return obj
+
+         # if that doesn't work, try a manual fetch.
+         if self.backup_fetch:
+             logger.warning(f"Manually fetching {self.typename}...")
+             # doing a lot of manual fetches may indicate that the k8s API is having trouble keeping up...
+             try:
+                 if obj := self.backup_fetch(namespace, obj_name):
+                     self._add_object(namespace, obj)  # updates last seen, too
+                     return obj
+
+             except Exception:
+                 logger.exception(f"Unexpected error during manual fetch of {self.typename}.")
+
+         if self._is_stale(name):
+             logger.warning(
+                 f"Could not refresh {name}, and our record of it is stale - dropping stale object!"
+             )
+             self._objs_by_name.pop(name, None)
+             self._last_seen_time_by_name.pop(name, None)
+
+         return None
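
A sketch of how WatchingObjectSource might be wired up for Jobs, roughly what jobs.py in this package presumably does (the callables here are assumptions, not copied from that module):

    import typing as ty

    from kubernetes import client

    from thds.mops.k8s.watch import K8sList, WatchingObjectSource

    def _job_list_method(
        namespace: str, exc: ty.Optional[Exception]
    ) -> ty.Optional[K8sList[client.V1Job]]:
        # Returning None would stop the watcher loop; we always keep watching.
        return ty.cast(K8sList[client.V1Job], client.BatchV1Api().list_namespaced_job)

    _job_source = WatchingObjectSource(
        _job_list_method,
        backup_fetch=lambda ns, name: client.BatchV1Api().read_namespaced_job(name, ns),
        typename="Job",
    )

    def get_job(name: str) -> ty.Optional[client.V1Job]:
        # The first call starts the per-namespace watcher thread; later calls
        # are served from the in-memory cache while it is fresh enough.
        return _job_source.get(name)
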
thds/mops/meta.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "git_commit": "affac0a04726de27f8065525d193df4cd6376b9c",
+     "git_branch": "task/dbxtend/use-pure-magic",
+     "git_is_clean": true,
+     "pyproject_version": "3.6.20250219172032",
+     "thds_user": "peter.gaultney",
+     "misc": {}
+ }