thds.mops 3.6.20250219172032 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.mops might be problematic.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/k8s/__init__.py ADDED
@@ -0,0 +1,27 @@
+ """Trilliant Health abstraction around launching K8S Jobs."""
+
+ try:
+     from kubernetes import client as _  # noqa
+ except ModuleNotFoundError as mnf:
+     raise ModuleNotFoundError(
+         "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
+     ) from mnf
+
+ from .container_registry import autocr  # noqa: F401
+ from .launch import K8sJobFailedError, launch, shim  # noqa
+ from .node_selection import (  # noqa
+     NodeNarrowing,
+     ResourceDefinition,
+     require_gpu,
+     tolerates_64cpu,
+     tolerates_gpu,
+     tolerates_spot,
+ )
+
+ try:
+     from . import thds_std  # noqa: F401
+ except ModuleNotFoundError:
+     pass
+
+
+ mops_shell = shim  # deprecated alias
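
The following is an illustrative sketch, not part of the diff: it shows how the names re-exported above (autocr, launch, and the node_narrowing keys used by launch) might be combined. The image tag and node selector values are invented, and autocr requires a container registry URL to be configured.

# Hypothetical usage of the public API re-exported by thds.mops.k8s above.
# The image name and node_selector values are illustrative only.
from thds.mops import k8s

image = k8s.autocr("my-team/my-image:latest")  # prefixes the configured registry URL
k8s.launch(
    image,
    ["python", "-m", "my_package.entrypoint"],
    node_narrowing={"node_selector": {"kubernetes.io/os": "linux"}},
)
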
thds/mops/k8s/_shared.py ADDED
@@ -0,0 +1,3 @@
+ from thds.core.log import getLogger
+
+ logger = getLogger(__name__[: -len("._shared")])
thds/mops/k8s/apply_yaml.py ADDED
@@ -0,0 +1,22 @@
+ import tempfile
+
+ from kubernetes import client, utils
+
+
+ def format_yaml(yaml_template_str: str, **template_values: str) -> str:
+     return yaml_template_str.format(**template_values)
+
+
+ def create_yaml_template(yaml_str: str, **template_values: str) -> None:
+     """Format a YAML template with the given keyword arguments, then apply it to the Kubernetes cluster.
+
+     You must already have set up your SDK config.
+
+     NOTE: This function doesn't actually apply, and can't until the next release of the K8S SDK:
+     https://github.com/kubernetes-client/python/pull/2252
+     """
+     formatted_yaml = format_yaml(yaml_str, **template_values)
+     with tempfile.NamedTemporaryFile("w", prefix="kubectl-yaml") as f:
+         f.write(formatted_yaml)
+         f.flush()
+         utils.create_from_yaml(client.ApiClient(), f.name)
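
A minimal sketch of calling create_yaml_template, assuming the Kubernetes SDK config is already set up as the docstring requires; the Namespace template and its values are invented for illustration.

# Hypothetical call; note the docstring's caveat that this creates rather than applies.
from thds.mops.k8s.apply_yaml import create_yaml_template

NAMESPACE_TEMPLATE = """
apiVersion: v1
kind: Namespace
metadata:
  name: {name}
"""

create_yaml_template(NAMESPACE_TEMPLATE, name="scratch-space")
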
thds/mops/k8s/auth.py ADDED
@@ -0,0 +1,49 @@
+ import typing as ty
+ from threading import RLock
+
+ from cachetools import TTLCache
+ from kubernetes import client, config
+
+ from thds.core import fretry, log, scope
+
+ from .._utils.locked_cache import locked_cached
+
+ logger = log.getLogger(__name__)
+
+
+ def _retry_config(exc: Exception) -> bool:
+     if isinstance(exc, config.ConfigException):
+         logger.debug("Retrying config load...")
+         return True
+     return False
+
+
+ empty_config_retry = fretry.retry_sleep(_retry_config, fretry.expo(retries=3, delay=0.2))
+
+ _AUTH_RLOCK = RLock()
+
+
+ # load_config gets called all over the place and way too often.
+ @locked_cached(TTLCache(1, ttl=120), lock=_AUTH_RLOCK)
+ def load_config() -> None:
+     logger.debug("Loading Kubernetes config...")
+     try:
+         empty_config_retry(config.load_config)()
+     except config.ConfigException:
+         logger.error("Failed to load kube-config")
+
+
+ @scope.bound
+ def upsert_namespace(namespace: str, created_cache: ty.Set[str] = set()) -> None:  # noqa: B006
+     scope.enter(_AUTH_RLOCK)
+     if namespace in created_cache:
+         return
+     logger.debug("Creating namespace if not exists: %s" % namespace)
+     load_config()
+     kubeapi = client.CoreV1Api()
+     ns_obj = client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace))
+     namespaces = set([item.metadata.name for item in kubeapi.list_namespace().items])
+     if namespace not in namespaces:
+         logger.info(f"Creating namespace {namespace}")
+         kubeapi.create_namespace(ns_obj)
+     created_cache.add(namespace)
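
For illustration only, the two helpers above compose as follows; the namespace name is invented.

# Sketch: load_config() is cheap to call repeatedly (cached for ~2 minutes by the
# TTLCache above); upsert_namespace() creates the namespace only if it is missing.
from thds.mops.k8s.auth import load_config, upsert_namespace

load_config()
upsert_namespace("my-scratch-namespace")
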
thds/mops/k8s/config.py ADDED
@@ -0,0 +1,37 @@
+ from datetime import timedelta
+
+ from thds.core import config
+
+ from .namespace import parse_namespace, user_namespace
+
+ k8s_namespace = config.item("mops.k8s.namespace", user_namespace(), parse=parse_namespace)
+ k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_K8S_NAMESPACE")
+ # the above is used to embed the current namespace _inside_ the container as an
+ # environment variable. it will not affect how your namespace is selected in the first
+ # place.
+
+ k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 30 * 60, parse=int)
+ k8s_acr_url = config.item("mops.k8s.acr.url", "")
+ k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
+ k8s_job_cleanup_ttl_seconds_after_completion = config.item(
+     "mops.k8s.job.cleanup_ttl_seconds", int(timedelta(minutes=60).total_seconds()), parse=int
+ )
+ k8s_job_timeout_seconds = config.item(
+     "mops.k8s.job.timeout_seconds", int(timedelta(minutes=3).total_seconds()), parse=int
+ )
+ k8s_monitor_delay = config.item("mops.k8s.monitor.delay_seconds", 5, parse=int)
+ k8s_monitor_max_attempts = config.item("mops.k8s.monitor.max_attempts", 100, parse=int)
+
+ # In the East, we use the newer pod managed identity by default,
+ # which provides access to a metadata endpoint that Azure clients know
+ # how to access automatically.
+ # https://docs.microsoft.com/en-us/azure/aks/use-azure-ad-pod-identity
+ aad_pod_managed_identity = config.item("mops.k8s.azure.aad_pod_managed_identity", "")
+
+ # but there's an even newer, better type of auth called Workload
+ # Identity, which unfortunately requires specific infrastructure
+ # configuration that lives outside this library.
+ # https://azure.github.io/azure-workload-identity/docs/introduction.html
+ namespaces_supporting_workload_identity = config.item(
+     "mops.k8s.azure.namespaces_supporting_workload_identity", ["default"]
+ )
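
Each of these thds.core.config items is a callable that returns its current value; a small sketch of reading them, assuming only what the definitions above show.

# Reading the configuration items defined above.
from thds.mops.k8s import config

namespace = config.k8s_namespace()      # default comes from user_namespace()
retries = config.k8s_job_retry_count()  # defaults to 6
registry = config.k8s_acr_url()         # empty string unless configured
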
thds/mops/k8s/container_registry.py ADDED
@@ -0,0 +1,14 @@
+ from . import config
+
+
+ def autocr(container_image_name: str, cr_url: str = "") -> str:
+     """Prefix the container with the configured container registry URL.
+
+     Idempotent, so it will not apply if called a second time.
+     """
+     cr_url = cr_url or config.k8s_acr_url()
+     assert cr_url, "No container registry URL configured."
+     prefix = cr_url + "/" if cr_url and not cr_url.endswith("/") else cr_url
+     if not container_image_name.startswith(prefix):
+         return prefix + container_image_name
+     return container_image_name
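
A short sketch of the idempotence described in the docstring; the registry URL and image name are invented.

# autocr only prepends the registry prefix when it is not already present.
from thds.mops.k8s.container_registry import autocr

img = autocr("team/app:1.2.3", cr_url="myregistry.azurecr.io")
assert img == "myregistry.azurecr.io/team/app:1.2.3"
assert autocr(img, cr_url="myregistry.azurecr.io") == img  # second call is a no-op
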
thds/mops/k8s/jobs.py ADDED
@@ -0,0 +1,57 @@
+ import typing as ty
+
+ from kubernetes import client
+
+ from ._shared import logger
+ from .retry import k8s_sdk_retry
+ from .watch import WatchingObjectSource
+
+
+ @k8s_sdk_retry()
+ def _get_job(namespace: str, job_name: str) -> ty.Optional[client.models.V1Job]:
+     logger.debug(f"Reading job {job_name}")
+     return client.BatchV1Api().read_namespaced_job(
+         namespace=namespace,
+         name=job_name,
+     )
+
+
+ _JOB_SOURCE = WatchingObjectSource(
+     lambda _, __: client.BatchV1Api().list_namespaced_job,
+     lambda job: job.metadata.name,  # type: ignore
+     _get_job,
+     typename="Job",
+ )
+
+
+ def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
+     return _JOB_SOURCE.get(job_name, namespace=namespace)
+
+
+ # https://github.com/kubernetes/kubernetes/issues/68712#issuecomment-514008330
+ # https://kubernetes.io/docs/concepts/workloads/controllers/job/#terminal-job-conditions
+
+
+ def is_job_succeeded(job: client.models.V1Job) -> bool:
+     if not job.status:
+         return False
+
+     if not job.status.completion_time:
+         return False
+
+     for condition in job.status.conditions or tuple():
+         if condition.type == "Complete" and condition.status == "True":
+             return True
+
+     return False
+
+
+ def is_job_failed(job: client.models.V1Job) -> bool:
+     if not job.status:
+         return False
+
+     for condition in job.status.conditions or tuple():
+         if condition.type == "Failed" and condition.status == "True":
+             return True
+
+     return False
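
As a sketch of how get_job and the two predicates above might be combined into a polling loop; the job name and poll interval are invented, and the real waiting logic lives in wait_job.py (not shown in this hunk).

# Hypothetical polling loop built from the helpers above.
import time

from thds.mops.k8s.jobs import get_job, is_job_failed, is_job_succeeded


def wait_until_terminal(job_name: str, poll_seconds: float = 10.0) -> bool:
    """Return True on success, False on failure."""
    while True:
        job = get_job(job_name)
        if job is not None:
            if is_job_succeeded(job):
                return True
            if is_job_failed(job):
                return False
        time.sleep(poll_seconds)
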
thds/mops/k8s/launch.py ADDED
@@ -0,0 +1,234 @@
+ """Provides an abstraction for launching Docker images on Kubernetes and waiting until they finish."""
+
+ import os
+ import threading
+ import typing as ty
+ import uuid
+
+ from kubernetes import client
+
+ from thds.core import scope
+ from thds.core.log import logger_context
+ from thds.mops.pure.runner.simple_shims import samethread_shim
+
+ from .._utils.colorize import colorized
+ from . import config
+ from ._shared import logger
+ from .auth import load_config, upsert_namespace
+ from .logging import JobLogWatcher
+ from .node_selection import NodeNarrowing, ResourceDefinition
+ from .retry import k8s_sdk_retry
+ from .thds_std import embed_thds_auth
+ from .wait_job import wait_for_job
+
+ LAUNCHED = colorized(fg="white", bg="green")
+ COMPLETE = colorized(fg="white", bg="blue")
+ FAILED = colorized(fg="white", bg="red")
+
+
+ class K8sJobFailedError(Exception):
+     """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+ class Counter:
+     def __init__(self) -> None:
+         self.value = 0
+         self._lock = threading.Lock()
+
+     def inc(self) -> int:
+         with self._lock:
+             self.value += 1
+             return self.value
+
+
+ _LAUNCH_COUNT = Counter()
+ _FINISH_COUNT = Counter()
+ _SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
+
+
+ @scope.bound
+ def launch(
+     container_image: str,
+     args: ty.Sequence[str],
+     *,
+     node_narrowing: ty.Optional[NodeNarrowing] = None,
+     container_name: str = "jobcontainer",
+     env_vars: ty.Optional[ty.Mapping[str, str]] = None,
+     # arguments below are for launching; arguments above are for
+     # building. these should get separated in a future change.
+     name_prefix: str = "",
+     dry_run: bool = False,
+     fire_and_forget: bool = False,
+     suppress_logs: bool = False,
+     transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
+     # this is a default for now. later if we share this code we'll need to have a wrapper interface
+     service_account_name: str = "",
+ ) -> None:
+     """Launch a Kubernetes job.
+
+     Required parameters are the container_image and the arguments to
+     that image, just as if you were running this directly with Docker.
+
+     Unless fire_and_forget=True, will poll until Job completes and
+     will raise K8sJobFailedError if the Job fails. None is returned
+     if the Job succeeds.
+
+     `name_prefix` is an optional parameter for debugging/developer
+     convenience. A generated suffix will be added to it.
+
+     """
+     if not container_image:
+         raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
+     job_num = f"{_LAUNCH_COUNT.inc():0>3}"
+     name = "-".join([name_prefix, str(os.getpid()), job_num, str(uuid.uuid4())[:8]]).lstrip("-")
+     scope.enter(logger_context(job=name))
+     node_narrowing = node_narrowing or dict()
+
+     # TODO move this entire function out to be separately callable
+     @k8s_sdk_retry()
+     def assemble_base_job() -> client.models.V1Job:
+         logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
+         logger.debug("Fire and forget: %s", fire_and_forget)
+         logger.debug("Loading kube configs ...")
+         load_config()
+         logger.debug("Populating job object ...")
+         v1_job_body = client.V1Job(api_version="batch/v1", kind="Job")
+         logger.debug("Setting object meta ...")
+         v1_job_body.metadata = client.V1ObjectMeta(namespace=config.k8s_namespace(), name=name)
+
+         v1_job_body.status = client.V1JobStatus()
+         logger.debug("Creating pod template ...")
+         pod_template = client.V1PodTemplate()
+
+         pod_template.template = client.V1PodTemplateSpec(metadata=client.V1ObjectMeta(labels=dict()))
+         # we make empty labels just in case a later transformer wants to add some.
+
+         logger.debug("Applying environment variables ...")
+         env_list = [
+             client.V1EnvVar(name="MOPS_IMAGE_FULL_TAG", value=container_image),
+             # by setting these, things will be 'reentrant' if it is necessary to launch jobs within this job.
+         ]
+         if env_vars is not None:
+             for env_name, env_value in env_vars.items():
+                 env_list.append(client.V1EnvVar(name=env_name, value=env_value))
+         env_list.append(
+             client.V1EnvVar(name=config.k8s_namespace_env_var_key(), value=config.k8s_namespace())
+         )
+
+         logger.debug("Creating container definition ...")
+         logger.debug("Setting container CPU/RAM requirements ...")
+         v1_container_args = dict(
+             args=args,
+             name=container_name,
+             image=container_image,
+             env=env_list,
+             image_pull_policy="Always",  # default is IfNotPresent, which leads to staleness when reusing a tag.
+             # https://kubernetes.io/docs/concepts/containers/images/#updating-images
+         )
+
+         assert node_narrowing is not None
+         resource_requests: ResourceDefinition = node_narrowing.get("resource_requests", dict())
+         resource_limits: ResourceDefinition = node_narrowing.get("resource_limits", dict())
+         if resource_requests or resource_limits:
+             v1_container_args["resources"] = client.V1ResourceRequirements(
+                 requests=resource_requests,
+                 limits=resource_limits,
+             )
+
+         container = client.V1Container(**v1_container_args)
+         logger.debug("Creating podspec definition ...")
+         pod_template.template.spec = client.V1PodSpec(
+             containers=[container],
+             restart_policy="Never",
+             node_selector=node_narrowing.get("node_selector", dict()),
+             tolerations=node_narrowing.get("tolerations", list()),
+             service_account_name=service_account_name,
+         )
+
+         logger.debug("Creating job definition ...")
+         v1_job_body.spec = client.V1JobSpec(
+             backoff_limit=config.k8s_job_retry_count(),
+             completions=1,
+             ttl_seconds_after_finished=config.k8s_job_cleanup_ttl_seconds_after_completion(),
+             template=pod_template.template,
+         )
+         logger.debug("Finished creating base job definition ...")
+         return v1_job_body
+
+     def job_with_all_transforms() -> client.models.V1Job:
+         return transform_job(assemble_base_job())
+
+     if dry_run:
+         job_with_all_transforms()
+         logger.info("Dry run assembly successful; not launching...")
+         return
+
+     @k8s_sdk_retry()
+     def launch_job() -> client.models.V1Job:
+         with _SIMULTANEOUS_LAUNCHES:
+             upsert_namespace(config.k8s_namespace())
+             # we do the job transform after actually upserting the namespace so that
+             # the transform can use the namespace if necessary.
+             return client.BatchV1Api().create_namespaced_job(
+                 namespace=config.k8s_namespace(), body=job_with_all_transforms()
+             )
+
+     job = launch_job()
+     logger.info(LAUNCHED(f"Job {job_num} launched!") + f" on {container_image}")
+     if not suppress_logs:
+         threading.Thread(  # fire and forget a log watching thread
+             target=JobLogWatcher(job.metadata.name, len(job.spec.template.spec.containers)).start,
+             daemon=True,
+         ).start()
+
+     if not fire_and_forget:
+
+         def counts() -> str:
+             launched = _LAUNCH_COUNT.value
+             return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"
+
+         job_name = job.metadata.name
+         del job  # trying to save memory here while we wait...
+         if not wait_for_job(job_name, short_name=job_num):
+             logger.error(FAILED(f"Job {job_num} Failed! {counts()}"))
+             raise K8sJobFailedError(f"Job {job_name} failed.")
+         logger.info(COMPLETE(f"Job {job_num} Complete! {counts()}"))
+
+
+ def shim(
+     container_image: ty.Union[str, ty.Callable[[], str]],
+     disable_remote: ty.Callable[[], bool] = lambda: False,
+     **outer_kwargs: ty.Any,
+ ) -> ty.Callable[[ty.Sequence[str]], None]:
+     """Return a closure that can launch the given configuration and run a mops pure function.
+
+     Now supports callables that return a container image name; the
+     goal being to allow applications to perform this lazily on the
+     first actual use of the k8s runtime shim. The passed callable will be
+     called each time, so if you want it to be called only once, you'll
+     need to wrap it yourself.
+
+     Supports an optional callable argument `disable_remote` which when evaluated to True
+     causes the mops pure function to be run in a local shell.
+     """
+     assert (
+         "args" not in outer_kwargs
+     ), "Passing 'args' as a keyword argument will cause conflicts with the closure."
+
+     if disable_remote():
+         return samethread_shim
+
+     if isinstance(container_image, str):
+         get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
+     else:
+         get_container_image = container_image
+
+     def launch_container_on_k8s_with_args(args: ty.Sequence[str], **inner_kwargs: ty.Any) -> None:
+         assert "args" not in inner_kwargs
+         launch(
+             get_container_image(),
+             ["python", "-m", "thds.mops.pure.core.entry.main", *args],
+             **{**outer_kwargs, **inner_kwargs},
+         )
+
+     return launch_container_on_k8s_with_args
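
To illustrate the shim closure described in its docstring, a hedged sketch follows; the image-resolving callable is invented, and the extra keyword arguments simply pass through to launch().

# Hypothetical sketch of building a lazy launcher closure.
from thds.mops.k8s.launch import launch, shim


def _resolve_image() -> str:
    return "myregistry.azurecr.io/team/app:1.2.3"  # e.g. resolved or pushed elsewhere


run_on_k8s = shim(_resolve_image, suppress_logs=True)
# Calling run_on_k8s(args) launches `python -m thds.mops.pure.core.entry.main <args>` in-cluster.

# launch() can also be used directly for an arbitrary container:
launch("myregistry.azurecr.io/team/app:1.2.3", ["python", "-c", "print('hello')"])
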
thds/mops/k8s/logging.py ADDED
@@ -0,0 +1,239 @@
+ """Handles things having to do with getting logs out of the Pods of a Job."""
+
+ import enum
+ import random
+ import threading
+ import time
+ import typing as ty
+ from timeit import default_timer
+
+ import cachetools
+ import urllib3.exceptions
+ from kubernetes import client, watch
+
+ from thds import core
+ from thds.core.log import logger_context
+
+ from .._utils.colorize import colorized, make_colorized_out, next_color
+ from .._utils.locked_cache import locked_cached
+ from . import config
+ from ._shared import logger
+ from .jobs import get_job
+ from .retry import k8s_sdk_retry
+
+ NO_K8S_LOGS = core.config.item("mops.no_k8s_logs", parse=core.config.tobool, default=False)
+ # non-empty if you want to completely disable k8s pod logs.
+ K8S_LOG_POD_FRACTION = core.config.item("mops.k8s.log_pod_fraction", parse=float, default=1.0)
+ # fraction of pods to log. 1.0 means all pods.
+
+ BOINK = colorized(fg="white", bg="magenta")
+ # this module has tons of logs. occasionally you want to find a needle
+ # in that haystack when you're debugging something. Wrap the logged
+ # string in this and it'll stand out.
+
+
+ class JobLogWatcher:
+     """Will spawn one or more daemon threads.
+
+     Each pod scraped will get its own randomly-selected ANSI color for
+     logs printed to the terminal.
+
+     When pods enter a failure state, a new check for pods will be
+     launched, in the hopes that the Job is planning to create new Pods
+     to replace them.
+
+     If the Job goes away entirely, this may or may not eventually
+     terminate. Because the threads are daemon threads, this will not
+     affect the logic of your program, but it's possible you may see
+     some spurious logging messages.
+     """
+
+     def __init__(self, job_name: str, num_pods_expected: int = 1) -> None:
+         self.job_name = job_name
+         self.num_pods_expected = num_pods_expected
+         self.pods_being_scraped: ty.Set[str] = set()
+         self.pod_colors: ty.Dict[str, ty.Callable[[str], ty.Any]] = dict()
+         self.job_pods_discovery_lock = threading.Lock()
+
+     @k8s_sdk_retry()
+     @core.scope.bound
+     def start(self, failed_pod_name: str = "") -> None:
+         """Call this one time - it will spawn threads as needed."""
+         if NO_K8S_LOGS():
+             return
+
+         if random.random() > K8S_LOG_POD_FRACTION():
+             logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
+             return
+
+         core.scope.enter(self.job_pods_discovery_lock)
+         # we lock here because some of the threads we spawn may
+         # eventually call this same method, and we only want one
+         # instance of this running at a time.
+         core.scope.enter(logger_context(log=self.job_name))
+         logger.debug("Starting log watcher")
+         if failed_pod_name:
+             logger.info(
+                 BOINK(f"Failed to scrape logs in pod {failed_pod_name}, looking for new pods...")
+             )
+             self.pods_being_scraped.discard(failed_pod_name)
+             # this one can be retried if it's still out there.
+             time.sleep(config.k8s_monitor_delay())
+         for pod in _yield_running_pods_for_job(
+             self.job_name,
+             self.num_pods_expected if not self.pods_being_scraped else 1,
+         ):
+             pod_name = pod.metadata.name
+             if pod_name not in self.pods_being_scraped:
+                 # don't start new threads for pods we've already previously discovered - they have their own thread.
+                 self.pods_being_scraped.add(pod_name)
+                 if pod_name not in self.pod_colors:
+                     self.pod_colors[pod_name] = make_colorized_out(
+                         colorized(fg=next_color()), fmt_str=pod_name + " {}"
+                     )
+                 log_thread = threading.Thread(
+                     target=_scrape_pod_logs,
+                     args=(
+                         self.pod_colors[pod_name],
+                         pod_name,
+                         self.start,
+                     ),
+                     daemon=True,
+                 )
+                 log_thread.start()
+
+
+ # we really don't want many threads calling the K8S API a billion times all at once
+ @locked_cached(cachetools.TTLCache(maxsize=1, ttl=2))
+ def _list_pods_in_our_namespace() -> ty.List[client.models.V1Pod]:
+     return client.CoreV1Api().list_namespaced_pod(namespace=config.k8s_namespace()).items
+
+
+ class K8sPodStatus(enum.Enum):
+     PENDING = "Pending"
+     RUNNING = "Running"
+     SUCCEEDED = "Succeeded"
+     FAILED = "Failed"
+     UNKNOWN = "Unknown"
+
+
+ def _yield_running_pods_for_job(
+     job_name: str, expected_number_of_pods: int = 1
+ ) -> ty.Iterator[client.models.V1Pod]:
+     """TODO: stop polling if the Job cannot be found at all."""
+     attempt = 0
+     yielded = 0
+     logger.debug("Polling for pods created by job: %s", job_name)
+     while attempt < config.k8s_monitor_max_attempts():
+         for pod in _list_pods_in_our_namespace():
+             owner_refs = pod.metadata.owner_references
+             if not owner_refs:
+                 # this is a rare and undocumented case where a pod
+                 # will have owner_references=None if it was manually created.
+                 # since we're looking for pods created by jobs, we can safely skip these.
+                 continue
+
+             if len(owner_refs) > 1:
+                 logger.warning("Found multiple owner references for a pod. Taking first one...")
+             owner_ref = owner_refs[0]
+             if owner_ref.name == job_name:
+                 if pod.status.phase in {
+                     K8sPodStatus.RUNNING.value,
+                     K8sPodStatus.UNKNOWN.value,
+                 }:
+                     logger.debug(f"Found a pod {pod.metadata.name} in phase {pod.status.phase}")
+                     yielded += 1
+                     yield pod
+                     if yielded >= expected_number_of_pods:
+                         logger.debug("Found all expected running pods.")
+                         return
+         if not get_job(job_name):
+             logger.warning("Job not found; not a good sign for pod logs")
+             attempt += 50
+         logger.debug("Didn't find enough pods yet, sleeping for a moment...")
+         time.sleep(config.k8s_monitor_delay())
+         attempt += 1
+
+
+ def _get_pod_phase(pod_name: str) -> str:
+     return (
+         client.CoreV1Api()
+         .read_namespaced_pod(
+             namespace=config.k8s_namespace(),
+             name=pod_name,
+             _request_timeout=(10, config.k8s_job_timeout_seconds()),
+         )
+         .status.phase
+     )
+
+
+ def _await_pod_phases(phases: ty.Set[K8sPodStatus], pod_name: str) -> str:
+     while True:
+         phase = _get_pod_phase(pod_name)
+         if phase in {phase.value for phase in phases}:
+             return phase
+         time.sleep(config.k8s_monitor_delay())
+
+
+ @core.scope.bound
+ def _scrape_pod_logs(
+     out: ty.Callable[[str], ty.Any],
+     pod_name: str,
+     failure_callback: ty.Callable[[str], ty.Any],
+ ) -> None:
+     """Contains its own retry error boundary b/c this is notoriously unreliable."""
+     core.scope.enter(logger_context(log=pod_name))
+
+     last_scraped_at = default_timer()
+     base_kwargs = dict(
+         name=pod_name,
+         namespace=config.k8s_namespace(),
+         _request_timeout=(10, config.k8s_job_timeout_seconds()),
+         # i'm occasionally seeing the `stream()` call below hang
+         # indefinitely if logs don't come back from the pod for a
+         # while. Which is ironic, since most of this code is here to
+         # help us make sure we keep retrying if no logs happen on the
+         # pod for a while, since frequently `stream()` will just end
+         # quietly when that happens. In any case, at this point,
+         # we're better-equipped to handle all kinds of retries, so
+         # using the (connect, read) _request timeout tuple is probably
+         # what we want to try next.
+     )
+
+     def get_retry_kwargs(_: int) -> ty.Tuple[tuple, dict]:
+         return tuple(), dict(base_kwargs, since_seconds=int(default_timer() - last_scraped_at))
+
+     def scrape_logs(*_args: ty.Any, **kwargs: ty.Any) -> None:
+         nonlocal last_scraped_at
+         _await_pod_phases(
+             {K8sPodStatus.RUNNING, K8sPodStatus.SUCCEEDED, K8sPodStatus.FAILED},
+             pod_name,
+         )
+         logger.debug("Watching pod log stream...")
+         while True:
+             for e in watch.Watch().stream(
+                 client.CoreV1Api().read_namespaced_pod_log,
+                 **kwargs,
+             ):
+                 out(e)
+                 last_scraped_at = default_timer()
+             time.sleep(config.k8s_monitor_delay())
+             pod_phase = _get_pod_phase(pod_name)
+             if pod_phase == K8sPodStatus.SUCCEEDED.value:
+                 logger.debug("Done scraping pod logs")
+                 return
+             if pod_phase == K8sPodStatus.FAILED.value:
+                 logger.warning("Pod failed - calling callback")
+                 failure_callback(pod_name)
+                 return
+             logger.debug("Pod is not complete - will retry the log watch")
+
+     def should_retry(ex: Exception) -> bool:
+         return isinstance(ex, urllib3.exceptions.ReadTimeoutError)
+
+     try:
+         k8s_sdk_retry(get_retry_kwargs, should_retry=should_retry)(scrape_logs)(**base_kwargs)
+     except Exception:
+         logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
+         # at least let the caller know something went horribly wrong
+         failure_callback(pod_name)
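
launch() already wires the log watcher up on a daemon thread; as a standalone sketch (job name invented), the same pattern looks like this.

# Mirrors how launch() starts the watcher: one daemon thread per job.
import threading

from thds.mops.k8s.logging import JobLogWatcher

watcher = JobLogWatcher("some-job-name", num_pods_expected=1)
threading.Thread(target=watcher.start, daemon=True).start()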