thds.mops-3.6.20250219172032-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.mops might be problematic.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/k8s/namespace.py
ADDED
@@ -0,0 +1,17 @@
from getpass import getuser


def parse_namespace(input_str: str) -> str:
    # lowercase and replace all non-alphanumeric characters with dashes
    return "".join(c if c.isalnum() else "-" for c in input_str.lower())


def user_namespace() -> str:
    try:
        return getuser()
    except OSError:
        return "CICD-Runner"


def main() -> None:
    print(user_namespace())
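Illustrative note (not part of the wheel): a minimal sketch of how the two helpers above behave, assuming the module is importable as thds.mops.k8s.namespace.

from thds.mops.k8s.namespace import parse_namespace, user_namespace

# Every non-alphanumeric character becomes a dash and the string is lowercased.
assert parse_namespace("Jane.Doe_2024") == "jane-doe-2024"

# Falls back to "CICD-Runner" when getuser() raises OSError (e.g. no login entry).
print(user_namespace())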
thds/mops/k8s/node_selection.py
ADDED
@@ -0,0 +1,58 @@
import typing as ty

from kubernetes import client
from typing_extensions import TypedDict


class ResourceDefinition(TypedDict, total=False):
    """
    This works for both limits and requests.

    https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
    """

    memory: str
    """E.g., 10G"""
    cpu: str
    """E.g., 4.5, or 4500m (millicores)"""


class NodeNarrowing(TypedDict, total=False):
    """This is a more transparent interface for selecting nodes that your job can run on.

    You don't have to provide each key, but any key/value you pair you provide must be the proper type.
    """

    resource_requests: ResourceDefinition
    resource_limits: ResourceDefinition
    # https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
    node_selector: ty.Mapping[str, str]
    # https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/
    tolerations: ty.Sequence[client.V1Toleration]
    # https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/


def tolerates_spot() -> client.V1Toleration:
    """Return our custom spot instance toleration configuration."""
    return client.V1Toleration(
        key="kubernetes.azure.com/scalesetpriority", value="spot", effect="NoSchedule"
    )


def tolerates_gpu() -> client.V1Toleration:
    """Apply this toleration to enable use of GPUs."""
    return client.V1Toleration(key="dedicated", value="gpu", effect="NoSchedule")


def tolerates_64cpu() -> client.V1Toleration:
    """These node pools often do not scale up well or quickly, so by
    default they're disabled. If that changes in the future, or if you
    are requesting more than 32 CPUs for your Pod, you should apply
    this toleration.
    """
    return client.V1Toleration(key="dedicated", value="64cpu", effect="NoSchedule")


def require_gpu() -> NodeNarrowing:
    """Merge this with any additional NodeNarrowing (e.g. resource_requests) to run on GPUs."""
    return dict(node_selector={"instance-type": "gpu"}, tolerations=[tolerates_gpu()])
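Illustrative sketch (not part of the wheel) of composing the TypedDicts above for a GPU workload; the import path thds.mops.k8s.node_selection is assumed.

from thds.mops.k8s import node_selection as ns

# require_gpu() supplies node_selector and the GPU toleration; resource
# requests/limits can be merged in because both TypedDicts are total=False.
narrowing: ns.NodeNarrowing = {
    **ns.require_gpu(),
    "resource_requests": {"memory": "10G", "cpu": "4500m"},
    "resource_limits": {"memory": "16G"},
}
# A Pod requesting more than 32 CPUs would also add: "tolerations": [ns.tolerates_64cpu()]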
thds/mops/k8s/retry.py
ADDED
@@ -0,0 +1,75 @@
import time
import typing as ty
from functools import wraps

import urllib3.exceptions
from kubernetes import client

from . import auth, config
from ._shared import logger

F = ty.TypeVar("F", bound=ty.Callable)


# The first thing you should know about the Kubernetes SDK is that it
# is riddled with race conditions and timeouts and all kinds of
# horrible gremlins. This first function/decorator is an _ongoing_
# attempt to deal with the fallout from that. Hopefully from now on
# we'll be able to consolidate/maintain all of that logic in a single
# place:


_URLLIB_COMMON = (
    urllib3.exceptions.ProtocolError,
    urllib3.exceptions.MaxRetryError,
)


def k8s_sdk_retry(
    get_retry_args_kwargs: ty.Optional[ty.Callable[[int], ty.Tuple[tuple, dict]]] = None,
    should_retry: ty.Callable[[Exception], bool] = lambda _: False,
    max_retries: int = 20,
) -> ty.Callable[[F], F]:
    """Handle the common cases - lets you decide about uncommon ones."""

    def decorator(f: F) -> F:
        @wraps(f)
        def wrapper(*args, **kwargs):  # type: ignore
            i = 0

            def _raise_if_max(i: int) -> None:
                if i >= max_retries:
                    logger.warning(f"Failing after {i} tries")
                    raise

            while True:
                try:
                    return f(*args, **kwargs)
                except Exception as ex:
                    # some shared behavior for all exceptions means we want a single except block
                    _raise_if_max(i)
                    if isinstance(ex, _URLLIB_COMMON):
                        # these are extremely common and should always be retried
                        logger.debug(
                            "Encountered probable connection timeout - retrying",
                            exc=str(ex),
                        )
                    # necessary b/c https://github.com/kubernetes-client/python/issues/1234
                    elif isinstance(ex, client.exceptions.ApiException) and ex.reason == "Unauthorized":
                        # this one is fairly common - who knows why their SDK can't handle this automatically.
                        #
                        # https://github.com/kubernetes-client/python/blob/release-18.0/kubernetes/client/exceptions.py?ts=4#L84
                        logger.info(f"{ex} - retrying after auth failure")
                        auth.load_config()
                    elif not should_retry(ex):
                        raise

                    i += 1
                    logger.info(f"Will retry after K8S error {str(ex)}; attempt {i}")
                    time.sleep(config.k8s_monitor_delay())
                    if get_retry_args_kwargs:
                        args, kwargs = get_retry_args_kwargs(i)

        return ty.cast(F, wrapper)

    return decorator
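Illustrative sketch (not from the package) of applying k8s_sdk_retry to a raw kubernetes-client call; the wrapped function and the 404 policy are assumptions.

from kubernetes import client

from thds.mops.k8s.retry import k8s_sdk_retry


@k8s_sdk_retry(
    # also retry 404s, e.g. while a just-created Job becomes visible to the API
    should_retry=lambda ex: isinstance(ex, client.exceptions.ApiException) and ex.status == 404,
    max_retries=5,
)
def read_job(name: str, namespace: str) -> client.V1Job:
    # urllib3 ProtocolError/MaxRetryError and 401 Unauthorized are retried by default
    return client.BatchV1Api().read_namespaced_job(name, namespace)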
thds/mops/k8s/too_old_resource_version.py
ADDED
@@ -0,0 +1,42 @@
import re
import typing as ty

from kubernetes import client

_TOO_OLD_RESOURCE_VERSION = re.compile(
    r"Expired: too old resource version: (?P<old>\w+) \((?P<cur>\w+)\)"
)
# holy bananas I cannot believe how much K8s' SDK sucks. this is a
# standard exception with an known retry semantic that their watchers
# are apparently unable to handle on their own - I'm staring at their
# code right now and they don't even attempt to handle this.


class TooOldResourceVersion(ty.NamedTuple):
    old: str
    cur: str
    spread: str  # only if the above are actually numbers


def parse_too_old_resource_version(
    exc: Exception,
) -> ty.Optional[TooOldResourceVersion]:
    if not isinstance(exc, client.exceptions.ApiException):
        return None
    m = _TOO_OLD_RESOURCE_VERSION.match(exc.reason)
    if m:
        # this is a completely bonkers thing to have to do
        # ourselves, but here we are. I can't find any
        # documentation on why their SDK doesn't handle this
        # themselves, and I don't even know why we haven't run
        # into it before. Regardless, apparently we have to
        # special-case a retry when there are enough old
        # events on the server.
        resource_version = m.group("cur")
        old = m.group("old")
        try:
            spread = str(int(resource_version) - int(m.group("old")))
        except ValueError:
            spread = "unknown"
        return TooOldResourceVersion(old, resource_version, spread)
    return None
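A small illustrative sketch (not part of the diff) of what the parser above recovers from the 410 "Gone" reason text:

from kubernetes import client

from thds.mops.k8s.too_old_resource_version import parse_too_old_resource_version

exc = client.exceptions.ApiException(status=410, reason="Expired: too old resource version: 4444 (5555)")
too_old = parse_too_old_resource_version(exc)
# TooOldResourceVersion(old='4444', cur='5555', spread='1111') - a watcher can
# restart its list/watch from the current resource version instead of giving up.
assert too_old is not None and too_old.cur == "5555"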
thds/mops/k8s/tools/krsync.py
ADDED
@@ -0,0 +1,50 @@
"""Uses rsync to copy files to/from a Kubernetes pod.

The remote pod must have rsync installed.

CLI wrapper with help text for the krsync.sh script, which is usable on its own.
Thank you, Karl Bunch, who provided the world with this elegant implementation.
https://serverfault.com/questions/741670/rsync-files-to-a-kubernetes-pod?newreg=22b5f958cdce4e6a9a1a7ce0fc88b546

When addressing the remote, you must specify a pod name, and
optionally a namespace preceded by '@', and then a colon, followed by
the path on the remote.

Examples:

krsync ~/my/local.txt pod1234:/root/local_2.txt
krsync ~/my/local pod1234:~/local_dir -rav # recursively copies entire directory
krsync pod1234@my-namespace:/root/my.parquet your.parquet
krsync prod-udla-0@unified-directory:/var/data/labels.db ./labels.db --container prod-udla-db
"""

import argparse
import importlib
import os
import subprocess
import sys

with importlib.resources.path(__package__, "krsync.sh") as p:
    krsync = str(p.resolve())


def main() -> int:
    remote_path = "pod-name[@namespace]:/remote/path"
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("src", help=f"Either a local path or {remote_path}")
    parser.add_argument("dest", help=f"Either a local path or {remote_path}")
    parser.add_argument(
        "--container",
        "-c",
        help="Container name - if not provided, will use the default container",
        default="",
    )
    args, rsync_args = parser.parse_known_args()
    return subprocess.run(
        ["/bin/bash", krsync, args.src, args.dest, *rsync_args],
        env=dict(os.environ, KRSYNC_CONTAINER=args.container or ""),
    ).returncode


if __name__ == "__main__":
    sys.exit(main())
thds/mops/k8s/tools/krsync.sh
ADDED
@@ -0,0 +1,22 @@
#!/bin/bash
# thank you, Karl Bunch:
# https://serverfault.com/questions/741670/rsync-files-to-a-kubernetes-pod?newreg=22b5f958cdce4e6a9a1a7ce0fc88b546
if [ -z "$KRSYNC_STARTED" ]; then
    export KRSYNC_STARTED=true
    exec rsync --blocking-io --rsh "$0" $@
fi

# Running as --rsh
namespace=''
pod=$1
shift

# If use uses pod@namespace rsync passes as: {us} -l pod namespace ...
if [ "X$pod" = "X-l" ]; then
    pod=$1
    shift
    namespace="-n $1"
    shift
fi

exec kubectl $namespace exec -i $pod --container "${KRSYNC_CONTAINER}" -- "$@"
thds/mops/k8s/wait_job.py
ADDED
@@ -0,0 +1,72 @@
"""Wait for a Job to finish."""

import time
from datetime import timedelta
from timeit import default_timer

from thds.core import scope
from thds.core.log import logger_context

from .._utils.colorize import colorized
from . import config
from ._shared import logger
from .jobs import get_job, is_job_failed, is_job_succeeded

UNUSUAL = colorized(fg="white", bg="yellow")


def _max_no_job_wait() -> timedelta:
    return timedelta(seconds=config.k8s_monitor_max_attempts() * config.k8s_monitor_delay())


@scope.bound
def wait_for_job(job_name: str, short_name: str = "") -> bool:
    """Return True if Job completed, False if it failed.

    May raise an exception if something truly unusual happened.

    A _lot_ has gone in to trying to make this robust against common
    failure patterns. My apologies for the resulting shape of the
    code. :/
    """
    scope.enter(logger_context(job=job_name))
    log_name = f"Job {short_name}" if short_name else "Job"
    logger.debug(f"Waiting for {log_name} to finish...")
    start_time = default_timer()

    def _wait_for_job() -> bool:
        nonlocal start_time
        found_at_least_once = False
        while True:
            time.sleep(0.5 if found_at_least_once else 10.0)
            job = get_job(job_name)
            if not job:
                if found_at_least_once:
                    logger.warning(UNUSUAL(f"Known job {job_name} no longer exists - assuming success!"))
                    return True
                max_wait_seconds = _max_no_job_wait().total_seconds()
                if default_timer() - start_time > max_wait_seconds:
                    logger.error(
                        UNUSUAL(
                            f"Job {job_name} has not been seen for {max_wait_seconds:.1f} seconds"
                            " - assuming failure!"
                        )
                    )
                    return False

                logger.debug("%s not found... retrying.", job_name)
                continue

            if is_job_succeeded(job):
                return True

            if is_job_failed(job):
                logger.error(
                    UNUSUAL(f"A Kubernetes Job is reporting an actual failed status: {job_name}")
                )
                return False

            found_at_least_once = True
            start_time = default_timer()  # restart timer since the job has been found.

    return _wait_for_job()
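Illustrative caller (not part of the wheel); the Job name is hypothetical and the import path is assumed.

from thds.mops.k8s.wait_job import wait_for_job

# Blocks until the Job succeeds, fails, or has been missing for roughly
# k8s_monitor_max_attempts() * k8s_monitor_delay() seconds.
if not wait_for_job("mops-job-abc123", short_name="abc123"):
    raise RuntimeError("Kubernetes Job abc123 did not complete successfully")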
thds/mops/k8s/warn_image_backoff.py
ADDED
@@ -0,0 +1,63 @@
import threading
import typing as ty
from datetime import datetime, timezone

from kubernetes import client

from thds.core.log import getLogger

from .._utils.colorize import colorized
from . import config
from .watch import K8sList, OneShotLimiter, yield_objects_from_list

logger = getLogger(__name__)

OnCoreEvent = ty.Callable[[client.CoreV1Event], ty.Any]

YIKES = colorized(fg="black", bg="yellow")


def _emit_basic(event: client.CoreV1Event) -> None:
    logger.error(YIKES(event.message))


def _warn_image_pull_backoff(namespace: str, on_backoff: OnCoreEvent = _emit_basic) -> None:
    """Log scary errors when ImagePullBackoff is observed."""
    start_dt = datetime.now(tz=timezone.utc)
    for _ns, obj in yield_objects_from_list(
        namespace,
        lambda _, __: ty.cast(
            # do NOT use client.EventsV1Api here - for some reason
            # it does not return the right 'types' of events.
            # why? who the heck knows? How much time did I spend
            # trying to figure this out? Also who knows.
            K8sList[client.CoreV1Event],
            client.CoreV1Api().list_namespaced_event,
        ),
        object_type_hint="backoff-warnings",
        field_selector="reason=BackOff",
    ):
        if None is obj.last_timestamp or obj.last_timestamp > start_dt:
            on_backoff(obj)


_WARN_IMAGE_PULL_BACKOFF = OneShotLimiter()


def start_warn_image_pull_backoff_thread(
    namespace: str = "", on_backoff: ty.Optional[OnCoreEvent] = None
) -> None:
    """Limit 1 thread per namespace per application.

    You can pass an additional message context
    """
    namespace = namespace or config.k8s_namespace()

    _WARN_IMAGE_PULL_BACKOFF(
        namespace,
        lambda ns: threading.Thread(
            target=_warn_image_pull_backoff,
            args=(namespace, on_backoff or _emit_basic),
            daemon=True,
        ).start(),
    )
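Illustrative sketch (not from the package): starting the backoff watcher with a custom callback; the callback and namespace are assumptions.

from kubernetes import client

from thds.mops.k8s.warn_image_backoff import start_warn_image_pull_backoff_thread


def notify(event: client.CoreV1Event) -> None:
    # forward to alerting instead of (or in addition to) logging
    print(f"ImagePullBackOff in {event.metadata.namespace}: {event.message}")


# Idempotent per namespace: the OneShotLimiter starts at most one daemon
# thread watching BackOff events for "my-namespace".
start_warn_image_pull_backoff_thread(namespace="my-namespace", on_backoff=notify)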
thds/mops/k8s/watch.py
ADDED
@@ -0,0 +1,266 @@
"""K8s SDK watching is very unreliable for lots of reasons.

This is a general-purpose fix for using watchers in a thread reliably.
"""

import threading
import time
import typing as ty

import urllib3
from kubernetes import client
from kubernetes import watch as k8s_watch

from thds.core import scope
from thds.core.log import getLogger, logger_context

from .._utils.colorize import colorized
from . import config
from .auth import load_config
from .too_old_resource_version import parse_too_old_resource_version

logger = getLogger(__name__)

T = ty.TypeVar("T")


class V1List(ty.Protocol[T]):
    api_version: str
    items: ty.List[T]
    kind: str
    metadata: client.models.V1ListMeta


class K8sList(ty.Protocol[T]):
    def __call__(self, *args: ty.Any, namespace: str, **kwargs: ty.Any) -> V1List[T]:
        ...


# If this does not return a K8sList API method, the loop will exit
GetListMethod = ty.Callable[[str, ty.Optional[Exception]], ty.Optional[K8sList[T]]]
# if this returns True, the loop will exit.
OnEvent = ty.Callable[[str, T], ty.Optional[bool]]


def yield_objects_from_list(
    namespace: str,
    get_list_method: GetListMethod[T],
    server_timeout: int = 10,
    object_type_hint: str = "items",
    init: ty.Optional[ty.Callable[[], None]] = None,
    **kwargs: ty.Any,
) -> ty.Iterator[ty.Tuple[str, T]]:
    ex = None
    if init:
        init()
    while True:
        try:
            load_config()
            list_method = get_list_method(namespace, ex)
            if not list_method:
                logger.debug(f"No longer watching {object_type_hint} events in namespace: {namespace}")
                break
            initial_list = list_method(namespace=namespace)
            logger.debug(
                f"Listed {len(initial_list.items)} {object_type_hint} in namespace: {namespace}"
            )
            for object in initial_list.items:
                yield namespace, object

            if initial_list.metadata._continue:
                logger.warning(
                    f"We did not fetch the whole list of {object_type_hint} the first time..."
                )
            for evt in k8s_watch.Watch().stream(
                list_method,
                namespace=namespace,
                resource_version=initial_list.metadata.resource_version,
                **kwargs,
                _request_timeout=(server_timeout, config.k8s_job_timeout_seconds()),
            ):
                object = evt.get("object")
                if object:
                    yield namespace, object
            # once we've received events, let the resource version
            # be managed automatically if possible.
        except urllib3.exceptions.ProtocolError:
            ex = None
        except urllib3.exceptions.ReadTimeoutError:
            ex = None
        except Exception as e:
            too_old = parse_too_old_resource_version(e)
            if too_old:
                logger.debug(f"Immediately retrying {too_old}")
            else:
                logger.exception(f"Unexpected exception while listing {object_type_hint}")
            ex = e


def callback_events(on_event: OnEvent[T], event_yielder: ty.Iterable[ty.Tuple[str, T]]) -> None:
    """Suitable for use with a daemon thread."""
    for namespace, event in event_yielder:
        should_exit = on_event(namespace, event)
        if should_exit:
            break


def _make_name(namespace: str, name: str) -> str:
    return f"{namespace}/{name}"


def _default_get_name(obj: ty.Any) -> str:
    return obj.metadata.name


def _default_get_namespace(obj: ty.Any) -> str:
    return obj.metadata.namespace


STARTING = colorized(fg="white", bg="orange")


class OneShotLimiter:
    """Do an action once per provided name. Does not wait for it to complete."""

    def __init__(self) -> None:
        self._lock = threading.RLock()
        self._names: ty.Set[str] = set()

    def __call__(self, name: str, shoot: ty.Callable[[str], ty.Any]) -> None:
        """Shoot if the name has not already been shot."""
        if name in self._names:
            return
        with self._lock:
            if name in self._names:
                return
            shoot(name)
            self._names.add(name)


def is_stale(api_last_update_time: float, obj_last_seen_time: float) -> bool:
    now = time.monotonic()
    allowed_stale_seconds = config.k8s_watch_object_stale_seconds()
    if (time_since_api_update := now - api_last_update_time) > allowed_stale_seconds:  # noqa: F841
        # we haven't heard anything from the API in a while; probably
        # the API is down. Ignore object staleness to avoid false positives.
        return False

    if not obj_last_seen_time:
        return False  # false positives aren't worth it

    return (time_since_obj_update := now - obj_last_seen_time) > allowed_stale_seconds  # noqa: F841


class WatchingObjectSource(ty.Generic[T]):
    """Efficiently 'get' objects by reliably watching for changes to all such objects in a given namespace.

    This is network-efficient for observing many different objects,
    but not memory efficient if you really only need to fetch details
    for a few objects.
    """

    def __init__(
        self,
        get_list_method: GetListMethod[T],
        get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
            ty.Callable[[T], str], _default_get_name
        ),
        backup_fetch: ty.Optional[ty.Callable[[str, str], T]] = None,
        typename: str = "object",
        starting: ty.Callable[[str], str] = STARTING,
    ) -> None:
        self.get_list_method = get_list_method
        self.get_name = get_name
        self.backup_fetch = backup_fetch
        self.typename = typename
        self._objs_by_name: ty.Dict[str, T] = dict()
        # ^ is a possibly big/expensive local cache of the most recent
        # state for all of the event type in the namespace. Don't use
        # this class if you can't afford the memory overhead of
        # observing everything in your namespace and keeping the last
        # known copy of everything forever.
        self._last_seen_time_by_name: ty.Dict[str, float] = dict()
        self._last_api_update_time = 0.0
        self._limiter = OneShotLimiter()

    def _start_thread(self, namespace: str) -> None:
        threading.Thread(
            target=callback_events,
            args=(
                self._add_object,
                yield_objects_from_list(
                    namespace,
                    self._get_list_method_on_restart,
                    object_type_hint=self.typename + "s",
                    init=lambda: logger.info(STARTING(f"Watching {self.typename}s in {namespace}")),
                ),
            ),
            daemon=True,
        ).start()

    def _add_object(self, namespace: str, obj: T) -> None:
        """This is where we receive updates from the k8s API."""
        self._last_api_update_time = time.monotonic()

        if not obj:
            logger.warning(f"Received null/empty {self.typename}")
            return

        name = _make_name(namespace, self.get_name(obj))
        logger.debug(f"{self.typename} {name} updated")
        self._last_seen_time_by_name[name] = time.monotonic()
        self._objs_by_name[name] = obj

    def _get_list_method_on_restart(
        self, namespace: str, exc: ty.Optional[Exception]
    ) -> ty.Optional[K8sList[T]]:
        suffix = ""
        if exc:
            too_old = parse_too_old_resource_version(exc)
            if not too_old:
                logger.exception(f"Not fatal, but sleeping before we retry {self.typename} scraping...")
                time.sleep(config.k8s_monitor_delay())
            suffix = f" after {type(exc).__name__}: {exc}"
        logger.info(f"Watching {self.typename}s in namespace: {namespace}{suffix}")
        return self.get_list_method(namespace, exc)

    def _is_stale(self, name: str) -> bool:
        return is_stale(self._last_api_update_time, self._last_seen_time_by_name.get(name) or 0)

    @scope.bound
    def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
        namespace = namespace or config.k8s_namespace()
        name = _make_name(namespace, obj_name)
        scope.enter(logger_context(name=obj_name, namespace=namespace))

        # first try is looking in our local cache
        if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
            return obj

        # second try is making sure the namespace watcher is running, sleeping, and then looking in the cache again.
        # This is much more efficient than a manual fetch.
        self._limiter(namespace, self._start_thread)
        time.sleep(config.k8s_monitor_delay())
        if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
            return obj

        # if that doesn't work, try a manual fetch.
        if self.backup_fetch:
            logger.warning(f"Manually fetching {self.typename}...")
            # doing a lot of manual fetches may indicate that the k8s API is having trouble keeping up...
            try:
                if obj := self.backup_fetch(namespace, obj_name):
                    self._add_object(namespace, obj)  # updates last seen, too
                    return obj

            except Exception:
                logger.exception(f"Unexpected error during manual fetch of {self.typename}.")

        if self._is_stale(name):
            logger.warning(
                f"Could not refresh {name}, and our record of it is stale - dropping stale object!"
            )
            self._objs_by_name.pop(name, None)
            self._last_seen_time_by_name.pop(name, None)

        return None
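To close out the hunk above, an illustrative sketch (assumed, not taken from the diff) of a WatchingObjectSource wired up for Jobs: a list method per namespace plus a direct read as the backup fetch.

import typing as ty

from kubernetes import client

from thds.mops.k8s.watch import K8sList, WatchingObjectSource


def _job_list_method(namespace: str, exc: ty.Optional[Exception]) -> ty.Optional[K8sList[client.V1Job]]:
    # Returning None would stop the watch loop; we always keep watching.
    return ty.cast(K8sList[client.V1Job], client.BatchV1Api().list_namespaced_job)


job_source: WatchingObjectSource[client.V1Job] = WatchingObjectSource(
    _job_list_method,
    backup_fetch=lambda namespace, name: client.BatchV1Api().read_namespaced_job(name, namespace),
    typename="Job",
)

# The first get() starts a daemon watcher thread for the namespace; later calls
# are served from the in-memory cache unless the cached entry has gone stale.
job = job_source.get("mops-job-abc123", namespace="my-namespace")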