evakit 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evakit-0.0.0/LICENSE +9 -0
- evakit-0.0.0/PKG-INFO +61 -0
- evakit-0.0.0/README.md +39 -0
- evakit-0.0.0/evakit/__init__.py +0 -0
- evakit-0.0.0/evakit/__version__.py +1 -0
- evakit-0.0.0/evakit/args_base.py +190 -0
- evakit-0.0.0/evakit/cronjob.py +52 -0
- evakit-0.0.0/evakit/launcher_base.py +286 -0
- evakit-0.0.0/evakit/logging_utils.py +60 -0
- evakit-0.0.0/evakit/python_tricks.py +48 -0
- evakit-0.0.0/evakit/singleton.py +473 -0
- evakit-0.0.0/evakit.egg-info/PKG-INFO +61 -0
- evakit-0.0.0/evakit.egg-info/SOURCES.txt +20 -0
- evakit-0.0.0/evakit.egg-info/dependency_links.txt +1 -0
- evakit-0.0.0/evakit.egg-info/requires.txt +3 -0
- evakit-0.0.0/evakit.egg-info/top_level.txt +1 -0
- evakit-0.0.0/pyproject.toml +82 -0
- evakit-0.0.0/setup.cfg +4 -0
- evakit-0.0.0/tests/test_args_base.py +214 -0
- evakit-0.0.0/tests/test_cronjob.py +83 -0
- evakit-0.0.0/tests/test_launcher_base.py +156 -0
- evakit-0.0.0/tests/test_python_tricks.py +102 -0
evakit-0.0.0/LICENSE
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
|
|
2
|
+
The MIT License (MIT)
|
|
3
|
+
Copyright (c) 2026, garywei944
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
evakit-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evakit
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: A collection of utilities for the eva project.
|
|
5
|
+
Author-email: Gary Wei <garywei944@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Natural Language :: English
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: typed-argument-parser
|
|
19
|
+
Requires-Dist: psutil
|
|
20
|
+
Requires-Dist: typing-extensions
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# evakit
|
|
24
|
+
|
|
25
|
+
## Quick Access
|
|
26
|
+
|
|
27
|
+
## Dataset
|
|
28
|
+
|
|
29
|
+
## Notebooks
|
|
30
|
+
|
|
31
|
+
- Notebooks for experiments are located under [`notebooks`](notebooks).
|
|
32
|
+
- Notebooks for visualization and presentation are located
|
|
33
|
+
under [`references`](references).
|
|
34
|
+
|
|
35
|
+
## Conda / Mamba Environment Setup
|
|
36
|
+
|
|
37
|
+
```shell
|
|
38
|
+
# To install the environment for the first time
|
|
39
|
+
conda env create -f environment.yml
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
To update the environment after updating environment.yml
|
|
43
|
+
|
|
44
|
+
**_BEST PRACTICE, RECOMMENDED_** when updating a conda environment
|
|
45
|
+
|
|
46
|
+
```shell
|
|
47
|
+
conda env update -f environment.yml --prune
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Sandbox
|
|
51
|
+
|
|
52
|
+
The `sandbox` directory is a practice in teamwork collaboration.
Everyone who works on the project should have a sub-folder named after their
ID under it, and any temporary scripts, docs, or data should be placed
there.
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
Project based on
|
|
60
|
+
the [cookiecutter machine learning template](https://github.com/garywei944/cookiecutter-machine-learning)
|
|
61
|
+
.
|
evakit-0.0.0/README.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# evakit
|
|
2
|
+
|
|
3
|
+
## Quick Access
|
|
4
|
+
|
|
5
|
+
## Dataset
|
|
6
|
+
|
|
7
|
+
## Notebooks
|
|
8
|
+
|
|
9
|
+
- Notebooks for experiments are located under [`notebooks`](notebooks).
|
|
10
|
+
- Notebooks for visualization and presentation are located
|
|
11
|
+
under [`references`](references).
|
|
12
|
+
|
|
13
|
+
## Conda / Mamba Environment Setup
|
|
14
|
+
|
|
15
|
+
```shell
|
|
16
|
+
# To install the environment for the first time
|
|
17
|
+
conda env create -f environment.yml
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
To update the environment after updating environment.yml
|
|
21
|
+
|
|
22
|
+
**_BEST PRACTICE, RECOMMENDED_** when updating a conda environment
|
|
23
|
+
|
|
24
|
+
```shell
|
|
25
|
+
conda env update -f environment.yml --prune
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Sandbox
|
|
29
|
+
|
|
30
|
+
The `sandbox` directory is a practice in teamwork collaboration.
Everyone who works on the project should have a sub-folder named after their
ID under it, and any temporary scripts, docs, or data should be placed
there.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
Project based on
|
|
38
|
+
the [cookiecutter machine learning template](https://github.com/garywei944/cookiecutter-machine-learning)
|
|
39
|
+
.
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# NOTE(review): distribution metadata (PKG-INFO) reports Version: 0.0.0 —
# confirm which version number is authoritative.
__version__ = "0.2.0"
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from abc import ABC
|
|
5
|
+
from argparse import ArgumentParser, ArgumentTypeError, Namespace
|
|
6
|
+
from typing import final, get_args, get_origin
|
|
7
|
+
|
|
8
|
+
from tap import Tap
|
|
9
|
+
from typing_extensions import override
|
|
10
|
+
|
|
11
|
+
from evakit.python_tricks import freeze_dataclass
|
|
12
|
+
from evakit.singleton import Singleton
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"ArgsBase",
|
|
16
|
+
"str2bool",
|
|
17
|
+
"add_bool_arg",
|
|
18
|
+
"move_arg_from_to",
|
|
19
|
+
"env_to_bool",
|
|
20
|
+
"arg_env_consistent_bool",
|
|
21
|
+
"csv",
|
|
22
|
+
"tuple_parser",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ArgsBase(Tap, Singleton, ABC):
    """Base class for argument parsing using Tap.

    Supports
    - Type hinting
    - Automatic argument parsing
    - Singleton pattern with Dependency Injection
    """

    @final
    def __init__(self, args: list[str] | None = None, *, frozen: bool = True) -> None:
        # allow_abbrev=False: option prefixes are never accepted as abbreviations.
        # explicit_bool=False keeps Tap's implicit boolean-flag behavior.
        super().__init__(explicit_bool=False, allow_abbrev=False)

        # known_only=True: unrecognized CLI flags are ignored instead of erroring.
        self.parse_args(args=args, known_only=True)

        if frozen:
            # Freeze the class so parsed arguments cannot be mutated afterwards.
            freeze_dataclass(self.__class__)

    def _add_args(self) -> None:
        """Add arguments to the parser.

        Equivalent to self.configure(), only keep for backward compatibility.
        """

    @final
    @override
    def configure(self) -> None:
        """Register CSV-style parsers for bool / list / set / tuple annotations.

        The Tap accepts `--ports p1 p2 p3` as a list input, which is different from
        `--ports=p1,p2,p3`. To keep backward compatibility, we override them here.
        """
        # self._annotations is a dict of arg name to type hint Types.GenericAlias
        for arg_name, type_hint in self._annotations.items():

            arg_flag, default = f"--{arg_name}", getattr(self, arg_name)
            type_origin, type_args = get_origin(type_hint), get_args(type_hint)

            # 1. handler bool to add_bool_arg
            if type_hint is bool:
                add_bool_arg(self, arg_flag, default=getattr(self, arg_name))
            # 2. handle list or set
            elif type_origin in {list, set}:
                elem_type = type_args[0]
                if elem_type is int:
                    self.add_argument(arg_flag, type=functools.partial(csv, int, type_origin))
                elif elem_type is float:
                    self.add_argument(arg_flag, type=functools.partial(csv, float, type_origin))
                elif elem_type is bool:
                    self.add_argument(arg_flag, type=functools.partial(csv, str2bool, type_origin))
                elif elem_type is str:
                    self.add_argument(arg_flag, type=functools.partial(csv, str, type_origin))
                # NOTE(review): other element types fall through with no override
                # here (presumably handled by Tap's defaults) — confirm intended.
            # 3. handle tuple
            elif type_origin is tuple:

                # homogeneous tuple: tuple[int, ...]
                if len(type_args) == 2 and type_args[1] is ...:
                    elem_type = type_args[0]
                    self.add_argument(
                        arg_flag,
                        type=functools.partial(csv, elem_type, tuple),
                        default=default,
                    )
                else:
                    # heterogeneous tuple, e.g. tuple[int, str]: element count and
                    # per-position types are enforced by tuple_parser.
                    self.add_argument(
                        arg_flag,
                        type=functools.partial(tuple_parser, type_args),
                        default=default,
                    )

        self._add_args()

    @final
    @override
    def process_args(self) -> None:
        # Fixed post-parse pipeline; subclasses customize via the three hooks below.
        self._merge_args()
        self._process_args()
        self._check_args()

    def _merge_args(self) -> None:
        """Merge some arguments for compatibility."""

    def _process_args(self) -> None:
        """Process the arguments after parsing."""

    def _check_args(self) -> None:
        """Check the arguments for validity."""
|
|
114
|
+
|
|
115
|
+
def str2bool(v: str) -> bool:
    """Parse a human-friendly boolean string.

    Accepts yes/no, true/false, t/f, y/n, 1/0 in any case; anything else
    raises ArgumentTypeError so argparse reports a clean usage error.
    """
    normalized = v.lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise ArgumentTypeError("Boolean value expected.")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def add_bool_arg(parser: ArgumentParser, *args, default=False, help=None, **kwargs):
    """Register a boolean flag that also accepts an explicit value.

    `--flag` alone parses as True (nargs="?" with const=True), while
    `--flag no`, `--flag true`, etc. are interpreted by str2bool.
    """
    # https://stackoverflow.com/a/43357954

    parser.add_argument(
        *args,
        type=str2bool,
        metavar="BOOLEAN",
        nargs="?",
        const=True,
        default=default,
        help=help,
        **kwargs,
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def move_arg_from_to(
    args: Namespace | ArgsBase,
    old_arg: str,
    new_arg: str,
    final_value,
    print_deprecated_msg: bool,
):
    """Assign ``final_value`` to a renamed argument and its deprecated alias.

    Both attribute names end up holding the same value so old call sites
    keep working; optionally emits a deprecation warning.
    """
    for attr_name in (new_arg, old_arg):  # old name kept for backward compatibility
        setattr(args, attr_name, final_value)

    if print_deprecated_msg:
        logger.warning(
            '[ArgParser] --%s is deprecated, please use "--%s" instead!',
            old_arg,
            new_arg,
        )
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def env_to_bool(env_var: str) -> bool:
    """Interpret an environment variable as a boolean.

    Unset variables default to "0"; any value other than "0"/"false"
    (case-insensitive) counts as True.
    """
    value = os.getenv(env_var, "0")
    return value.lower() not in ("0", "false")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def arg_env_consistent_bool(args: Namespace | ArgsBase, arg_name: str, env_var: str):
    """Make sure argument and environment variable are consistent.

    Taking the logical OR of both values, then writing the result back to
    both the parsed argument and the environment ("1"/"0").
    """
    enabled = bool(getattr(args, arg_name, False)) or env_to_bool(env_var)

    setattr(args, arg_name, enabled)
    os.environ[env_var] = "1" if enabled else "0"
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def csv(elem_type, container_type: type = list, value: str = ""):
    """Parse a comma-separated string into ``container_type`` of ``elem_type``.

    An empty string yields an empty container; any element that
    ``elem_type`` cannot convert raises ArgumentTypeError.
    """
    if value == "":
        return container_type()
    try:
        converted = (elem_type(part) for part in value.split(","))
        return container_type(converted)
    except Exception as e:
        raise ArgumentTypeError(f"Invalid {elem_type.__name__} list: {value}") from e
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def tuple_parser(type_args, value: str):
    """Parse a CSV string into a heterogeneous tuple matching ``type_args``.

    The number of comma-separated parts must equal ``len(type_args)``;
    each part is converted by the type at the same position.
    """
    parts = value.split(",")
    if len(parts) != len(type_args):
        raise ArgumentTypeError(f"Expected {len(type_args)} values, got {len(parts)}")

    return tuple(convert(part) for convert, part in zip(type_args, parts))
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import threading
|
|
3
|
+
from typing import Any, Callable
|
|
4
|
+
|
|
5
|
+
__all__ = ["CronJob"]
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CronJob(threading.Thread):
|
|
11
|
+
interval: float | int
|
|
12
|
+
|
|
13
|
+
_stop_event: threading.Event
|
|
14
|
+
|
|
15
|
+
_target: Callable[..., Any]
|
|
16
|
+
_args: tuple[Any, ...]
|
|
17
|
+
_kwargs: dict[str, Any]
|
|
18
|
+
|
|
19
|
+
def __init__(
|
|
20
|
+
self,
|
|
21
|
+
name: str,
|
|
22
|
+
task: Callable,
|
|
23
|
+
args: tuple[Any, ...] = (),
|
|
24
|
+
kwargs: dict[str, Any] | None = None,
|
|
25
|
+
*,
|
|
26
|
+
interval: float | int = 60,
|
|
27
|
+
daemon: bool | None = True,
|
|
28
|
+
):
|
|
29
|
+
super().__init__(
|
|
30
|
+
target=task,
|
|
31
|
+
name=name,
|
|
32
|
+
args=args,
|
|
33
|
+
kwargs=kwargs,
|
|
34
|
+
daemon=daemon,
|
|
35
|
+
)
|
|
36
|
+
self.interval_sec = interval
|
|
37
|
+
self._stop_event = threading.Event()
|
|
38
|
+
|
|
39
|
+
def run(self):
|
|
40
|
+
logger.info("[CronJob %s] Started with interval %ss", self.name, self.interval_sec)
|
|
41
|
+
while not self._stop_event.is_set():
|
|
42
|
+
try:
|
|
43
|
+
self._target(*self._args, **self._kwargs)
|
|
44
|
+
except Exception as e:
|
|
45
|
+
logger.exception("[CronJob %s] Exception: %s", self.name, e)
|
|
46
|
+
# Wait with early exit if stopped
|
|
47
|
+
if self._stop_event.wait(self.interval_sec):
|
|
48
|
+
break
|
|
49
|
+
|
|
50
|
+
def stop(self):
|
|
51
|
+
self._stop_event.set()
|
|
52
|
+
logger.info("[CronJob %s] Stopped", self.name)
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
import datetime
|
|
3
|
+
import itertools
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import signal
|
|
7
|
+
import threading
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from contextlib import contextmanager, suppress
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Callable, TextIO, cast
|
|
12
|
+
|
|
13
|
+
import psutil
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"LauncherBase",
|
|
17
|
+
"ProcessMeta",
|
|
18
|
+
"defer_termination_signals",
|
|
19
|
+
"restore_signal_mask",
|
|
20
|
+
"exit_handler",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class ProcessMeta:
    """Bookkeeping record for one launched child process."""

    # Handle to the spawned child (psutil's subprocess.Popen wrapper).
    proc: psutil.Popen
    # Open log stream for the process; closed by kill_procs() at teardown.
    log_file: TextIO
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class LauncherBase(ABC):
    """
    Maintain the life cycle of the spawned processes.
    """

    # One entry per launched process; None marks a slot already cleaned up.
    procs: list[ProcessMeta | None]

    # Guards all reads/writes of self.procs.
    _lock: threading.Lock
    # Cycles 0..logging_interval-1; liveness is logged only when it hits 0.
    _log_counter: itertools.cycle
    _logging_interval: int

    def __init__(
        self,
        exit_timeout: float = 180,
        logging_interval: int = 60,
        *,
        # For testability,
        register_atexit: bool = True,
    ):
        self.procs = []
        self._logging_interval = logging_interval
        # log every 60 * 10s = 10 minutes
        self._log_counter = itertools.cycle(range(logging_interval))
        self._lock = threading.Lock()
        if register_atexit:
            atexit.register(exit_handler, self, exit_timeout, "Normal Exit signal received")

    def is_healthy(self) -> bool:
        """Public health check; subclasses may pass extra context via _is_healthy."""
        return self._is_healthy()

    def _is_healthy(self, log_name="Process", offset=None) -> bool:
        """Check for all process, process is alive or process is None."""
        # Advance the throttle counter once per health check.
        count = next(self._log_counter)
        with self._lock:
            for i, proc_meta in enumerate(self.procs):
                if proc_meta is None:
                    continue

                p = proc_meta.proc
                # offset lets callers number processes across multiple launchers.
                idx = "" if offset is None else f" {offset + i}"
                if p.poll() is not None:
                    # Process died: log it, mark the failure in its log file, fail fast.
                    logger.warning(
                        "[Launcher] %s%s pid=%d exited unexpectedly, exit code: %d",
                        log_name,
                        idx,
                        p.pid,
                        p.returncode,
                    )
                    date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    if proc_meta.log_file:
                        proc_meta.log_file.write(
                            f"{date_str} runner fail; only for report log to forge\n"
                        )
                    return False
                if count == 0:
                    # Periodic liveness log, throttled by _log_counter.
                    logger.info(
                        "[Launcher] %s%s is running. pid=%d pgid=%d sid=%d",
                        log_name,
                        idx,
                        p.pid,
                        os.getpgid(p.pid),
                        os.getsid(p.pid),
                    )
        return True

    def all_alive(self) -> bool:
        """True iff procs is non-empty and every tracked process is running."""
        with self._lock:
            # return False if procs is empty
            if not self.procs:
                return False
            # all([]) returns True
            return all(p is not None and p.proc.poll() is None for p in self.procs)

    def any_alive(self) -> bool:
        """True iff at least one tracked process is still running."""
        with self._lock:
            # any([]) returns False
            return any(p is not None and p.proc.poll() is None for p in self.procs)

    @abstractmethod
    def launch(self, *args, **kwargs) -> None:
        """Launch the processes and maintain their metadata in self.procs."""

    def kill_and_wait(self, timeout: float = 5, *args, **kwargs):
        """Terminate all tracked processes, then reset tracking state."""
        with self._lock:
            proc_metas = [proc for proc in self.procs if proc is not None]
            kill_procs(proc_metas, timeout)

            # Clean status
            for idx in range(len(self.procs)):
                self.procs[idx] = None
            self._log_counter = itertools.cycle(range(self._logging_interval))

    def wait(self) -> list[int | None]:
        """Block until every tracked process exits; return exit codes by slot.

        Slots that were already None stay None in the returned list.
        NOTE(review): unlike kill_and_wait, this does not hold self._lock —
        confirm concurrent callers are not expected here.
        """
        exit_codes: list[int | None] = [None] * len(self.procs)
        for i, proc_meta in enumerate(self.procs):
            if proc_meta is None:
                continue
            p = proc_meta.proc
            p.wait()
            exit_codes[i] = p.returncode
            logger.info("[Launcher] Process pid=%d exited with code %d", p.pid, p.returncode)
        # Clean status
        for idx in range(len(self.procs)):
            self.procs[idx] = None
        self._log_counter = itertools.cycle(range(self._logging_interval))

        return exit_codes
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def kill_procs(proc_metas: list[ProcessMeta], timeout: float = 5):
    """Guarantee failed procs are killed and log files are closed."""

    # ! Scan all process that has the same pgid with self.procs. This is necessary
    # ! to avoid that when bash is killed by -9 SIGKILL, runner process become orphan
    # ! and its ppid=1 and not listed in curr_process.children(recursive=True)

    # Popen with start_new_session=True guarantees that p.pid == p.pgid
    child_pgids = [p.proc.pid for p in proc_metas]
    child_pgid_procs: list[psutil.Process] = []
    for proc in psutil.process_iter(["pid"]):
        try:
            if os.getpgid(proc.pid) in child_pgids:
                child_pgid_procs.append(proc)
        except (psutil.NoSuchProcess, ProcessLookupError):
            # Process vanished between enumeration and getpgid; skip it.
            logger.info("[Launcher] Process %d already terminated, no need to wait", proc.pid)
            continue

    # ! a child process of nsys might spawn our base_runner process by setting new
    # ! pgid, so we need to reap its children, trying our best.
    # ! But if our base_runner process is already an orphan with new pgid,
    # ! there's no way to figure out that it's formerly a child of this launcher.
    all_children: list[psutil.Process] = [p.proc for p in proc_metas]
    for proc in child_pgid_procs:
        all_children.append(proc)
        all_children.extend(proc.children(recursive=True))

    # Send SIGTERM to all children
    need_to_wait: list[psutil.Process] = []
    for proc in set(all_children):
        logger.info("[Launcher] Send SIGTERM to child process %d", proc.pid)
        try:
            proc.terminate()
        except psutil.ZombieProcess:
            logger.warning("[Launcher] Child process %d is a zombie, reaping it", proc.pid)
            proc.wait()
        except psutil.NoSuchProcess:
            logger.info("[Launcher] Child process %d already terminated", proc.pid)
            continue
        except psutil.AccessDenied:
            logger.warning("[Launcher] Access denied to terminate child process %d", proc.pid)
            continue
        else:
            # SIGTERM delivered; this one still needs to be waited on below.
            need_to_wait.append(proc)

    # In some region(like GCP), there are very flaky kernel syscall which might cause
    # process to be stuck in uninterruptible sleep state. If that happens, waiting for
    # maximum 5 min then exit.
    tolerance_extra_timeout = 300
    retry_sigkill_every = 30

    def kill_callback(p: psutil.Process):
        # Best-effort SIGKILL: the process may already be gone.
        with suppress(Exception):
            p.kill()

    for tries in range(tolerance_extra_timeout // retry_sigkill_every + 1):
        # For the first try, we wait for the given timeout
        _timeout = timeout if tries == 0 else retry_sigkill_every

        # Survivors get SIGKILLed by kill_callback and are retried next round.
        need_to_wait = wait_for_procs(
            need_to_wait, kill_callback, "killing it with SIGKILL", _timeout
        )
        if not need_to_wait:
            break
    else:
        # After all retries, some processes are still alive, force exit
        for p in need_to_wait:
            logger.critical(
                "[Launcher] Child process %d didn't respond to SIGKILL after %d seconds!",
                p.pid,
                tolerance_extra_timeout,
            )
        logger.critical("[Launcher] Exiting launcher process forcefully!")
        os._exit(1)

    for proc in proc_metas:
        proc.log_file.close()

    logger.info("[Launcher] All processes & log_files are closed.")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def wait_for_procs(
    procs: list[psutil.Process],
    callback: Callable[[psutil.Process], None],
    callback_msg: str,
    timeout: float = 5,
) -> list[psutil.Process]:
    """Wait up to ``timeout`` seconds for ``procs``; run ``callback`` on survivors.

    Exited processes are logged with their return code. Processes still
    alive after the wait are logged with ``callback_msg``, handed to
    ``callback``, and returned to the caller.
    """
    finished, survivors = psutil.wait_procs(procs, timeout=timeout)

    for done in finished:
        done = cast(psutil.Popen, done)
        logger.info(
            "[Launcher] Child process %d terminated with exit code %d",
            done.pid,
            done.returncode,
        )

    for straggler in survivors:
        logger.warning(
            "[Launcher] Child process %d did not terminate in time, %s",
            straggler.pid,
            callback_msg,
        )
        callback(straggler)

    return survivors
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# Signals deferred by defer_termination_signals() while teardown runs.
# NOTE(review): SIGKILL cannot actually be blocked by pthread_sigmask (POSIX
# silently ignores it) — confirm keeping it in this set is intentional.
ALL_TERM_SIGNALS = {
    signal.SIGABRT,
    signal.SIGBUS,
    signal.SIGFPE,
    signal.SIGILL,
    signal.SIGINT,
    signal.SIGKILL,
    signal.SIGPIPE,
    signal.SIGQUIT,
    signal.SIGSEGV,
    signal.SIGTERM,
    signal.SIGSYS,
}
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@contextmanager
def defer_termination_signals():
    """Block every signal in ALL_TERM_SIGNALS for the duration of the body.

    Delivery is deferred, not discarded: once the previous mask is restored
    on exit, any pending signals fire.
    """
    previous_mask = signal.pthread_sigmask(signal.SIG_BLOCK, ALL_TERM_SIGNALS)
    try:
        yield
    finally:
        # Put back exactly the mask that was in effect on entry.
        signal.pthread_sigmask(signal.SIG_SETMASK, previous_mask)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def restore_signal_mask():
    """restore the signal mask that might block all termination signals"""
    # SIG_UNBLOCK removes only these signals from the mask; any other
    # blocked signals remain blocked.
    signal.pthread_sigmask(signal.SIG_UNBLOCK, ALL_TERM_SIGNALS)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def exit_handler(launcher: LauncherBase, timeout: float = 180, msg: str = ""):
    """atexit hook: tear down every process the launcher still tracks.

    Termination signals are deferred for the whole teardown so a late
    Ctrl-C cannot interrupt resource release halfway through.
    """
    with defer_termination_signals():
        logger.info("=" * 80)
        if msg:
            logger.info(msg)
        logger.info("[ExitHandler] Release resource, timeout=%d s", timeout)
        live_metas = [meta for meta in launcher.procs if meta is not None]
        kill_procs(live_metas, timeout)
        logger.info("[ExitHandler] Resource released")
|