evakit 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evakit-0.0.0/LICENSE ADDED
@@ -0,0 +1,9 @@
1
+
2
+ The MIT License (MIT)
3
+ Copyright (c) 2026, garywei944
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
evakit-0.0.0/PKG-INFO ADDED
@@ -0,0 +1,61 @@
1
+ Metadata-Version: 2.4
2
+ Name: evakit
3
+ Version: 0.0.0
4
+ Summary: A collection of utilities for the eva project.
5
+ Author-email: Gary Wei <garywei944@gmail.com>
6
+ License-Expression: MIT
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Natural Language :: English
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: typed-argument-parser
19
+ Requires-Dist: psutil
20
+ Requires-Dist: typing-extensions
21
+ Dynamic: license-file
22
+
23
+ # evakit
24
+
25
+ ## Quick Access
26
+
27
+ ## Dataset
28
+
29
+ ## Notebooks
30
+
31
+ - Notebooks for experiments are located under [`notebooks`](notebooks).
32
+ - Notebooks for visualization and presentation are located
33
+ under [`references`](references).
34
+
35
+ ## Conda / Mamba Environment Setup
36
+
37
+ ```shell
38
+ # To install the environment for the first time
39
+ conda env create -f environment.yml
40
+ ```
41
+
42
+ To update the environment after updating environment.yml
43
+
44
+ **_BEST PRACTICE, RECOMMENDED_** when updating a conda environment
45
+
46
+ ```shell
47
+ conda env update -f environment.yml --prune
48
+ ```
49
+
50
+ ## Sandbox
51
+
52
+ The `sandbox` is a practice in teamwork collaboration.
53
+ Everyone working with the project should have a sub-folder named by their id
54
+ under it, and any scripts, doc, or data that are temporary should be placed
55
+ there.
56
+
57
+ ---
58
+
59
+ Project based on
60
+ the [cookiecutter machine learning template](https://github.com/garywei944/cookiecutter-machine-learning)
61
+ .
evakit-0.0.0/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # evakit
2
+
3
+ ## Quick Access
4
+
5
+ ## Dataset
6
+
7
+ ## Notebooks
8
+
9
+ - Notebooks for experiments are located under [`notebooks`](notebooks).
10
+ - Notebooks for visualization and presentation are located
11
+ under [`references`](references).
12
+
13
+ ## Conda / Mamba Environment Setup
14
+
15
+ ```shell
16
+ # To install the environment for the first time
17
+ conda env create -f environment.yml
18
+ ```
19
+
20
+ To update the environment after updating environment.yml
21
+
22
+ **_BEST PRACTICE, RECOMMENDED_** when updating a conda environment
23
+
24
+ ```shell
25
+ conda env update -f environment.yml --prune
26
+ ```
27
+
28
+ ## Sandbox
29
+
30
+ The `sandbox` is a practice in teamwork collaboration.
31
+ Everyone working with the project should have a sub-folder named by their id
32
+ under it, and any scripts, doc, or data that are temporary should be placed
33
+ there.
34
+
35
+ ---
36
+
37
+ Project based on
38
+ the [cookiecutter machine learning template](https://github.com/garywei944/cookiecutter-machine-learning)
39
+ .
File without changes
@@ -0,0 +1 @@
1
# Single source of truth for the package version.
# NOTE(review): distribution metadata in this release says 0.0.0 while this
# module reads 0.2.0 — confirm which is authoritative.
__version__ = "0.2.0"
@@ -0,0 +1,190 @@
1
+ import functools
2
+ import logging
3
+ import os
4
+ from abc import ABC
5
+ from argparse import ArgumentParser, ArgumentTypeError, Namespace
6
+ from typing import final, get_args, get_origin
7
+
8
+ from tap import Tap
9
+ from typing_extensions import override
10
+
11
+ from evakit.python_tricks import freeze_dataclass
12
+ from evakit.singleton import Singleton
13
+
14
# Public API of this module; anything not listed here is internal.
__all__ = [
    "ArgsBase",
    "str2bool",
    "add_bool_arg",
    "move_arg_from_to",
    "env_to_bool",
    "arg_env_consistent_bool",
    "csv",
    "tuple_parser",
]

# Module-level logger, standard `logging.getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
26
+
27
+
28
class ArgsBase(Tap, Singleton, ABC):
    """Base class for argument parsing using Tap.

    Supports
    - Type hinting
    - Automatic argument parsing
    - Singleton pattern with Dependency Injection
    """

    @final
    def __init__(self, args: list[str] | None = None, *, frozen: bool = True) -> None:
        """Parse arguments eagerly at construction time.

        Args:
            args: Argument strings to parse; ``None`` falls back to ``sys.argv``.
            frozen: If True, freeze the class after parsing so attribute
                reassignment is rejected (see ``freeze_dataclass``).
        """
        super().__init__(explicit_bool=False, allow_abbrev=False)

        # known_only=True: unrecognized flags are ignored instead of erroring.
        self.parse_args(args=args, known_only=True)

        if frozen:
            freeze_dataclass(self.__class__)

    def _add_args(self) -> None:
        """Add arguments to the parser.

        Equivalent to self.configure(), only keep for backward compatibility.
        """

    @final
    @override
    def configure(self) -> None:
        """Register comma-separated parsers for bool/list/set/tuple annotations.

        Tap accepts `--ports p1 p2 p3` as a list input, which is different from
        `--ports=p1,p2,p3`. To keep backward compatibility, the annotated
        collection types are overridden here with CSV-style parsers.
        """
        # self._annotations maps arg name -> type hint (a types.GenericAlias
        # for parameterized hints).
        for arg_name, type_hint in self._annotations.items():
            arg_flag, default = f"--{arg_name}", getattr(self, arg_name)
            type_origin, type_args = get_origin(type_hint), get_args(type_hint)

            # 1. plain bool: route through add_bool_arg (`--flag` / `--flag true`)
            if type_hint is bool:
                add_bool_arg(self, arg_flag, default=getattr(self, arg_name))
            # 2. list or set: parse "a,b,c" into the annotated container
            elif type_origin in {list, set}:
                elem_type = type_args[0]
                if elem_type is int:
                    self.add_argument(arg_flag, type=functools.partial(csv, int, type_origin))
                elif elem_type is float:
                    self.add_argument(arg_flag, type=functools.partial(csv, float, type_origin))
                elif elem_type is bool:
                    self.add_argument(arg_flag, type=functools.partial(csv, str2bool, type_origin))
                elif elem_type is str:
                    self.add_argument(arg_flag, type=functools.partial(csv, str, type_origin))
            # 3. tuple: split homogeneous vs heterogeneous forms
            elif type_origin is tuple:
                # homogeneous tuple: tuple[int, ...]
                if len(type_args) == 2 and type_args[1] is ...:
                    elem_type = type_args[0]
                    self.add_argument(
                        arg_flag,
                        type=functools.partial(csv, elem_type, tuple),
                        default=default,
                    )
                else:
                    # heterogeneous tuple, e.g. tuple[int, str]: element-wise parse
                    self.add_argument(
                        arg_flag,
                        type=functools.partial(tuple_parser, type_args),
                        default=default,
                    )

        self._add_args()

    @final
    @override
    def process_args(self) -> None:
        """Post-parse hook: merge, then process, then validate arguments."""
        self._merge_args()
        self._process_args()
        self._check_args()

    def _merge_args(self) -> None:
        """Merge some arguments for compatibility."""

    def _process_args(self) -> None:
        """Process the arguments after parsing."""

    def _check_args(self) -> None:
        """Check the arguments for validity."""
113
+
114
+
115
def str2bool(v: str) -> bool:
    """Parse a human-friendly boolean spelling into a bool.

    Accepts yes/no, true/false, t/f, y/n, 1/0 in any case.

    Raises:
        ArgumentTypeError: if the string is not a recognized boolean spelling.
    """
    normalized = v.lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise ArgumentTypeError("Boolean value expected.")
122
+
123
+
124
def add_bool_arg(parser: ArgumentParser, *args, default=False, help=None, **kwargs):
    """Register an optional boolean flag on *parser*.

    The flag accepts `--flag` (const=True), `--flag yes/no/1/0/...` (via
    str2bool), or nothing at all (default). Extra **kwargs are forwarded to
    add_argument; duplicating one of the preset keywords raises TypeError,
    same as passing it twice would.

    https://stackoverflow.com/a/43357954
    """
    preset = {
        "type": str2bool,
        "metavar": "BOOLEAN",
        "nargs": "?",
        "const": True,
        "default": default,
        "help": help,
    }
    parser.add_argument(*args, **preset, **kwargs)
137
+
138
+
139
def move_arg_from_to(
    args: Namespace | ArgsBase,
    old_arg: str,
    new_arg: str,
    final_value,
    print_deprecated_msg: bool,
):
    """Assign *final_value* to both the new and the deprecated argument name.

    The old name is kept populated for backward compatibility; optionally a
    deprecation warning is logged.
    """
    # Mirror the value onto both attribute names (old kept for compatibility).
    for attr in (new_arg, old_arg):
        setattr(args, attr, final_value)
    if print_deprecated_msg:
        logger.warning(
            '[ArgParser] --%s is deprecated, please use "--%s" instead!',
            old_arg,
            new_arg,
        )
154
+
155
+
156
def env_to_bool(env_var: str) -> bool:
    """Interpret an environment variable as a boolean flag.

    Unset, "0", and "false" (case-insensitive) are falsy; any other value,
    e.g. "1" or even "off", is treated as truthy.
    """
    raw = os.getenv(env_var, "0")
    return raw.lower() not in ["0", "false"]
158
+
159
+
160
def arg_env_consistent_bool(args: Namespace | ArgsBase, arg_name: str, env_var: str):
    """Make sure argument and environment variable are consistent.

    Taking the logical OR of both values, then writing the result back to
    both the argument attribute and the environment variable ("1"/"0").
    """
    combined = bool(getattr(args, arg_name, False)) or env_to_bool(env_var)
    setattr(args, arg_name, combined)
    os.environ[env_var] = "1" if combined else "0"
174
+
175
+
176
def csv(elem_type, container_type: type = list, value: str = ""):
    """Parse a comma-separated string into a container of typed elements.

    An empty string yields an empty container.

    Raises:
        ArgumentTypeError: if any element fails conversion via *elem_type*.
    """
    if value == "":
        return container_type()
    try:
        converted = (elem_type(item) for item in value.split(","))
        return container_type(converted)
    except Exception as e:
        raise ArgumentTypeError(f"Invalid {elem_type.__name__} list: {value}") from e
183
+
184
+
185
def tuple_parser(type_args, value: str):
    """Parse a comma-separated string into a heterogeneous tuple.

    Each part is converted with the corresponding callable from *type_args*,
    e.g. `tuple_parser((int, str), "1,a") == (1, "a")`.

    Raises:
        ArgumentTypeError: if the number of parts does not match *type_args*,
            or any element fails conversion (consistent with `csv`, which
            wraps conversion errors the same way).
    """
    parts = value.split(",")
    if len(parts) != len(type_args):
        raise ArgumentTypeError(f"Expected {len(type_args)} values, got {len(parts)}")

    try:
        return tuple(t(p) for t, p in zip(type_args, parts))
    except Exception as e:
        # Wrap conversion failures so argparse reports a clean usage error,
        # mirroring the error handling in csv().
        raise ArgumentTypeError(f"Invalid tuple value: {value}") from e
@@ -0,0 +1,52 @@
1
+ import logging
2
+ import threading
3
+ from typing import Any, Callable
4
+
5
+ __all__ = ["CronJob"]
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class CronJob(threading.Thread):
11
+ interval: float | int
12
+
13
+ _stop_event: threading.Event
14
+
15
+ _target: Callable[..., Any]
16
+ _args: tuple[Any, ...]
17
+ _kwargs: dict[str, Any]
18
+
19
+ def __init__(
20
+ self,
21
+ name: str,
22
+ task: Callable,
23
+ args: tuple[Any, ...] = (),
24
+ kwargs: dict[str, Any] | None = None,
25
+ *,
26
+ interval: float | int = 60,
27
+ daemon: bool | None = True,
28
+ ):
29
+ super().__init__(
30
+ target=task,
31
+ name=name,
32
+ args=args,
33
+ kwargs=kwargs,
34
+ daemon=daemon,
35
+ )
36
+ self.interval_sec = interval
37
+ self._stop_event = threading.Event()
38
+
39
+ def run(self):
40
+ logger.info("[CronJob %s] Started with interval %ss", self.name, self.interval_sec)
41
+ while not self._stop_event.is_set():
42
+ try:
43
+ self._target(*self._args, **self._kwargs)
44
+ except Exception as e:
45
+ logger.exception("[CronJob %s] Exception: %s", self.name, e)
46
+ # Wait with early exit if stopped
47
+ if self._stop_event.wait(self.interval_sec):
48
+ break
49
+
50
+ def stop(self):
51
+ self._stop_event.set()
52
+ logger.info("[CronJob %s] Stopped", self.name)
@@ -0,0 +1,286 @@
1
+ import atexit
2
+ import datetime
3
+ import itertools
4
+ import logging
5
+ import os
6
+ import signal
7
+ import threading
8
+ from abc import ABC, abstractmethod
9
+ from contextlib import contextmanager, suppress
10
+ from dataclasses import dataclass
11
+ from typing import Callable, TextIO, cast
12
+
13
+ import psutil
14
+
15
# Public API of this module; anything not listed here is internal.
__all__ = [
    "LauncherBase",
    "ProcessMeta",
    "defer_termination_signals",
    "restore_signal_mask",
    "exit_handler",
]

# Module-level logger, standard `logging.getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class ProcessMeta:
    """Bookkeeping record pairing a spawned process with its log file."""

    # The spawned child process (psutil wrapper with Popen semantics).
    proc: psutil.Popen
    # Open text stream tied to the process; written on detected failure
    # (see _is_healthy) and closed during cleanup (see kill_procs).
    log_file: TextIO
30
+
31
+
32
class LauncherBase(ABC):
    """
    Maintain the life cycle of the spawned processes.

    Subclasses implement launch() to populate self.procs; this base class
    provides health checks, waiting, and guaranteed teardown (via atexit).
    """

    # Slot i holds process i's metadata, or None once it has been cleaned up.
    procs: list[ProcessMeta | None]

    # Guards concurrent access to `procs` between health checks and kill paths.
    _lock: threading.Lock
    # Cycles 0..logging_interval-1; the periodic "is running" log fires at 0.
    _log_counter: itertools.cycle
    _logging_interval: int

    def __init__(
        self,
        exit_timeout: float = 180,
        logging_interval: int = 60,
        *,
        # For testability,
        register_atexit: bool = True,
    ):
        """Initialize bookkeeping and optionally register the atexit hook.

        Args:
            exit_timeout: Seconds exit_handler waits for children on shutdown.
            logging_interval: Emit an "is running" log every N health checks.
            register_atexit: Set False in tests to skip the atexit hook.
        """
        self.procs = []
        self._logging_interval = logging_interval
        # log every 60 * 10s = 10 minutes
        # NOTE(review): assumes callers poll is_healthy() every ~10s — confirm.
        self._log_counter = itertools.cycle(range(logging_interval))
        self._lock = threading.Lock()
        if register_atexit:
            atexit.register(exit_handler, self, exit_timeout, "Normal Exit signal received")

    def is_healthy(self) -> bool:
        """Public health check; True if no tracked process has died."""
        return self._is_healthy()

    def _is_healthy(self, log_name="Process", offset=None) -> bool:
        """Check for all process, process is alive or process is None.

        Returns False (and marks the log file) as soon as any tracked process
        has exited; otherwise True. `offset` shifts the displayed index.
        """
        count = next(self._log_counter)
        with self._lock:
            for i, proc_meta in enumerate(self.procs):
                if proc_meta is None:
                    # Slot already cleaned up; vacancies are considered healthy.
                    continue

                p = proc_meta.proc
                idx = "" if offset is None else f" {offset + i}"
                if p.poll() is not None:
                    # Process exited: log it and stamp the failure in its log.
                    logger.warning(
                        "[Launcher] %s%s pid=%d exited unexpectedly, exit code: %d",
                        log_name,
                        idx,
                        p.pid,
                        p.returncode,
                    )
                    date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    if proc_meta.log_file:
                        proc_meta.log_file.write(
                            f"{date_str} runner fail; only for report log to forge\n"
                        )
                    return False
                if count == 0:
                    # Periodic liveness log (throttled by _log_counter).
                    logger.info(
                        "[Launcher] %s%s is running. pid=%d pgid=%d sid=%d",
                        log_name,
                        idx,
                        p.pid,
                        os.getpgid(p.pid),
                        os.getsid(p.pid),
                    )
        return True

    def all_alive(self) -> bool:
        """True iff procs is non-empty and every tracked process is running."""
        with self._lock:
            # return False if procs is empty
            if not self.procs:
                return False
            # all([]) returns True
            return all(p is not None and p.proc.poll() is None for p in self.procs)

    def any_alive(self) -> bool:
        """True iff at least one tracked process is still running."""
        with self._lock:
            # any([]) returns False
            return any(p is not None and p.proc.poll() is None for p in self.procs)

    @abstractmethod
    def launch(self, *args, **kwargs) -> None:
        """Launch the processes and maintain their metadata in self.procs."""

    def kill_and_wait(self, timeout: float = 5, *args, **kwargs):
        """Terminate all tracked processes, then reset internal state."""
        with self._lock:
            proc_metas = [proc for proc in self.procs if proc is not None]
            kill_procs(proc_metas, timeout)

            # Clean status
            for idx in range(len(self.procs)):
                self.procs[idx] = None
            self._log_counter = itertools.cycle(range(self._logging_interval))

    def wait(self) -> list[int | None]:
        """Block until every tracked process exits; return their exit codes.

        Slots that were already None yield None in the returned list.
        NOTE(review): unlike kill_and_wait, this reads procs without the lock.
        """
        exit_codes: list[int | None] = [None] * len(self.procs)
        for i, proc_meta in enumerate(self.procs):
            if proc_meta is None:
                continue
            p = proc_meta.proc
            p.wait()
            exit_codes[i] = p.returncode
            logger.info("[Launcher] Process pid=%d exited with code %d", p.pid, p.returncode)
        # Clean status
        for idx in range(len(self.procs)):
            self.procs[idx] = None
        self._log_counter = itertools.cycle(range(self._logging_interval))

        return exit_codes
139
+
140
+
141
def kill_procs(proc_metas: list[ProcessMeta], timeout: float = 5):
    """Guarantee failed procs are killed and log files are closed.

    Strategy: gather every process sharing a pgid with the tracked processes
    (plus their recursive children), SIGTERM them all, then escalate to
    SIGKILL with retries; as a last resort, os._exit(1) the launcher itself.
    """

    # ! Scan all processes that share a pgid with self.procs. This is necessary
    # ! because when bash is killed with -9 SIGKILL, the runner process becomes
    # ! an orphan with ppid=1 and is not listed in
    # ! curr_process.children(recursive=True).

    # Popen with start_new_session=True guarantees that p.pid == p.pgid
    child_pgids = [p.proc.pid for p in proc_metas]
    child_pgid_procs: list[psutil.Process] = []
    for proc in psutil.process_iter(["pid"]):
        try:
            if os.getpgid(proc.pid) in child_pgids:
                child_pgid_procs.append(proc)
        except (psutil.NoSuchProcess, ProcessLookupError):
            # Raced with process exit between iteration and getpgid.
            logger.info("[Launcher] Process %d already terminated, no need to wait", proc.pid)
            continue

    # ! a child process of nsys might spawn our base_runner process by setting new
    # ! pgid, so we need to reap its children, trying our best.
    # ! But if our base_runner process is already an orphan with new pgid,
    # ! there's no way to figure out that it's formerly a child of this launcher.
    all_children: list[psutil.Process] = [p.proc for p in proc_metas]
    for proc in child_pgid_procs:
        all_children.append(proc)
        all_children.extend(proc.children(recursive=True))

    # Send SIGTERM to all children (set() deduplicates overlapping listings)
    need_to_wait: list[psutil.Process] = []
    for proc in set(all_children):
        logger.info("[Launcher] Send SIGTERM to child process %d", proc.pid)
        try:
            proc.terminate()
        except psutil.ZombieProcess:
            # Zombies cannot be signaled; wait() reaps them instead.
            logger.warning("[Launcher] Child process %d is a zombie, reaping it", proc.pid)
            proc.wait()
        except psutil.NoSuchProcess:
            logger.info("[Launcher] Child process %d already terminated", proc.pid)
            continue
        except psutil.AccessDenied:
            logger.warning("[Launcher] Access denied to terminate child process %d", proc.pid)
            continue
        else:
            # Only processes we actually signaled need to be waited on.
            need_to_wait.append(proc)

    # In some regions (like GCP), flaky kernel syscalls might leave a process
    # stuck in uninterruptible sleep. If that happens, wait for at most 5 min
    # then exit.
    tolerance_extra_timeout = 300
    retry_sigkill_every = 30

    def kill_callback(p: psutil.Process):
        # Best-effort SIGKILL escalation for processes that ignored SIGTERM.
        with suppress(Exception):
            p.kill()

    for tries in range(tolerance_extra_timeout // retry_sigkill_every + 1):
        # For the first try, we wait for the given timeout
        _timeout = timeout if tries == 0 else retry_sigkill_every

        need_to_wait = wait_for_procs(
            need_to_wait, kill_callback, "killing it with SIGKILL", _timeout
        )
        if not need_to_wait:
            break
    else:
        # for-else: all retries exhausted and some processes are still alive;
        # give up and force-exit the launcher.
        for p in need_to_wait:
            logger.critical(
                "[Launcher] Child process %d didn't respond to SIGKILL after %d seconds!",
                p.pid,
                tolerance_extra_timeout,
            )
        logger.critical("[Launcher] Exiting launcher process forcefully!")
        os._exit(1)

    for proc in proc_metas:
        proc.log_file.close()

    logger.info("[Launcher] All processes & log_files are closed.")
220
+
221
+
222
def wait_for_procs(
    procs: list[psutil.Process],
    callback: Callable[[psutil.Process], None],
    callback_msg: str,
    timeout: float = 5,
) -> list[psutil.Process]:
    """Wait up to *timeout* seconds for *procs* to exit.

    Exited processes are logged with their exit code. For every process
    still alive after the timeout, a warning including *callback_msg* is
    logged and *callback* is invoked on it.

    Returns:
        The processes that are still alive after the timeout.
    """
    terminated, survivors = psutil.wait_procs(procs, timeout=timeout)

    for done in terminated:
        done = cast(psutil.Popen, done)
        logger.info(
            "[Launcher] Child process %d terminated with exit code %d",
            done.pid,
            done.returncode,
        )

    for survivor in survivors:
        logger.warning(
            "[Launcher] Child process %d did not terminate in time, %s",
            survivor.pid,
            callback_msg,
        )
        callback(survivor)

    return survivors
245
+
246
+
247
# Signals that typically terminate a process; this set is blocked while
# cleanup runs (see defer_termination_signals) so teardown is not interrupted.
# NOTE(review): SIGKILL cannot be blocked or handled — pthread_sigmask
# silently ignores it, so its presence here is a no-op; confirm intent.
ALL_TERM_SIGNALS = {
    signal.SIGABRT,
    signal.SIGBUS,
    signal.SIGFPE,
    signal.SIGILL,
    signal.SIGINT,
    signal.SIGKILL,
    signal.SIGPIPE,
    signal.SIGQUIT,
    signal.SIGSEGV,
    signal.SIGTERM,
    signal.SIGSYS,
}
260
+
261
+
262
@contextmanager
def defer_termination_signals():
    """Block termination signals for the calling thread within the context.

    Blocked signals become pending and are delivered once the original
    mask is restored on exit.
    """
    # Block these signals and save the old signal mask.
    old_mask = signal.pthread_sigmask(signal.SIG_BLOCK, ALL_TERM_SIGNALS)
    try:
        yield
    finally:
        # Restore the original signal mask.
        signal.pthread_sigmask(signal.SIG_SETMASK, old_mask)
271
+
272
+
273
def restore_signal_mask():
    """Restore the signal mask that might block all termination signals."""
    # Unblock unconditionally — safe even if nothing was blocked.
    signal.pthread_sigmask(signal.SIG_UNBLOCK, ALL_TERM_SIGNALS)
276
+
277
+
278
def exit_handler(launcher: LauncherBase, timeout: float = 180, msg: str = ""):
    """Tear down every live process owned by *launcher* (atexit hook).

    Termination signals are deferred for the duration of the cleanup so the
    handler itself cannot be interrupted halfway through.
    """
    with defer_termination_signals():
        logger.info("=" * 80)
        if msg:
            logger.info(msg)
        logger.info("[ExitHandler] Release resource, timeout=%d s", timeout)
        live = [meta for meta in launcher.procs if meta is not None]
        kill_procs(live, timeout)
        logger.info("[ExitHandler] Resource released")