ins-pricing 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +60 -0
- ins_pricing/__init__.py +102 -0
- ins_pricing/governance/README.md +18 -0
- ins_pricing/governance/__init__.py +20 -0
- ins_pricing/governance/approval.py +93 -0
- ins_pricing/governance/audit.py +37 -0
- ins_pricing/governance/registry.py +99 -0
- ins_pricing/governance/release.py +159 -0
- ins_pricing/modelling/BayesOpt.py +146 -0
- ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
- ins_pricing/modelling/BayesOpt_entry.py +575 -0
- ins_pricing/modelling/BayesOpt_incremental.py +731 -0
- ins_pricing/modelling/Explain_Run.py +36 -0
- ins_pricing/modelling/Explain_entry.py +539 -0
- ins_pricing/modelling/Pricing_Run.py +36 -0
- ins_pricing/modelling/README.md +33 -0
- ins_pricing/modelling/__init__.py +44 -0
- ins_pricing/modelling/bayesopt/__init__.py +98 -0
- ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
- ins_pricing/modelling/bayesopt/core.py +1476 -0
- ins_pricing/modelling/bayesopt/models.py +2196 -0
- ins_pricing/modelling/bayesopt/trainers.py +2446 -0
- ins_pricing/modelling/bayesopt/utils.py +1021 -0
- ins_pricing/modelling/cli_common.py +136 -0
- ins_pricing/modelling/explain/__init__.py +55 -0
- ins_pricing/modelling/explain/gradients.py +334 -0
- ins_pricing/modelling/explain/metrics.py +176 -0
- ins_pricing/modelling/explain/permutation.py +155 -0
- ins_pricing/modelling/explain/shap_utils.py +146 -0
- ins_pricing/modelling/notebook_utils.py +284 -0
- ins_pricing/modelling/plotting/__init__.py +45 -0
- ins_pricing/modelling/plotting/common.py +63 -0
- ins_pricing/modelling/plotting/curves.py +572 -0
- ins_pricing/modelling/plotting/diagnostics.py +139 -0
- ins_pricing/modelling/plotting/geo.py +362 -0
- ins_pricing/modelling/plotting/importance.py +121 -0
- ins_pricing/modelling/run_logging.py +133 -0
- ins_pricing/modelling/tests/conftest.py +8 -0
- ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
- ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
- ins_pricing/modelling/tests/test_explain.py +56 -0
- ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
- ins_pricing/modelling/tests/test_graph_cache.py +33 -0
- ins_pricing/modelling/tests/test_plotting.py +63 -0
- ins_pricing/modelling/tests/test_plotting_library.py +150 -0
- ins_pricing/modelling/tests/test_preprocessor.py +48 -0
- ins_pricing/modelling/watchdog_run.py +211 -0
- ins_pricing/pricing/README.md +44 -0
- ins_pricing/pricing/__init__.py +27 -0
- ins_pricing/pricing/calibration.py +39 -0
- ins_pricing/pricing/data_quality.py +117 -0
- ins_pricing/pricing/exposure.py +85 -0
- ins_pricing/pricing/factors.py +91 -0
- ins_pricing/pricing/monitoring.py +99 -0
- ins_pricing/pricing/rate_table.py +78 -0
- ins_pricing/production/__init__.py +21 -0
- ins_pricing/production/drift.py +30 -0
- ins_pricing/production/monitoring.py +143 -0
- ins_pricing/production/scoring.py +40 -0
- ins_pricing/reporting/README.md +20 -0
- ins_pricing/reporting/__init__.py +11 -0
- ins_pricing/reporting/report_builder.py +72 -0
- ins_pricing/reporting/scheduler.py +45 -0
- ins_pricing/setup.py +41 -0
- ins_pricing v2/__init__.py +23 -0
- ins_pricing v2/governance/__init__.py +20 -0
- ins_pricing v2/governance/approval.py +93 -0
- ins_pricing v2/governance/audit.py +37 -0
- ins_pricing v2/governance/registry.py +99 -0
- ins_pricing v2/governance/release.py +159 -0
- ins_pricing v2/modelling/Explain_Run.py +36 -0
- ins_pricing v2/modelling/Pricing_Run.py +36 -0
- ins_pricing v2/modelling/__init__.py +151 -0
- ins_pricing v2/modelling/cli_common.py +141 -0
- ins_pricing v2/modelling/config.py +249 -0
- ins_pricing v2/modelling/config_preprocess.py +254 -0
- ins_pricing v2/modelling/core.py +741 -0
- ins_pricing v2/modelling/data_container.py +42 -0
- ins_pricing v2/modelling/explain/__init__.py +55 -0
- ins_pricing v2/modelling/explain/gradients.py +334 -0
- ins_pricing v2/modelling/explain/metrics.py +176 -0
- ins_pricing v2/modelling/explain/permutation.py +155 -0
- ins_pricing v2/modelling/explain/shap_utils.py +146 -0
- ins_pricing v2/modelling/features.py +215 -0
- ins_pricing v2/modelling/model_manager.py +148 -0
- ins_pricing v2/modelling/model_plotting.py +463 -0
- ins_pricing v2/modelling/models.py +2203 -0
- ins_pricing v2/modelling/notebook_utils.py +294 -0
- ins_pricing v2/modelling/plotting/__init__.py +45 -0
- ins_pricing v2/modelling/plotting/common.py +63 -0
- ins_pricing v2/modelling/plotting/curves.py +572 -0
- ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
- ins_pricing v2/modelling/plotting/geo.py +362 -0
- ins_pricing v2/modelling/plotting/importance.py +121 -0
- ins_pricing v2/modelling/run_logging.py +133 -0
- ins_pricing v2/modelling/tests/conftest.py +8 -0
- ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
- ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
- ins_pricing v2/modelling/tests/test_explain.py +56 -0
- ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
- ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
- ins_pricing v2/modelling/tests/test_plotting.py +63 -0
- ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
- ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
- ins_pricing v2/modelling/trainers.py +2447 -0
- ins_pricing v2/modelling/utils.py +1020 -0
- ins_pricing v2/modelling/watchdog_run.py +211 -0
- ins_pricing v2/pricing/__init__.py +27 -0
- ins_pricing v2/pricing/calibration.py +39 -0
- ins_pricing v2/pricing/data_quality.py +117 -0
- ins_pricing v2/pricing/exposure.py +85 -0
- ins_pricing v2/pricing/factors.py +91 -0
- ins_pricing v2/pricing/monitoring.py +99 -0
- ins_pricing v2/pricing/rate_table.py +78 -0
- ins_pricing v2/production/__init__.py +21 -0
- ins_pricing v2/production/drift.py +30 -0
- ins_pricing v2/production/monitoring.py +143 -0
- ins_pricing v2/production/scoring.py +40 -0
- ins_pricing v2/reporting/__init__.py +11 -0
- ins_pricing v2/reporting/report_builder.py +72 -0
- ins_pricing v2/reporting/scheduler.py +45 -0
- ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
- ins_pricing v2/scripts/Explain_entry.py +545 -0
- ins_pricing v2/scripts/__init__.py +1 -0
- ins_pricing v2/scripts/train.py +568 -0
- ins_pricing v2/setup.py +55 -0
- ins_pricing v2/smoke_test.py +28 -0
- ins_pricing-0.1.6.dist-info/METADATA +78 -0
- ins_pricing-0.1.6.dist-info/RECORD +169 -0
- ins_pricing-0.1.6.dist-info/WHEEL +5 -0
- ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
- user_packages/__init__.py +105 -0
- user_packages legacy/BayesOpt.py +5659 -0
- user_packages legacy/BayesOpt_entry.py +513 -0
- user_packages legacy/BayesOpt_incremental.py +685 -0
- user_packages legacy/Pricing_Run.py +36 -0
- user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
- user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
- user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
- user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
- user_packages legacy/Try/BayesOpt legacy.py +3280 -0
- user_packages legacy/Try/BayesOpt.py +838 -0
- user_packages legacy/Try/BayesOptAll.py +1569 -0
- user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
- user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
- user_packages legacy/Try/BayesOptSearch.py +830 -0
- user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
- user_packages legacy/Try/BayesOptV1.py +1911 -0
- user_packages legacy/Try/BayesOptV10.py +2973 -0
- user_packages legacy/Try/BayesOptV11.py +3001 -0
- user_packages legacy/Try/BayesOptV12.py +3001 -0
- user_packages legacy/Try/BayesOptV2.py +2065 -0
- user_packages legacy/Try/BayesOptV3.py +2209 -0
- user_packages legacy/Try/BayesOptV4.py +2342 -0
- user_packages legacy/Try/BayesOptV5.py +2372 -0
- user_packages legacy/Try/BayesOptV6.py +2759 -0
- user_packages legacy/Try/BayesOptV7.py +2832 -0
- user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
- user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
- user_packages legacy/Try/BayesOptV9.py +2927 -0
- user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
- user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
- user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
- user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
- user_packages legacy/Try/xgbbayesopt.py +523 -0
- user_packages legacy/__init__.py +19 -0
- user_packages legacy/cli_common.py +124 -0
- user_packages legacy/notebook_utils.py +228 -0
- user_packages legacy/watchdog_run.py +202 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Iterable, List, Optional, Sequence, cast
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _find_user_packages_dir(cwd: Optional[Path] = None) -> Path:
    """Locate the directory that holds the BayesOpt entry and watchdog scripts.

    Three locations are probed in order: ``<cwd>/user_packages``, ``<cwd>``
    itself, and ``<cwd>/../user_packages``. The first one that contains both
    ``BayesOpt_entry.py`` and ``watchdog_run.py`` is returned.

    Args:
        cwd: Directory to start from; defaults to the current working directory.

    Returns:
        The first probed directory that contains both marker scripts.

    Raises:
        FileNotFoundError: If none of the probed directories qualifies.
    """
    base = (cwd or Path().resolve()).resolve()
    marker_files = ("BayesOpt_entry.py", "watchdog_run.py")
    search_order = (base / "user_packages", base, base.parent / "user_packages")

    found = next(
        (
            folder
            for folder in search_order
            if all((folder / marker).exists() for marker in marker_files)
        ),
        None,
    )
    if found is None:
        raise FileNotFoundError(
            "Cannot locate user_packages directory (expected BayesOpt_entry.py and watchdog_run.py). "
            f"cwd={base}"
        )
    return found
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _stringify_cmd(cmd: Sequence[object]) -> List[str]:
    """Return a new list holding ``str()`` of every element of ``cmd``."""
    return list(map(str, cmd))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def build_bayesopt_entry_cmd(
    config_json: str | Path,
    model_keys: Sequence[str],
    *,
    nproc_per_node: int = 1,
    standalone: bool = True,
    entry_script: str | Path = "BayesOpt_entry.py",
    extra_args: Optional[Sequence[str]] = None,
) -> List[str]:
    """Build the command that runs BayesOpt_entry.py (optionally via torchrun/DDP).

    Args:
        config_json: Path to the config file. A relative path is taken from the
            located package directory when it exists there, otherwise it is
            resolved against the current working directory. An absolute path is
            passed through unchanged.
        model_keys: Values placed after ``--model-keys``.
        nproc_per_node: When greater than 1 the entry script is launched through
            ``python -m torch.distributed.run``.
        standalone: Adds ``--standalone`` to the distributed launcher.
        entry_script: Script to run; a relative path is looked up in the package
            directory.
        extra_args: Additional arguments appended to the end of the command.

    Returns:
        The command as a list of strings.
    """
    pkg_dir = _find_user_packages_dir()

    script = Path(entry_script)
    if script.is_absolute():
        entry_path = script.resolve()
    else:
        entry_path = (pkg_dir / script).resolve()

    config_path = Path(config_json)
    if not config_path.is_absolute():
        packaged = pkg_dir / config_path
        # Prefer a config shipped next to the scripts; fall back to the cwd.
        config_path = packaged.resolve() if packaged.exists() else config_path.resolve()

    workers = int(nproc_per_node)
    parts: List[object] = [sys.executable]
    if workers > 1:
        parts += ["-m", "torch.distributed.run"]
        if standalone:
            parts.append("--standalone")
        parts.append(f"--nproc_per_node={workers}")
    parts.append(str(entry_path))

    parts.extend(["--config-json", str(config_path), "--model-keys"])
    parts.extend(model_keys)
    if extra_args:
        parts.extend(extra_args)
    return _stringify_cmd(parts)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_incremental_cmd(
    config_json: str | Path,
    *,
    entry_script: str | Path = "BayesOpt_incremental.py",
    extra_args: Optional[Sequence[str]] = None,
) -> List[str]:
    """Build the command that runs BayesOpt_incremental.py.

    Args:
        config_json: Path to the config file. A relative path is taken from the
            located package directory when it exists there, otherwise it is
            resolved against the current working directory. An absolute path is
            passed through unchanged.
        entry_script: Script to run; a relative path is looked up in the package
            directory.
        extra_args: Additional arguments appended to the end of the command.

    Returns:
        The command as a list of strings.
    """
    pkg_dir = _find_user_packages_dir()

    script = Path(entry_script)
    if script.is_absolute():
        entry_path = script.resolve()
    else:
        entry_path = (pkg_dir / script).resolve()

    config_path = Path(config_json)
    if not config_path.is_absolute():
        packaged = pkg_dir / config_path
        # Prefer a config shipped next to the scripts; fall back to the cwd.
        config_path = packaged.resolve() if packaged.exists() else config_path.resolve()

    parts: List[object] = [
        sys.executable,
        str(entry_path),
        "--config-json",
        str(config_path),
    ]
    if extra_args:
        parts.extend(extra_args)
    return _stringify_cmd(parts)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def wrap_with_watchdog(
    cmd: Sequence[str],
    *,
    idle_seconds: int = 7200,
    max_restarts: int = 50,
    restart_delay_seconds: int = 10,
    stop_on_nonzero_exit: bool = True,
    watchdog_script: str | Path = "watchdog_run.py",
) -> List[str]:
    """Wrap a command with the watchdog script.

    The watchdog kills the process tree and restarts the command when it
    produces no output for more than ``idle_seconds``.

    Args:
        cmd: The command to supervise; placed after a ``--`` separator.
        idle_seconds: Value passed as ``--idle-seconds``.
        max_restarts: Value passed as ``--max-restarts``.
        restart_delay_seconds: Value passed as ``--restart-delay-seconds``.
        stop_on_nonzero_exit: Adds ``--stop-on-nonzero-exit`` when true.
        watchdog_script: Watchdog script; a relative path is looked up in the
            package directory.

    Returns:
        The wrapped command as a list of strings.
    """
    pkg_dir = _find_user_packages_dir()

    script = Path(watchdog_script)
    if script.is_absolute():
        watchdog_path = script.resolve()
    else:
        watchdog_path = (pkg_dir / script).resolve()

    wrapped: List[object] = [sys.executable, str(watchdog_path)]
    numeric_options = (
        ("--idle-seconds", idle_seconds),
        ("--max-restarts", max_restarts),
        ("--restart-delay-seconds", restart_delay_seconds),
    )
    for flag, value in numeric_options:
        wrapped += [flag, str(int(value))]
    if stop_on_nonzero_exit:
        wrapped.append("--stop-on-nonzero-exit")

    # Everything after "--" is the supervised command itself.
    wrapped.append("--")
    wrapped.extend(cmd)
    return _stringify_cmd(wrapped)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def run(cmd: Sequence[str], *, check: bool = True) -> subprocess.CompletedProcess:
    """Run an external command from a notebook and wait for it to finish.

    Args:
        cmd: Argument vector to execute.
        check: Forwarded to ``subprocess.run``.

    Returns:
        The ``CompletedProcess`` produced by ``subprocess.run``.
    """
    argv = [*cmd]
    return subprocess.run(argv, check=check)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def run_bayesopt_entry(
    *,
    config_json: str | Path,
    model_keys: Sequence[str],
    max_evals: int = 50,
    plot_curves: bool = True,
    ft_role: Optional[str] = None,
    nproc_per_node: int = 1,
    use_watchdog: bool = False,
    idle_seconds: int = 7200,
    max_restarts: int = 50,
    restart_delay_seconds: int = 10,
    extra_args: Optional[Sequence[str]] = None,
) -> subprocess.CompletedProcess:
    """Convenience wrapper: build and run the BayesOpt_entry command.

    The command may optionally be launched through torchrun and/or the
    watchdog.

    Args:
        config_json: Config file path, forwarded to ``build_bayesopt_entry_cmd``.
        model_keys: Model keys, forwarded to ``build_bayesopt_entry_cmd``.
        max_evals: Value passed as ``--max-evals``.
        plot_curves: Adds ``--plot-curves`` when true.
        ft_role: When truthy, passed as ``--ft-role``.
        nproc_per_node: Forwarded to ``build_bayesopt_entry_cmd``.
        use_watchdog: Wrap the command with ``wrap_with_watchdog`` when true.
        idle_seconds: Watchdog setting (only used with ``use_watchdog``).
        max_restarts: Watchdog setting (only used with ``use_watchdog``).
        restart_delay_seconds: Watchdog setting (only used with ``use_watchdog``).
        extra_args: Additional arguments appended after the generated ones.

    Returns:
        The ``CompletedProcess`` returned by ``run`` (called with ``check=True``).
    """
    forwarded: List[str] = [
        "--max-evals",
        str(int(max_evals)),
        *(["--plot-curves"] if plot_curves else []),
        *(["--ft-role", str(ft_role)] if ft_role else []),
        *(extra_args or []),
    ]

    command = build_bayesopt_entry_cmd(
        config_json=config_json,
        model_keys=model_keys,
        nproc_per_node=nproc_per_node,
        extra_args=forwarded,
    )
    if not use_watchdog:
        return run(command, check=True)

    supervised = wrap_with_watchdog(
        command,
        idle_seconds=idle_seconds,
        max_restarts=max_restarts,
        restart_delay_seconds=restart_delay_seconds,
    )
    return run(supervised, check=True)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def run_from_config(config_json: str | Path) -> subprocess.CompletedProcess:
    """Unified notebook entry point driven entirely by the config file.

    Switching between run modes only requires editing the config. By
    convention, config.json may carry an optional ``runner`` object that
    controls notebook execution:

    - runner.mode: "entry" (default) or "incremental"
    - runner.nproc_per_node: >1 launches through torchrun/DDP (entry only)
    - runner.model_keys: list of models to run (entry only)
    - runner.max_evals / runner.plot_curves / runner.ft_role (entry only;
      they override the same-named top-level config fields)
    - runner.use_watchdog / runner.idle_seconds / runner.max_restarts /
      runner.restart_delay_seconds
    - runner.incremental_args: List[str] (incremental only; equivalent to
      extra arguments passed directly to BayesOpt_incremental.py)

    Args:
        config_json: Path to the config file. A relative path is taken from the
            located package directory when it exists there, otherwise it is
            resolved against the current working directory.

    Returns:
        The ``CompletedProcess`` returned by ``run`` (called with ``check=True``).

    Raises:
        ValueError: If ``runner.mode`` is unsupported, or if
            ``incremental_args`` / ``model_keys`` is not a list.
    """
    pkg_dir = _find_user_packages_dir()
    config_path = Path(config_json)
    if not config_path.is_absolute():
        packaged = pkg_dir / config_path
        config_path = packaged.resolve() if packaged.exists() else config_path.resolve()

    settings = json.loads(config_path.read_text(encoding="utf-8", errors="replace"))
    runner = cast(dict, settings.get("runner") or {})

    mode = str(runner.get("mode") or "entry").strip().lower()
    watchdog_enabled = bool(runner.get("use_watchdog", False))
    watchdog_options = {
        "idle_seconds": int(runner.get("idle_seconds", 7200)),
        "max_restarts": int(runner.get("max_restarts", 50)),
        "restart_delay_seconds": int(runner.get("restart_delay_seconds", 10)),
    }

    def _launch(command: List[str]) -> subprocess.CompletedProcess:
        # Shared tail of both modes: optional watchdog wrapping, then run.
        if watchdog_enabled:
            command = wrap_with_watchdog(command, **watchdog_options)
        return run(command, check=True)

    if mode == "incremental":
        inc_args = runner.get("incremental_args") or []
        if not isinstance(inc_args, list):
            raise ValueError("config.runner.incremental_args must be a list of strings.")
        return _launch(
            build_incremental_cmd(config_path, extra_args=list(map(str, inc_args)))
        )

    if mode != "entry":
        raise ValueError(f"Unsupported runner.mode={mode!r}, expected 'entry' or 'incremental'.")

    # runner-level keys win, then top-level config keys, then the default.
    model_keys = runner.get("model_keys") or settings.get("model_keys") or ["ft"]
    if not isinstance(model_keys, list):
        raise ValueError("runner.model_keys must be a list of strings.")

    nproc_per_node = int(runner.get("nproc_per_node", 1))
    max_evals = int(runner.get("max_evals", settings.get("max_evals", 50)))
    plot_curves = bool(runner.get("plot_curves", settings.get("plot_curves", True)))
    ft_role = runner.get("ft_role", None)
    if ft_role is None:
        ft_role = settings.get("ft_role")

    entry_flags = ["--max-evals", str(max_evals)]
    if plot_curves:
        entry_flags.append("--plot-curves")
    if ft_role:
        entry_flags.extend(["--ft-role", str(ft_role)])

    return _launch(
        build_bayesopt_entry_cmd(
            config_path,
            model_keys=list(map(str, model_keys)),
            nproc_per_node=nproc_per_node,
            extra_args=entry_flags,
        )
    )
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _split_argv(argv: List[str]) -> tuple[List[str], List[str]]:
    """Split ``argv`` at the first ``--`` into (watchdog options, command).

    Raises:
        ValueError: If ``argv`` contains no ``--`` separator.
    """
    try:
        separator_at = argv.index("--")
    except ValueError:
        raise ValueError("Missing '--' separator before the command to run.") from None
    head = argv[:separator_at]
    tail = argv[separator_at + 1 :]
    return head, tail
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _kill_process_tree(pid: int) -> None:
    """Best-effort termination of process ``pid`` together with its children.

    On Windows this delegates to ``taskkill /T /F``. Elsewhere ``pid`` is
    treated as a process-group id: the group receives SIGTERM, gets a
    two-second grace period, and is then sent SIGKILL. If signalling the group
    fails, the single process is sent SIGKILL instead.

    Failures to deliver a signal (for example because the target has already
    exited) are ignored; this function never raises for them.

    Args:
        pid: Process id of the tree's root. Values <= 0 are ignored.
    """
    # Local import: only the POSIX branch needs it.
    import signal

    if pid <= 0:
        # Non-positive ids do not name a single process, so never signal them.
        return

    if os.name == "nt":
        subprocess.run(
            ["taskkill", "/PID", str(pid), "/T", "/F"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
        return

    try:
        # NOTE(review): using pid as the group id assumes the child leads its
        # own process group/session — confirm against the caller's Popen flags.
        os.killpg(pid, signal.SIGTERM)
        time.sleep(2)  # grace period before the hard kill
        os.killpg(pid, signal.SIGKILL)
    except OSError:
        # Group missing or not signalable: fall back to the process itself.
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            # Already gone (or not ours to kill); nothing more to do.
            pass
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _reader_thread(
    proc: subprocess.Popen, last_output_ts: dict, prefix: str = ""
) -> None:
    """Echo the child's output line by line and record when it last spoke.

    Every line read from ``proc.stdout`` updates ``last_output_ts["ts"]`` with
    the current ``time.time()`` and is written (after ``prefix``, when one is
    given) to ``sys.stdout``, which is flushed after each line.

    Args:
        proc: Process whose ``stdout`` is iterated; it must not be ``None``.
        last_output_ts: Mutable mapping whose ``"ts"`` entry is refreshed.
        prefix: Optional text written before each line.
    """
    stream = proc.stdout
    assert stream is not None
    for text in stream:
        last_output_ts["ts"] = time.time()
        pieces = (prefix, text) if prefix else (text,)
        for piece in pieces:
            sys.stdout.write(piece)
        sys.stdout.flush()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _parse_args(before_cmd: List[str], cmd: List[str]) -> argparse.Namespace:
    """Parse the watchdog's own options and reject an empty command.

    Args:
        before_cmd: Arguments that precede the ``--`` separator.
        cmd: The command that follows the separator; only checked for
            emptiness here.

    Returns:
        Namespace with ``idle_seconds``, ``max_restarts``,
        ``restart_delay_seconds`` and ``stop_on_nonzero_exit``.
    """
    summary = (
        "Run a command under a simple watchdog: if there is no stdout/stderr "
        "output for N seconds, kill the whole process tree and restart. "
        "Designed to pair with optuna_storage so BayesOpt can resume."
    )
    parser = argparse.ArgumentParser(description=summary)

    integer_options = (
        (
            "--idle-seconds",
            7200,
            "Restart if there is no output for this many seconds (default: 7200).",
        ),
        ("--max-restarts", 50, "Maximum restart attempts (default: 50)."),
        ("--restart-delay-seconds", 10, "Delay between restarts (default: 10)."),
    )
    for flag, fallback, description in integer_options:
        parser.add_argument(flag, type=int, default=fallback, help=description)
    parser.add_argument(
        "--stop-on-nonzero-exit",
        action="store_true",
        help="If the command exits non-zero, stop instead of restarting.",
    )

    parsed = parser.parse_args(before_cmd)
    if not cmd:
        # parser.error prints the usage message and exits the process.
        parser.error("Empty command after '--'.")
    return parsed
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def run_with_watchdog(
    cmd: List[str],
    idle_seconds: int,
    max_restarts: int,
    restart_delay_seconds: int,
    stop_on_nonzero_exit: bool,
) -> int:
    """Run ``cmd`` under supervision, restarting it when it stalls or fails.

    The child's stdout and stderr are merged into one pipe that a background
    thread echoes to this process's stdout. When no output has been seen for
    more than ``idle_seconds`` the child's process tree is killed and the
    command is started again.

    Args:
        cmd: Argument vector of the command to supervise.
        idle_seconds: Output-silence threshold in seconds (clamped to >= 1).
        max_restarts: Maximum number of restarts after the first start
            (clamped to >= 0); ``0`` means the command runs exactly once.
        restart_delay_seconds: Pause before each restart (clamped to >= 0).
        stop_on_nonzero_exit: When true, a non-zero exit that was not caused by
            an idle kill ends supervision instead of triggering a restart.

    Returns:
        ``0`` when the command succeeds; otherwise the last observed exit code
        (``1`` when no exit code could be obtained).
    """
    idle_seconds = max(1, int(idle_seconds))
    max_restarts = max(0, int(max_restarts))
    restart_delay_seconds = max(0, int(restart_delay_seconds))

    attempt = 0
    while True:
        attempt += 1
        print(
            f"[watchdog] start attempt={attempt} idle_seconds={idle_seconds} cmd={cmd}",
            flush=True,
        )

        # Put the child in its own group/session so the whole tree can be
        # signalled without touching the watchdog itself.
        creationflags = 0
        start_new_session = False
        if os.name == "nt":
            creationflags = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0)
        else:
            start_new_session = True

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so output timestamps stay current
            creationflags=creationflags,
            start_new_session=start_new_session,
        )

        last_output_ts: dict = {"ts": time.time()}
        reader = threading.Thread(
            target=_reader_thread,
            args=(proc, last_output_ts),
            kwargs={"prefix": ""},
            daemon=True,
        )
        reader.start()

        killed_for_idle = False
        exit_code: Optional[int] = None
        while True:
            exit_code = proc.poll()
            if exit_code is not None:
                break
            idle_for = time.time() - float(last_output_ts["ts"])
            if idle_for > idle_seconds:
                killed_for_idle = True
                print(
                    f"[watchdog] idle>{idle_seconds}s (idle_for={int(idle_for)}s), killing pid={proc.pid}",
                    flush=True,
                )
                _kill_process_tree(proc.pid)
                break
            time.sleep(5)

        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            # Still alive after the grace period: try once more.
            _kill_process_tree(proc.pid)

        # Let the reader drain trailing output before we report or exit. The
        # join is bounded because a surviving grandchild may hold the pipe
        # open indefinitely.
        reader.join(timeout=5)
        if not reader.is_alive() and proc.stdout is not None:
            # Close only once nobody is reading, so restarts do not leak fds.
            proc.stdout.close()

        if exit_code is None:
            # Killed by us: never report success, even if poll() yields 0/None.
            exit_code = proc.poll() or 1

        if exit_code == 0:
            print("[watchdog] finished with exit_code=0", flush=True)
            return 0

        if stop_on_nonzero_exit and not killed_for_idle:
            print(
                f"[watchdog] command exited non-zero (exit_code={exit_code}); stop.",
                flush=True,
            )
            return int(exit_code)

        # attempt - 1 restarts have happened so far; stop once the budget of
        # max_restarts is used up.
        if attempt > max_restarts:
            print(
                f"[watchdog] exceeded max_restarts={max_restarts}; last exit_code={exit_code}",
                flush=True,
            )
            return int(exit_code)

        print(
            f"[watchdog] restart in {restart_delay_seconds}s (exit_code={exit_code}, killed_for_idle={killed_for_idle})",
            flush=True,
        )
        if restart_delay_seconds:
            time.sleep(restart_delay_seconds)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point of the watchdog.

    Args:
        argv: Arguments to parse; defaults to ``sys.argv[1:]``. They must
            contain a ``--`` separator followed by the command to supervise.

    Returns:
        The exit code produced by ``run_with_watchdog``.
    """
    raw_args = list(sys.argv[1:]) if argv is None else list(argv)
    watchdog_opts, child_cmd = _split_argv(raw_args)
    options = _parse_args(watchdog_opts, child_cmd)
    return run_with_watchdog(
        cmd=child_cmd,
        idle_seconds=options.idle_seconds,
        max_restarts=options.max_restarts,
        restart_delay_seconds=options.restart_delay_seconds,
        stop_on_nonzero_exit=bool(options.stop_on_nonzero_exit),
    )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# Script entry point: exit with the watchdog's return code.
if __name__ == "__main__":
    sys.exit(main())
|
|
202
|
+
|