vibego 1.0.2__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bot.py +345 -121
- master.py +18 -3
- scripts/master_healthcheck.py +34 -134
- scripts/requirements.txt +1 -0
- scripts/run_bot.sh +3 -0
- scripts/session_pointer_watch.py +265 -0
- scripts/start.sh +16 -9
- scripts/start_tmux_codex.sh +25 -0
- tasks/service.py +1 -46
- {vibego-1.0.2.dist-info → vibego-1.0.11.dist-info}/METADATA +10 -20
- {vibego-1.0.2.dist-info → vibego-1.0.11.dist-info}/RECORD +16 -15
- vibego_cli/__init__.py +1 -1
- {vibego-1.0.2.dist-info → vibego-1.0.11.dist-info}/WHEEL +0 -0
- {vibego-1.0.2.dist-info → vibego-1.0.11.dist-info}/entry_points.txt +0 -0
- {vibego-1.0.2.dist-info → vibego-1.0.11.dist-info}/licenses/LICENSE +0 -0
- {vibego-1.0.2.dist-info → vibego-1.0.11.dist-info}/top_level.txt +0 -0
scripts/master_healthcheck.py
CHANGED
|
@@ -1,137 +1,65 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""Master
|
|
2
|
+
"""Master post-startup health check.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
4. Send a probe message to the chat through the Telegram Bot API to confirm that the sending is successful.
|
|
9
|
-
5. If any step fails, an exception is thrown and an attempt is made to notify the administrator.
|
|
10
|
-
|
|
11
|
-
Note: This script will not automatically retry the restart and will only return a non-zero exit code for processing by the outer script.
|
|
4
|
+
This lightweight variant only verifies that the master process has emitted the
|
|
5
|
+
readiness marker in its log file. Worker bootstrap and Telegram probes are no
|
|
6
|
+
longer executed because master restart confirmation is now limited to the
|
|
7
|
+
controller process itself.
|
|
12
8
|
"""
|
|
13
9
|
from __future__ import annotations
|
|
14
10
|
|
|
15
11
|
import argparse
|
|
16
|
-
import asyncio
|
|
17
12
|
import json
|
|
18
13
|
import os
|
|
19
|
-
import re
|
|
20
14
|
import sys
|
|
21
15
|
import time
|
|
22
16
|
from pathlib import Path
|
|
23
|
-
from typing import Optional
|
|
24
|
-
from urllib.error import URLError, HTTPError
|
|
25
17
|
from urllib.request import Request, urlopen
|
|
26
18
|
|
|
27
|
-
#
|
|
19
|
+
# Make sure the master module can be imported from the repository root
|
|
28
20
|
ROOT_DIR = Path(__file__).resolve().parent.parent
|
|
29
21
|
ROOT_DIR_STR = str(ROOT_DIR)
|
|
30
22
|
if ROOT_DIR_STR not in sys.path:
|
|
31
|
-
# Make sure the master module can be imported from the repository root
|
|
32
23
|
sys.path.insert(0, ROOT_DIR_STR)
|
|
33
24
|
|
|
34
25
|
import master # type: ignore
|
|
35
|
-
|
|
26
|
+
|
|
36
27
|
DEFAULT_MASTER_LOG = master.LOG_ROOT_PATH / "vibe.log"
|
|
37
28
|
DEFAULT_TIMEOUT_MASTER = 60.0
|
|
38
|
-
|
|
39
|
-
PROBE_TEXT = "hello"
|
|
40
|
-
REPOSITORY = ProjectRepository(master.CONFIG_DB_PATH, master.CONFIG_PATH)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def _load_project(project_id: str) -> master.ProjectConfig:
|
|
44
|
-
"""Get the project configuration based on the slug or bot name, and list the options on failure."""
|
|
45
|
-
|
|
46
|
-
record = REPOSITORY.get_by_slug(project_id)
|
|
47
|
-
if record is None:
|
|
48
|
-
record = REPOSITORY.get_by_bot_name(project_id)
|
|
49
|
-
if record is None:
|
|
50
|
-
available = [r.project_slug for r in REPOSITORY.list_projects()]
|
|
51
|
-
raise RuntimeError(f"No items found {project_id}, Optional items: {available}")
|
|
52
|
-
return master.ProjectConfig.from_dict(record.to_dict())
|
|
29
|
+
MASTER_READY_MARKER = "Master Started, listening for administrator commands."
|
|
53
30
|
|
|
54
31
|
|
|
55
32
|
def _wait_for_log_flag(path: Path, pattern: str, timeout: float) -> None:
|
|
56
|
-
"""
|
|
33
|
+
"""Poll the master log until the readiness marker is detected or timeout."""
|
|
57
34
|
|
|
58
35
|
deadline = time.monotonic() + timeout
|
|
59
|
-
|
|
36
|
+
if path.exists():
|
|
37
|
+
position = path.stat().st_size
|
|
38
|
+
initialized = True
|
|
39
|
+
else:
|
|
40
|
+
position = 0
|
|
41
|
+
initialized = False
|
|
60
42
|
while time.monotonic() < deadline:
|
|
61
43
|
if path.exists():
|
|
62
|
-
if
|
|
63
|
-
position =
|
|
44
|
+
if not initialized:
|
|
45
|
+
position = 0
|
|
46
|
+
initialized = True
|
|
64
47
|
with path.open("r", encoding="utf-8", errors="ignore") as fh:
|
|
65
48
|
fh.seek(position)
|
|
66
49
|
while time.monotonic() < deadline:
|
|
67
50
|
line = fh.readline()
|
|
68
51
|
if not line:
|
|
69
52
|
time.sleep(0.5)
|
|
70
|
-
|
|
53
|
+
break
|
|
71
54
|
position = fh.tell()
|
|
72
55
|
if pattern in line:
|
|
73
56
|
return
|
|
74
57
|
time.sleep(0.5)
|
|
75
|
-
raise TimeoutError(f"
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def _extract_chat_id_from_logs(log_path: Path) -> Optional[int]:
|
|
79
|
-
"""Find the most recent chat from the log file in reverse order_id."""
|
|
80
|
-
|
|
81
|
-
if not log_path.exists():
|
|
82
|
-
return None
|
|
83
|
-
pattern = re.compile(r"chat=(-?\d+)")
|
|
84
|
-
try:
|
|
85
|
-
lines = log_path.read_text(encoding="utf-8", errors="ignore").splitlines()
|
|
86
|
-
except Exception:
|
|
87
|
-
return None
|
|
88
|
-
for line in reversed(lines[-200:]): # Reverse search for recent records
|
|
89
|
-
match = pattern.search(line)
|
|
90
|
-
if match:
|
|
91
|
-
try:
|
|
92
|
-
return int(match.group(1))
|
|
93
|
-
except ValueError:
|
|
94
|
-
continue
|
|
95
|
-
return None
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def _ensure_chat_id(cfg: master.ProjectConfig, manager: master.MasterManager) -> int:
|
|
99
|
-
"""Make sure the task is assigned chat_id, Backfill from log and write back to state if necessary."""
|
|
100
|
-
|
|
101
|
-
state = manager.state_store.data.get(cfg.project_slug)
|
|
102
|
-
if state and state.chat_id:
|
|
103
|
-
return int(state.chat_id)
|
|
104
|
-
# Fall back to log search
|
|
105
|
-
log_dir = master.LOG_ROOT_PATH / (cfg.default_model.lower()) / cfg.project_slug
|
|
106
|
-
chat_id = _extract_chat_id_from_logs(log_dir / "run_bot.log")
|
|
107
|
-
if chat_id is None:
|
|
108
|
-
raise RuntimeError(
|
|
109
|
-
"Unable to get chat automatically_id, Please manually have a conversation with the bot to write the state/log"
|
|
110
|
-
)
|
|
111
|
-
# will discover chat_id Write back the state for easy reuse next time
|
|
112
|
-
manager.state_store.update(cfg.project_slug, chat_id=chat_id)
|
|
113
|
-
return chat_id
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def _send_probe(bot_token: str, chat_id: int, text: str, timeout: float) -> None:
|
|
117
|
-
"""Send a probe message to the specified chat to verify that the Telegram API is available."""
|
|
118
|
-
|
|
119
|
-
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
|
|
120
|
-
payload = json.dumps({"chat_id": chat_id, "text": text, "disable_notification": True}).encode("utf-8")
|
|
121
|
-
request = Request(url, data=payload, headers={"Content-Type": "application/json"}, method="POST")
|
|
122
|
-
try:
|
|
123
|
-
with urlopen(request, timeout=timeout) as resp:
|
|
124
|
-
data = json.loads(resp.read().decode("utf-8"))
|
|
125
|
-
except HTTPError as exc: # pragma: no cover - Thrown when network exception occurs
|
|
126
|
-
raise RuntimeError(f"Failed to send probe message, HTTP {exc.code}: {exc.reason}") from exc
|
|
127
|
-
except URLError as exc: # pragma: no cover - Thrown when network exception occurs
|
|
128
|
-
raise RuntimeError(f"Failed to send probe message: {exc}") from exc
|
|
129
|
-
if not data.get("ok"):
|
|
130
|
-
raise RuntimeError(f"Failed to send probe message: {data}")
|
|
58
|
+
raise TimeoutError(f"No log markers detected within {timeout:.0f}s: {pattern}")
|
|
131
59
|
|
|
132
60
|
|
|
133
61
|
def _format_admin_notice(reason: str) -> str:
|
|
134
|
-
"""
|
|
62
|
+
"""Compose the notification text for administrator alerts."""
|
|
135
63
|
|
|
136
64
|
return (
|
|
137
65
|
"Master Restart health check failed\n"
|
|
@@ -141,7 +69,7 @@ def _format_admin_notice(reason: str) -> str:
|
|
|
141
69
|
|
|
142
70
|
|
|
143
71
|
def _notify_admins(reason: str) -> None:
|
|
144
|
-
"""
|
|
72
|
+
"""Broadcast the failure reason to administrators if the master token exists."""
|
|
145
73
|
|
|
146
74
|
master_token = os.environ.get("MASTER_BOT_TOKEN")
|
|
147
75
|
if not master_token:
|
|
@@ -163,58 +91,30 @@ def _notify_admins(reason: str) -> None:
|
|
|
163
91
|
continue
|
|
164
92
|
|
|
165
93
|
|
|
166
|
-
def _ensure_worker(cfg: master.ProjectConfig) -> master.MasterManager:
|
|
167
|
-
"""Starts the specified project worker and returns the temporarily constructed MasterManager."""
|
|
168
|
-
|
|
169
|
-
records = REPOSITORY.list_projects()
|
|
170
|
-
configs = [master.ProjectConfig.from_dict(record.to_dict()) for record in records]
|
|
171
|
-
state_store = master.StateStore(
|
|
172
|
-
master.STATE_PATH, {item.project_slug: item for item in configs}
|
|
173
|
-
)
|
|
174
|
-
manager = master.MasterManager(configs, state_store=state_store)
|
|
175
|
-
|
|
176
|
-
async def _run() -> None:
|
|
177
|
-
"""The coroutine performs the actual stop/start process."""
|
|
178
|
-
# Make sure to stop the old instance first(If it existsexist)
|
|
179
|
-
try:
|
|
180
|
-
await manager.stop_worker(cfg)
|
|
181
|
-
except Exception:
|
|
182
|
-
pass
|
|
183
|
-
await manager.run_worker(cfg)
|
|
184
|
-
|
|
185
|
-
asyncio.run(_run())
|
|
186
|
-
return manager
|
|
187
|
-
|
|
188
|
-
|
|
189
94
|
def main() -> int:
|
|
190
|
-
"""Command line entry,
|
|
191
|
-
|
|
192
|
-
parser = argparse.ArgumentParser(description="Master
|
|
193
|
-
parser.add_argument("--
|
|
194
|
-
parser.add_argument(
|
|
195
|
-
|
|
196
|
-
|
|
95
|
+
"""Command line entry point, only validates master readiness."""
|
|
96
|
+
|
|
97
|
+
parser = argparse.ArgumentParser(description="Master post-launch health check (master only)")
|
|
98
|
+
parser.add_argument("--master-log", default=str(DEFAULT_MASTER_LOG), help="Master log path")
|
|
99
|
+
parser.add_argument(
|
|
100
|
+
"--master-timeout",
|
|
101
|
+
type=float,
|
|
102
|
+
default=DEFAULT_TIMEOUT_MASTER,
|
|
103
|
+
help="Master log wait timeout (seconds)",
|
|
104
|
+
)
|
|
197
105
|
args = parser.parse_args()
|
|
198
106
|
|
|
199
|
-
project_id = master._sanitize_slug(args.project)
|
|
200
107
|
master_log = Path(args.master_log)
|
|
201
108
|
|
|
202
109
|
try:
|
|
203
|
-
_wait_for_log_flag(master_log,
|
|
204
|
-
cfg = _load_project(project_id)
|
|
205
|
-
manager = _ensure_worker(cfg)
|
|
206
|
-
chat_id = _ensure_chat_id(cfg, manager)
|
|
207
|
-
_send_probe(cfg.bot_token, chat_id, PROBE_TEXT, args.probe_timeout)
|
|
110
|
+
_wait_for_log_flag(master_log, MASTER_READY_MARKER, args.master_timeout)
|
|
208
111
|
except Exception as exc:
|
|
209
112
|
reason = str(exc)
|
|
210
113
|
_notify_admins(reason)
|
|
211
114
|
print(f"[healthcheck] fail: {reason}", file=sys.stderr)
|
|
212
115
|
return 1
|
|
213
116
|
else:
|
|
214
|
-
print(
|
|
215
|
-
"[healthcheck] success: master ready,"
|
|
216
|
-
f"worker={cfg.display_name} Startup completed, chat_id={chat_id}, Probe message sent"
|
|
217
|
-
)
|
|
117
|
+
print("[healthcheck] success: master ready, worker checks skipped by configuration")
|
|
218
118
|
return 0
|
|
219
119
|
|
|
220
120
|
|
scripts/requirements.txt
CHANGED
scripts/run_bot.sh
CHANGED
|
@@ -90,6 +90,7 @@ LOG_DIR="$(log_dir_for "$MODEL" "$PROJECT_NAME")"
|
|
|
90
90
|
MODEL_LOG="$LOG_DIR/model.log"
|
|
91
91
|
RUN_LOG="$LOG_DIR/run_bot.log"
|
|
92
92
|
POINTER_FILE="$LOG_DIR/${MODEL_POINTER_BASENAME:-current_session.txt}"
|
|
93
|
+
LOCK_FILE="${SESSION_LOCK_FILE:-$LOG_DIR/${MODEL_POINTER_LOCK_BASENAME:-session_lock.json}}"
|
|
93
94
|
TMUX_SESSION="$(tmux_session_for "$PROJECT_NAME")"
|
|
94
95
|
|
|
95
96
|
expand_model_workdir() {
|
|
@@ -167,6 +168,8 @@ export MODEL_SESSION_ROOT
|
|
|
167
168
|
export MODEL_SESSION_GLOB
|
|
168
169
|
export SESSION_POINTER_FILE="$POINTER_FILE"
|
|
169
170
|
export CODEX_SESSION_FILE_PATH="$POINTER_FILE"
|
|
171
|
+
export SESSION_LOCK_FILE="$LOCK_FILE"
|
|
172
|
+
export SESSION_LOCK_FILE_PATH="$LOCK_FILE"
|
|
170
173
|
export TMUX_SESSION="$TMUX_SESSION"
|
|
171
174
|
export TMUX_LOG="$MODEL_LOG"
|
|
172
175
|
export PROJECT_NAME="$PROJECT_NAME"
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Capture the session JSONL path generated after a worker launch and lock it.
|
|
3
|
+
|
|
4
|
+
This helper watches the Codex/Claude session directory after the tmux worker
|
|
5
|
+
starts. The first rollout file that matches the configured working directory is
|
|
6
|
+
recorded into both the pointer file (used by the worker for streaming) and a
|
|
7
|
+
dedicated lock file so subsequent bindings never drift to another CLI session.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import sys
|
|
16
|
+
import time
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
from fnmatch import fnmatch
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from threading import Event, Lock
|
|
21
|
+
from typing import Iterable, Optional
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from watchdog.events import FileSystemEvent, FileSystemEventHandler
|
|
25
|
+
from watchdog.observers import Observer
|
|
26
|
+
except Exception: # pragma: no cover - watchdog is an optional dependency fallback
|
|
27
|
+
Observer = None
|
|
28
|
+
FileSystemEventHandler = object # type: ignore[assignment]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
log = logging.getLogger("session_pointer_watch")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _resolve(path: str) -> Path:
    """Return *path* as a ``Path`` with ``~`` expanded and then resolved."""
    expanded = Path(path).expanduser()
    return expanded.resolve()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _read_session_cwd(path: Path) -> Optional[str]:
    """Return ``payload.cwd`` from the first JSON line of *path*, if present.

    Only the first line of the file is read. ``None`` is returned when the file
    cannot be opened, the first line is empty or is not valid JSON, the decoded
    value is not a JSON object, or ``payload.cwd`` is missing or not a string.
    """

    try:
        with path.open("r", encoding="utf-8", errors="ignore") as fh:
            first_line = fh.readline()
    except OSError:
        return None
    if not first_line:
        return None
    try:
        data = json.loads(first_line)
    except json.JSONDecodeError:
        return None
    # A syntactically valid line may decode to a list/str/number, which has no
    # ``.get``; treat it like any other unusable header instead of raising.
    if not isinstance(data, dict):
        return None
    payload = data.get("payload")
    if isinstance(payload, dict):
        raw = payload.get("cwd")
        if isinstance(raw, str):
            return raw
    return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _iter_candidate_files(roots: Iterable[Path], glob: str) -> Iterable[Path]:
    """Lazily yield the ``**/<glob>`` matches found under each existing root.

    Roots that do not exist are skipped. Each remaining root is resolved before
    globbing; if resolving raises ``OSError`` the root is used as given.
    """
    recursive_pattern = f"**/{glob}"
    for root in roots:
        if root.exists():
            try:
                search_base = root.resolve()
            except OSError:
                search_base = root
            for match in search_base.glob(recursive_pattern):
                yield match
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class _RolloutCapture(FileSystemEventHandler):
    """Handle filesystem events and pick the first rollout that matches criteria.

    Candidates come either from the watchdog callbacks (``on_created`` /
    ``on_moved``) or from the directory sweep in ``poll_until_found``; both
    paths funnel into ``_consider``. The first accepted path is stored exactly
    once and announced through an ``Event``.
    """

    def __init__(
        self,
        *,
        pattern: str,
        baseline: set[str],
        start_wall: float,
        start_monotonic: float,
        target_cwd: Optional[str],
        timeout: float,
        poll_interval: float,
    ) -> None:
        """Store the selection criteria.

        Args:
            pattern: ``fnmatch`` pattern applied to the candidate file name.
            baseline: Resolved path strings that must never be selected.
            start_wall: Wall-clock timestamp; files whose mtime is older than
                this (minus a 0.01s tolerance) are ignored.
            start_monotonic: Monotonic timestamp the overall timeout counts from.
            target_cwd: When truthy, the ``payload.cwd`` a candidate should
                report; when falsy, the cwd check is skipped.
            timeout: Seconds after ``start_monotonic`` at which
                ``poll_until_found`` gives up.
            poll_interval: Seconds between sweeps and between header re-reads.
        """
        self._pattern = pattern
        self._baseline = baseline
        self._start_wall = start_wall
        self._target_cwd = target_cwd
        self._deadline = start_monotonic + timeout
        self._poll_interval = poll_interval
        self._chosen: Optional[Path] = None
        self._event = Event()
        self._lock = Lock()

    @staticmethod
    def _same_directory(recorded: str, expected: str) -> bool:
        """Return True when both strings name the same directory.

        Exact string equality is tried first. Otherwise both values are
        compared after ``expanduser().resolve()``, so a trailing slash or a
        symlinked spelling does not count as a mismatch. Relative paths are
        never normalized because they would resolve against this process's
        own working directory.
        """
        if recorded == expected:
            return True
        try:
            left = Path(recorded).expanduser()
            right = Path(expected).expanduser()
            if not (left.is_absolute() and right.is_absolute()):
                return False
            return left.resolve() == right.resolve()
        except (OSError, RuntimeError, ValueError):
            return False

    def _consider(self, candidate: Path) -> None:
        """Record *candidate* as the chosen rollout if it passes every filter.

        A candidate is rejected when it is a directory, its name does not match
        the pattern, its resolved path is in the baseline, it cannot be
        stat'ed, its mtime predates the start time, or its header reports a cwd
        that differs from the target.
        """
        if candidate.is_dir():
            return
        name = candidate.name
        if not fnmatch(name, self._pattern):
            return
        try:
            real_path = candidate.resolve()
        except OSError:
            real_path = candidate
        real_key = str(real_path)
        if real_key in self._baseline:
            return
        try:
            stat = real_path.stat()
        except OSError:
            return
        if stat.st_mtime + 0.01 < self._start_wall:
            # Ignore historical files.
            return

        if self._target_cwd:
            # Wait until the JSON header is flushed and matches our CWD.
            deadline = time.monotonic() + self._poll_interval * 10
            while time.monotonic() < deadline:
                cwd = _read_session_cwd(real_path)
                if cwd is None:
                    time.sleep(self._poll_interval)
                    continue
                if self._same_directory(cwd, self._target_cwd):
                    break
                log.debug("Skip rollout with mismatched cwd=%s", cwd)
                return
            else:
                # NOTE: when no cwd could be read within the window the
                # candidate is still accepted below, without cwd verification.
                log.debug("No cwd header read from %s within wait window; accepting unverified", real_path)
        with self._lock:
            # First writer wins; later candidates never overwrite the choice.
            if self._chosen is None:
                self._chosen = real_path
                self._event.set()

    # The following methods are only called when watchdog is available.
    def on_created(self, event: FileSystemEvent) -> None:  # type: ignore[override]
        """Evaluate a newly created file; directory events are ignored."""
        if getattr(event, "is_directory", False):
            return
        self._consider(Path(event.src_path))

    def on_moved(self, event: FileSystemEvent) -> None:  # type: ignore[override]
        """Evaluate the destination of a move; directory events are ignored."""
        if getattr(event, "is_directory", False):
            return
        self._consider(Path(event.dest_path))

    def poll_until_found(self, roots: Iterable[Path]) -> Optional[Path]:
        """Wait for a chosen rollout, sweeping *roots* once per poll interval.

        Returns the chosen path, or ``None`` when the deadline passes without
        any candidate being accepted.
        """
        # Snapshot so a one-shot iterable can still be swept on every pass.
        roots = list(roots)
        while time.monotonic() < self._deadline:
            remaining = self._deadline - time.monotonic()
            if remaining <= 0:
                break
            wait_time = min(self._poll_interval, remaining)
            if self._event.wait(timeout=wait_time):
                break
            for candidate in _iter_candidate_files(roots, self._pattern):
                self._consider(candidate)
                if self._event.is_set():
                    break
            if self._event.is_set():
                break
        return self._chosen
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _write_pointer(pointer: Path, session_path: Path) -> None:
    """Write ``str(session_path)`` as UTF-8 into *pointer*, creating parent directories."""
    target_dir = pointer.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with pointer.open("w", encoding="utf-8") as fh:
        fh.write(str(session_path))
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _write_lock(lock_file: Path, *, session_path: Path, tmux_session: str, project: str, workdir: str, method: str) -> None:
    """Persist the capture metadata as indented JSON in *lock_file*.

    The document holds ``session_path``, a UTC ISO-8601 ``captured_at``
    timestamp, and the ``tmux_session``, ``project``, ``workdir`` and
    ``method`` values as given. Parent directories are created as needed.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    *lock_file*, so a reader never sees an empty or half-written document.

    Raises:
        OSError: If the directory or file cannot be written; the temporary
            file is removed on a best-effort basis before re-raising.
    """
    payload = {
        "session_path": str(session_path),
        "captured_at": datetime.now(timezone.utc).isoformat(),
        "tmux_session": tmux_session,
        "project": project,
        "workdir": workdir,
        "method": method,
    }
    lock_file.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = lock_file.with_name(lock_file.name + ".tmp")
    try:
        tmp_file.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        tmp_file.replace(lock_file)
    except OSError:
        # Best-effort cleanup; the original error is the one worth reporting.
        try:
            tmp_file.unlink()
        except OSError:
            pass
        raise
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """Wait for a new rollout file and record it in the pointer and lock files.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after the pointer and lock files were written, ``1`` when no
        rollout was detected within ``--timeout`` seconds.
    """
    parser = argparse.ArgumentParser(description="Capture newly created Codex/Claude rollout JSONL and write pointer+lock files.")
    parser.add_argument("--pointer", required=True, help="Path to current_session.txt pointer file")
    parser.add_argument("--lock", required=True, help="Path to persistent lock metadata JSON")
    parser.add_argument("--session-root", required=False, help="Primary sessions root directory")
    parser.add_argument("--additional-root", action="append", default=[], help="Extra directories to monitor for rollouts")
    parser.add_argument("--glob", default="rollout-*.jsonl", help="Glob pattern for rollout files")
    parser.add_argument("--workdir", default="", help="Model working directory, used to filter sessions")
    parser.add_argument("--tmux-session", default="", help="tmux session name for diagnostics")
    parser.add_argument("--project", default="", help="Project slug for diagnostics")
    parser.add_argument("--timeout", type=float, default=180.0, help="Maximum seconds to wait for a new rollout")
    parser.add_argument("--poll", type=float, default=0.5, help="Polling interval when watchdog is unavailable")
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    pointer_path = _resolve(args.pointer)
    lock_path = _resolve(args.lock)

    # Build the de-duplicated, ordered list of directories to watch: the
    # primary session root, the pointer file's directory, then any extras.
    roots: list[Path] = []
    if args.session_root:
        roots.append(_resolve(args.session_root))
    pointer_parent = pointer_path.parent
    if pointer_parent not in roots:
        roots.append(pointer_parent)
    for raw in args.additional_root:
        resolved = _resolve(raw)
        if resolved not in roots:
            roots.append(resolved)

    # Files that already exist are recorded so they are never picked.
    baseline = {str(path.resolve()) for path in _iter_candidate_files(roots, args.glob)}
    start_wall = time.time()
    start_monotonic = time.monotonic()

    capture = _RolloutCapture(
        pattern=args.glob,
        baseline=baseline,
        start_wall=start_wall,
        start_monotonic=start_monotonic,
        target_cwd=args.workdir or None,
        timeout=args.timeout,
        poll_interval=max(args.poll, 0.1),
    )

    observer: Optional[Observer] = None
    method = "polling"
    if Observer is not None:
        pending = Observer()
        try:
            for root in roots:
                if root.exists():
                    pending.schedule(capture, str(root), recursive=True)
            pending.start()
        except Exception as exc:
            # poll_until_found sweeps the roots on its own, so a watcher that
            # cannot start must not abort the capture.
            log.warning("watchdog observer failed to start (%s), falling back to polling", exc)
        else:
            observer = pending
            method = "watchdog"
    else:  # pragma: no cover - watchdog is optional
        log.warning("watchdog not available, falling back to polling")

    try:
        session_path = capture.poll_until_found(roots)
    finally:
        if observer is not None:
            observer.stop()
            observer.join(timeout=5)

    if session_path is None:
        log.error(
            "Failed to detect rollout file within timeout %.1fs (tmux session=%s, project=%s)",
            args.timeout,
            args.tmux_session or "-",
            args.project or "-",
        )
        return 1

    _write_pointer(pointer_path, session_path)
    _write_lock(
        lock_path,
        session_path=session_path,
        tmux_session=args.tmux_session,
        project=args.project,
        workdir=args.workdir,
        method=method,
    )

    log.info("Recorded session pointer -> %s", session_path)
    return 0
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
if __name__ == "__main__":
|
|
265
|
+
sys.exit(main())
|
scripts/start.sh
CHANGED
|
@@ -118,8 +118,9 @@ ensure_codex_installed() {
|
|
|
118
118
|
ensure_codex_installed
|
|
119
119
|
|
|
120
120
|
select_python_binary() {
|
|
121
|
-
#
|
|
121
|
+
# Select a compatible CPython version; defaults accept 3.9-3.14 and can be overridden via env variables.
|
|
122
122
|
local allow_py313="${VIBEGO_ALLOW_PY313:-}"
|
|
123
|
+
local supported_max_minor="${VIBEGO_MAX_MINOR:-14}"
|
|
123
124
|
local candidates=()
|
|
124
125
|
local chosen=""
|
|
125
126
|
local name
|
|
@@ -152,18 +153,24 @@ select_python_binary() {
|
|
|
152
153
|
if [[ -n "${VIBEGO_PYTHON:-}" && "$name" == "$VIBEGO_PYTHON" ]]; then
|
|
153
154
|
explicit_override=1
|
|
154
155
|
fi
|
|
155
|
-
if [[ "$minor" =~ ^[0-9]+$ ]] && (( minor == 13 )) &&
|
|
156
|
-
|
|
157
|
-
|
|
156
|
+
if [[ "$minor" =~ ^[0-9]+$ ]] && (( minor == 13 )) && (( explicit_override == 0 )); then
|
|
157
|
+
if [[ "$allow_py313" == "0" ]]; then
|
|
158
|
+
log_line "Skip ${name} (version ${version_raw}): disabled explicitly by VIBEGO_ALLOW_PY313=0" >&2
|
|
159
|
+
continue
|
|
160
|
+
fi
|
|
161
|
+
log_line "Detected ${name} (version ${version_raw}): Python 3.13 accepted by default; use VIBEGO_ALLOW_PY313=1 to prefer or 0 to disable" >&2
|
|
158
162
|
fi
|
|
159
|
-
if [[ "$minor" =~ ^[0-9]+$ ]] && (( minor >
|
|
160
|
-
log_line "
|
|
163
|
+
if [[ "$minor" =~ ^[0-9]+$ ]] && (( minor > supported_max_minor )) && (( explicit_override == 0 )); then
|
|
164
|
+
log_line "Skip ${name} (version ${version_raw}): above supported ceiling 3.${supported_max_minor}; override with VIBEGO_MAX_MINOR if needed" >&2
|
|
161
165
|
continue
|
|
162
166
|
fi
|
|
163
167
|
if [[ "$minor" =~ ^[0-9]+$ ]] && (( minor < 9 )); then
|
|
164
168
|
log_line "jump over ${name} (Version ${version_raw}): less than 3.9, May be missing official wheels" >&2
|
|
165
169
|
continue
|
|
166
170
|
fi
|
|
171
|
+
if [[ "$minor" =~ ^[0-9]+$ ]] && (( minor >= 14 )); then
|
|
172
|
+
log_line "Detected ${name} (version ${version_raw}): ensure dependencies support this Python version" >&2
|
|
173
|
+
fi
|
|
167
174
|
chosen="$name"
|
|
168
175
|
log_line "Using the Python interpreter:${chosen} (Version ${version_raw})" >&2
|
|
169
176
|
break
|
|
@@ -438,13 +445,13 @@ fi
|
|
|
438
445
|
log_info "master Started in background, PID=$MASTER_PID, Log writing ${LOG_FILE}"
|
|
439
446
|
|
|
440
447
|
# Health check: wait for master to come online (worker checks are no longer part of this step)
|
|
441
|
-
log_info "Start
|
|
448
|
+
log_info "Start master readiness check..."
|
|
442
449
|
HEALTHCHECK_START=$(date +%s)
|
|
443
450
|
|
|
444
|
-
if python scripts/master_healthcheck.py --
|
|
451
|
+
if python scripts/master_healthcheck.py --master-log "$LOG_FILE"; then
|
|
445
452
|
HEALTHCHECK_END=$(date +%s)
|
|
446
453
|
HEALTHCHECK_DURATION=$((HEALTHCHECK_END - HEALTHCHECK_START))
|
|
447
|
-
log_info "OK: Master
|
|
454
|
+
log_info "OK: Master readiness confirmed (elapsed ${HEALTHCHECK_DURATION}s)"
|
|
448
455
|
else
|
|
449
456
|
HEALTHCHECK_END=$(date +%s)
|
|
450
457
|
HEALTHCHECK_DURATION=$((HEALTHCHECK_END - HEALTHCHECK_START))
|
scripts/start_tmux_codex.sh
CHANGED
|
@@ -18,6 +18,11 @@ MODEL_WORKDIR="${MODEL_WORKDIR:-$ROOT_DIR}"
|
|
|
18
18
|
MODEL_SESSION_ROOT="${MODEL_SESSION_ROOT:-${CODEX_SESSION_ROOT:-$HOME/.codex/sessions}}"
|
|
19
19
|
MODEL_SESSION_GLOB="${MODEL_SESSION_GLOB:-rollout-*.jsonl}"
|
|
20
20
|
SESSION_POINTER_FILE="${SESSION_POINTER_FILE:-$LOG_ROOT/${MODEL_NAME:-codex}/${PROJECT_NAME:-project}/current_session.txt}"
|
|
21
|
+
SESSION_LOCK_FILE="${SESSION_LOCK_FILE:-${SESSION_POINTER_FILE%.txt}.lock.json}"
|
|
22
|
+
SESSION_CAPTURE_TIMEOUT="${SESSION_CAPTURE_TIMEOUT:-180}"
|
|
23
|
+
SESSION_CAPTURE_POLL_INTERVAL="${SESSION_CAPTURE_POLL_INTERVAL:-0.5}"
|
|
24
|
+
# Ensure optional CODEX session root is always defined even under `set -u`
|
|
25
|
+
: "${CODEX_SESSIONS_ROOT:=}"
|
|
21
26
|
|
|
22
27
|
# Avoid oh-my-zsh popping up update prompts in non-interactive environments
|
|
23
28
|
export DISABLE_UPDATE_PROMPT="${DISABLE_UPDATE_PROMPT:-true}"
|
|
@@ -67,6 +72,7 @@ MODEL_SESSION_ROOT=$(expand_path "$MODEL_SESSION_ROOT")
|
|
|
67
72
|
SESSION_POINTER_FILE=$(expand_path "$SESSION_POINTER_FILE")
|
|
68
73
|
ensure_dir "$(dirname "$LOG_PATH")"
|
|
69
74
|
ensure_dir "$(dirname "$SESSION_POINTER_FILE")"
|
|
75
|
+
ensure_dir "$(dirname "$SESSION_LOCK_FILE")"
|
|
70
76
|
|
|
71
77
|
run_tmux() {
|
|
72
78
|
if (( DRY_RUN )); then
|
|
@@ -143,5 +149,24 @@ if (( DRY_RUN )); then
|
|
|
143
149
|
fi
|
|
144
150
|
|
|
145
151
|
: > "$SESSION_POINTER_FILE"
|
|
152
|
+
rm -f "$SESSION_LOCK_FILE"
|
|
153
|
+
|
|
154
|
+
if (( ! DRY_RUN )); then
|
|
155
|
+
WATCH_ARGS=("$PYTHON_EXEC" "$ROOT_DIR/scripts/session_pointer_watch.py" "--pointer" "$SESSION_POINTER_FILE" "--lock" "$SESSION_LOCK_FILE" "--glob" "$MODEL_SESSION_GLOB" "--workdir" "$MODEL_WORKDIR" "--tmux-session" "$SESSION_NAME" "--project" "${PROJECT_NAME:-}")
|
|
156
|
+
if [[ -n "$MODEL_SESSION_ROOT" ]]; then
|
|
157
|
+
WATCH_ARGS+=("--session-root" "$MODEL_SESSION_ROOT")
|
|
158
|
+
fi
|
|
159
|
+
if [[ -n "$CODEX_SESSIONS_ROOT" ]]; then
|
|
160
|
+
WATCH_ARGS+=("--additional-root" "$CODEX_SESSIONS_ROOT")
|
|
161
|
+
fi
|
|
162
|
+
WATCH_ARGS+=("--additional-root" "$(dirname "$SESSION_POINTER_FILE")")
|
|
163
|
+
WATCH_ARGS+=("--additional-root" "$(dirname "$SESSION_POINTER_FILE")/sessions")
|
|
164
|
+
WATCH_ARGS+=("--timeout" "$SESSION_CAPTURE_TIMEOUT")
|
|
165
|
+
WATCH_ARGS+=("--poll" "$SESSION_CAPTURE_POLL_INTERVAL")
|
|
166
|
+
if ! "${WATCH_ARGS[@]}"; then
|
|
167
|
+
echo "Failed to capture Codex/Claude session log for tmux session '$SESSION_NAME'. Please check CLI startup output." >&2
|
|
168
|
+
exit 1
|
|
169
|
+
fi
|
|
170
|
+
fi
|
|
146
171
|
|
|
147
172
|
exit 0
|