researchloop 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {researchloop-0.3.1 → researchloop-0.3.2}/PKG-INFO +1 -1
- {researchloop-0.3.1 → researchloop-0.3.2}/pyproject.toml +1 -1
- researchloop-0.3.2/researchloop/__init__.py +1 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/sge.sh.j2 +86 -15
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/slurm.sh.j2 +86 -16
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/report.md.j2 +11 -4
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_runner.py +28 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/uv.lock +1 -1
- researchloop-0.3.1/researchloop/__init__.py +0 -1
- {researchloop-0.3.1 → researchloop-0.3.2}/.github/workflows/ci.yml +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/.github/workflows/docs.yml +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/.github/workflows/release.yml +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/.gitignore +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/CLAUDE.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/Dockerfile +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/LICENSE +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/README.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/assets/mmlu-combined.gif +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/assets/mmlu-combined.mp4 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/cli.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/configuration.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/dashboard.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/deployment.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/development.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/getting-started.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/index.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/security.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/docs/slack.md +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/mkdocs.yml +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/__main__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/cli.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/clusters/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/clusters/monitor.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/clusters/ssh.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/base.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/ntfy.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/router.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/slack.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/config.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/credentials.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/models.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/orchestrator.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/app.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/auth.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/routes.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/base.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/login.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/loop_detail.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/loops.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/search.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/setup.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/sprint_detail.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/sprints.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/studies.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/study_detail.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/study_form.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/tweak_detail.html +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/database.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/migrations.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/queries.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/claude.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/sge_tweak.sh.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/slurm_tweak.sh.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/main.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/pipeline.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/fix_issues.md.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/idea_generator.md.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/red_team.md.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/research_sprint.md.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/summarizer.md.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/tweak.md.j2 +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/upload.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/base.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/local.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/sge.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/slurm.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/sprints/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/sprints/auto_loop.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/sprints/manager.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/studies/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/studies/manager.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/testing/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/testing/slack_mock.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/testing/slack_simulator.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/researchloop.toml.example +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/slack-app-manifest.yml +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/conftest.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/sge/Dockerfile +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/sge/entrypoint.sh +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/sge/mock_claude.sh +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/slurm/Dockerfile +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/slurm/entrypoint.sh +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/slurm/mock_claude.sh +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/__init__.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/conftest.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_loop_advancement.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_loop_and_monitor.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_sge_scheduler.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_slurm_scheduler.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_sprint_slurm.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_webhook_and_refresh.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_api.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_auto_loop.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_cli.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_config.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_dashboard.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_database.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_models.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_notification.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_queries.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_schedulers.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_search.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_sge.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack_events.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack_mock.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack_simulator.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_sprint_manager.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_study_manager.py +0 -0
- {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_tweaks.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.2"
|
|
@@ -54,6 +54,7 @@ log() {
|
|
|
54
54
|
|
|
55
55
|
# --- Helper: run claude and extract session ID ---
|
|
56
56
|
SESSION_ID=""
|
|
57
|
+
ACTIVE_STEP_FILE="$SPRINT_DIR/.researchloop/active_step"
|
|
57
58
|
run_step() {
|
|
58
59
|
local prompt_file="$1"
|
|
59
60
|
local step_name="$2"
|
|
@@ -65,12 +66,33 @@ run_step() {
|
|
|
65
66
|
cmd+=(--resume "$SESSION_ID")
|
|
66
67
|
fi
|
|
67
68
|
|
|
69
|
+
local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
|
|
70
|
+
local pgid_file="$SPRINT_DIR/.researchloop/${step_name}_pgid"
|
|
71
|
+
local result_sentinel="$SPRINT_DIR/.researchloop/${step_name}_result_seen"
|
|
72
|
+
# Reset per-step watchdog state and announce the active step to the
|
|
73
|
+
# heartbeat loop.
|
|
74
|
+
rm -f "$pgid_file" "$result_sentinel"
|
|
75
|
+
echo "$step_name" > "$ACTIVE_STEP_FILE"
|
|
76
|
+
|
|
68
77
|
# Run claude with streaming JSON output.
|
|
69
78
|
# Each line is a JSON event — we pipe through a filter that:
|
|
70
79
|
# 1. Logs human-readable summaries of what Claude is doing
|
|
71
80
|
# 2. Saves the full stream for post-processing
|
|
72
|
-
|
|
73
|
-
|
|
81
|
+
# 3. Writes ${step}_result_seen when claude emits its terminal `result`
|
|
82
|
+
# event, so the heartbeat watchdog can detect "done but pipeline open"
|
|
83
|
+
#
|
|
84
|
+
# claude is launched under `setsid` so it (and any subprocesses spawned
|
|
85
|
+
# by Bash tool calls) share a dedicated process group. If a Bash-tool
|
|
86
|
+
# subprocess later leaks (e.g. hung network call) and keeps the
|
|
87
|
+
# tee|python3 pipeline open after claude has emitted result, the
|
|
88
|
+
# heartbeat watchdog can SIGTERM the whole group cleanly via the pgid.
|
|
89
|
+
(
|
|
90
|
+
setsid "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) &
|
|
91
|
+
claude_pid=$!
|
|
92
|
+
# setsid makes the child a session/group leader; pgid == pid.
|
|
93
|
+
echo "$claude_pid" > "$pgid_file"
|
|
94
|
+
wait "$claude_pid"
|
|
95
|
+
) | tee "$output_file" | python3 -u -c "
|
|
74
96
|
import sys, json, os, time
|
|
75
97
|
log = open('$LOG_FILE', 'a', buffering=1)
|
|
76
98
|
last_tool_time = 0
|
|
@@ -112,17 +134,32 @@ for line in sys.stdin:
|
|
|
112
134
|
sid = evt.get('session_id', '')
|
|
113
135
|
if sid:
|
|
114
136
|
open('$SPRINT_DIR/.researchloop/session_id', 'w').write(sid)
|
|
137
|
+
# Mark claude's terminal result so the heartbeat watchdog can kill
|
|
138
|
+
# any leaked subprocesses keeping the pipeline open past this point.
|
|
139
|
+
try:
|
|
140
|
+
open('$result_sentinel', 'w').close()
|
|
141
|
+
except OSError:
|
|
142
|
+
pass
|
|
115
143
|
result = evt.get('result', '') or ''
|
|
116
144
|
log.write(f' Done ({tool_count} tool calls, {len(result)} chars output)\n')
|
|
117
145
|
log.flush()
|
|
118
146
|
log.close()
|
|
119
147
|
" || {
|
|
120
148
|
local rc=$?
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
149
|
+
# Treat as failure only if claude never emitted its result event. If
|
|
150
|
+
# the sentinel exists, the work is done — the nonzero exit is just
|
|
151
|
+
# the watchdog cleaning up a leaked Bash-tool subprocess.
|
|
152
|
+
if [ ! -f "$result_sentinel" ]; then
|
|
153
|
+
log "ERROR: Claude failed on step $step_name (exit $rc)"
|
|
154
|
+
tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
|
|
155
|
+
rm -f "$ACTIVE_STEP_FILE"
|
|
156
|
+
return 1
|
|
157
|
+
fi
|
|
158
|
+
log "NOTE: $step_name pipeline closed via watchdog after result event (exit $rc); treating step as complete"
|
|
124
159
|
}
|
|
125
160
|
|
|
161
|
+
rm -f "$ACTIVE_STEP_FILE"
|
|
162
|
+
|
|
126
163
|
if [ -f "$SPRINT_DIR/.researchloop/session_id" ]; then
|
|
127
164
|
SESSION_ID=$(cat "$SPRINT_DIR/.researchloop/session_id")
|
|
128
165
|
fi
|
|
@@ -204,22 +241,36 @@ print(json.dumps({
|
|
|
204
241
|
}
|
|
205
242
|
|
|
206
243
|
_heartbeat_loop() {
|
|
207
|
-
# Watchdog:
|
|
208
|
-
#
|
|
209
|
-
#
|
|
244
|
+
# Watchdog: detect and recover from hung pipelines.
|
|
245
|
+
#
|
|
246
|
+
# STUCK_PIPE warn: the active step's stream-json file goes silent for
|
|
247
|
+
# 5+ minutes while heartbeats keep firing. Signature of claude having
|
|
248
|
+
# exited but a leaked Bash-tool subprocess holding the pipeline's
|
|
249
|
+
# stdout fd open.
|
|
250
|
+
#
|
|
251
|
+
# Hung-after-result kill: claude emits a terminal `result` event when
|
|
252
|
+
# its work is done, which the stream filter records as a sentinel file.
|
|
253
|
+
# If the bash pipeline does not close within result_grace_secs after
|
|
254
|
+
# that, SIGTERM the entire claude process group via its pgid so the
|
|
255
|
+
# leaked subprocesses die and run_step can return. Escalate to SIGKILL
|
|
256
|
+
# if the group ignores SIGTERM.
|
|
210
257
|
local stuck_warned=0
|
|
211
258
|
local stuck_threshold_secs=300
|
|
259
|
+
local result_grace_secs=60
|
|
260
|
+
local kill_escalate_secs=15
|
|
261
|
+
local last_killed_pgid=""
|
|
212
262
|
while true; do
|
|
213
263
|
sleep 60
|
|
264
|
+
# Detect current step from the active_step file written by run_step.
|
|
265
|
+
local active_step=""
|
|
266
|
+
if [ -f "$SPRINT_DIR/.researchloop/active_step" ]; then
|
|
267
|
+
active_step=$(cat "$SPRINT_DIR/.researchloop/active_step" 2>/dev/null || true)
|
|
268
|
+
fi
|
|
214
269
|
local step="running"
|
|
215
|
-
if [ -
|
|
216
|
-
|
|
217
|
-
last_step=$(grep -o '>>> Starting step: [a-z_]*' "$LOG_FILE" | tail -1 | sed 's/>>> Starting step: //' || true)
|
|
218
|
-
if [ -n "$last_step" ]; then
|
|
219
|
-
step="running ($last_step)"
|
|
220
|
-
fi
|
|
270
|
+
if [ -n "$active_step" ]; then
|
|
271
|
+
step="running ($active_step)"
|
|
221
272
|
fi
|
|
222
|
-
# STUCK_PIPE watchdog (detect-and-warn
|
|
273
|
+
# STUCK_PIPE watchdog (detect-and-warn).
|
|
223
274
|
local newest_jsonl
|
|
224
275
|
newest_jsonl=$(ls -t "$SPRINT_DIR/.researchloop"/*_output.jsonl 2>/dev/null | head -1)
|
|
225
276
|
if [ -n "$newest_jsonl" ] && [ -f "$newest_jsonl" ]; then
|
|
@@ -236,6 +287,26 @@ _heartbeat_loop() {
|
|
|
236
287
|
stuck_warned=0
|
|
237
288
|
fi
|
|
238
289
|
fi
|
|
290
|
+
# Hung-after-result recovery: kill the claude pgid if the result
|
|
291
|
+
# sentinel is older than the grace period.
|
|
292
|
+
if [ -n "$active_step" ]; then
|
|
293
|
+
local sentinel="$SPRINT_DIR/.researchloop/${active_step}_result_seen"
|
|
294
|
+
local pgid_file="$SPRINT_DIR/.researchloop/${active_step}_pgid"
|
|
295
|
+
if [ -f "$sentinel" ] && [ -f "$pgid_file" ]; then
|
|
296
|
+
local s_mtime now_t s_age pgid
|
|
297
|
+
s_mtime=$(stat -c %Y "$sentinel" 2>/dev/null || stat -f %m "$sentinel" 2>/dev/null || echo 0)
|
|
298
|
+
now_t=$(date +%s)
|
|
299
|
+
s_age=$((now_t - s_mtime))
|
|
300
|
+
pgid=$(cat "$pgid_file" 2>/dev/null || true)
|
|
301
|
+
if [ -n "$pgid" ] && [ "$s_age" -ge "$result_grace_secs" ] && [ "$last_killed_pgid" != "$pgid" ]; then
|
|
302
|
+
echo "[$(date -u +%H:%M:%S)] STUCK_PIPE recovery: $active_step result event seen ${s_age}s ago but pipeline still open; SIGTERM pgid $pgid" >> "$LOG_FILE"
|
|
303
|
+
kill -TERM -"$pgid" 2>/dev/null || true
|
|
304
|
+
# Escalate to SIGKILL in case the group ignores SIGTERM.
|
|
305
|
+
( sleep "$kill_escalate_secs" && kill -KILL -"$pgid" 2>/dev/null || true ) &
|
|
306
|
+
last_killed_pgid="$pgid"
|
|
307
|
+
fi
|
|
308
|
+
fi
|
|
309
|
+
fi
|
|
239
310
|
send_heartbeat "$step"
|
|
240
311
|
echo "[$(date -u +%H:%M:%S)] ... still running (heartbeat)" >> "$LOG_FILE"
|
|
241
312
|
done
|
|
@@ -53,6 +53,7 @@ log() {
|
|
|
53
53
|
|
|
54
54
|
# --- Helper: run claude and extract session ID ---
|
|
55
55
|
SESSION_ID=""
|
|
56
|
+
ACTIVE_STEP_FILE="$SPRINT_DIR/.researchloop/active_step"
|
|
56
57
|
run_step() {
|
|
57
58
|
local prompt_file="$1"
|
|
58
59
|
local step_name="$2"
|
|
@@ -64,12 +65,33 @@ run_step() {
|
|
|
64
65
|
cmd+=(--resume "$SESSION_ID")
|
|
65
66
|
fi
|
|
66
67
|
|
|
68
|
+
local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
|
|
69
|
+
local pgid_file="$SPRINT_DIR/.researchloop/${step_name}_pgid"
|
|
70
|
+
local result_sentinel="$SPRINT_DIR/.researchloop/${step_name}_result_seen"
|
|
71
|
+
# Reset per-step watchdog state and announce the active step to the
|
|
72
|
+
# heartbeat loop.
|
|
73
|
+
rm -f "$pgid_file" "$result_sentinel"
|
|
74
|
+
echo "$step_name" > "$ACTIVE_STEP_FILE"
|
|
75
|
+
|
|
67
76
|
# Run claude with streaming JSON output.
|
|
68
77
|
# Each line is a JSON event — we pipe through a filter that:
|
|
69
78
|
# 1. Logs human-readable summaries of what Claude is doing
|
|
70
79
|
# 2. Saves the full stream for post-processing
|
|
71
|
-
|
|
72
|
-
|
|
80
|
+
# 3. Writes ${step}_result_seen when claude emits its terminal `result`
|
|
81
|
+
# event, so the heartbeat watchdog can detect "done but pipeline open"
|
|
82
|
+
#
|
|
83
|
+
# claude is launched under `setsid` so it (and any subprocesses spawned
|
|
84
|
+
# by Bash tool calls) share a dedicated process group. If a Bash-tool
|
|
85
|
+
# subprocess later leaks (e.g. hung network call) and keeps the
|
|
86
|
+
# tee|python3 pipeline open after claude has emitted result, the
|
|
87
|
+
# heartbeat watchdog can SIGTERM the whole group cleanly via the pgid.
|
|
88
|
+
(
|
|
89
|
+
setsid "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) &
|
|
90
|
+
claude_pid=$!
|
|
91
|
+
# setsid makes the child a session/group leader; pgid == pid.
|
|
92
|
+
echo "$claude_pid" > "$pgid_file"
|
|
93
|
+
wait "$claude_pid"
|
|
94
|
+
) | tee "$output_file" | python3 -u -c "
|
|
73
95
|
import sys, json, os, time
|
|
74
96
|
log = open('$LOG_FILE', 'a', buffering=1)
|
|
75
97
|
last_tool_time = 0
|
|
@@ -112,17 +134,32 @@ for line in sys.stdin:
|
|
|
112
134
|
sid = evt.get('session_id', '')
|
|
113
135
|
if sid:
|
|
114
136
|
open('$SPRINT_DIR/.researchloop/session_id', 'w').write(sid)
|
|
137
|
+
# Mark claude's terminal result so the heartbeat watchdog can kill
|
|
138
|
+
# any leaked subprocesses keeping the pipeline open past this point.
|
|
139
|
+
try:
|
|
140
|
+
open('$result_sentinel', 'w').close()
|
|
141
|
+
except OSError:
|
|
142
|
+
pass
|
|
115
143
|
result = evt.get('result', '') or ''
|
|
116
144
|
log.write(f' Done ({tool_count} tool calls, {len(result)} chars output)\n')
|
|
117
145
|
log.flush()
|
|
118
146
|
log.close()
|
|
119
147
|
" || {
|
|
120
148
|
local rc=$?
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
149
|
+
# Treat as failure only if claude never emitted its result event. If
|
|
150
|
+
# the sentinel exists, the work is done — the nonzero exit is just
|
|
151
|
+
# the watchdog cleaning up a leaked Bash-tool subprocess.
|
|
152
|
+
if [ ! -f "$result_sentinel" ]; then
|
|
153
|
+
log "ERROR: Claude failed on step $step_name (exit $rc)"
|
|
154
|
+
tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
|
|
155
|
+
rm -f "$ACTIVE_STEP_FILE"
|
|
156
|
+
return 1
|
|
157
|
+
fi
|
|
158
|
+
log "NOTE: $step_name pipeline closed via watchdog after result event (exit $rc); treating step as complete"
|
|
124
159
|
}
|
|
125
160
|
|
|
161
|
+
rm -f "$ACTIVE_STEP_FILE"
|
|
162
|
+
|
|
126
163
|
# Read session ID (written by the stream filter above).
|
|
127
164
|
if [ -f "$SPRINT_DIR/.researchloop/session_id" ]; then
|
|
128
165
|
SESSION_ID=$(cat "$SPRINT_DIR/.researchloop/session_id")
|
|
@@ -206,23 +243,36 @@ print(json.dumps({
|
|
|
206
243
|
}
|
|
207
244
|
|
|
208
245
|
_heartbeat_loop() {
|
|
209
|
-
# Watchdog:
|
|
210
|
-
#
|
|
211
|
-
#
|
|
246
|
+
# Watchdog: detect and recover from hung pipelines.
|
|
247
|
+
#
|
|
248
|
+
# STUCK_PIPE warn: the active step's stream-json file goes silent for
|
|
249
|
+
# 5+ minutes while heartbeats keep firing. Signature of claude having
|
|
250
|
+
# exited but a leaked Bash-tool subprocess holding the pipeline's
|
|
251
|
+
# stdout fd open.
|
|
252
|
+
#
|
|
253
|
+
# Hung-after-result kill: claude emits a terminal `result` event when
|
|
254
|
+
# its work is done, which the stream filter records as a sentinel file.
|
|
255
|
+
# If the bash pipeline does not close within result_grace_secs after
|
|
256
|
+
# that, SIGTERM the entire claude process group via its pgid so the
|
|
257
|
+
# leaked subprocesses die and run_step can return. Escalate to SIGKILL
|
|
258
|
+
# if the group ignores SIGTERM.
|
|
212
259
|
local stuck_warned=0
|
|
213
260
|
local stuck_threshold_secs=300
|
|
261
|
+
local result_grace_secs=60
|
|
262
|
+
local kill_escalate_secs=15
|
|
263
|
+
local last_killed_pgid=""
|
|
214
264
|
while true; do
|
|
215
265
|
sleep 60
|
|
216
|
-
# Detect current step from
|
|
266
|
+
# Detect current step from the active_step file written by run_step.
|
|
267
|
+
local active_step=""
|
|
268
|
+
if [ -f "$SPRINT_DIR/.researchloop/active_step" ]; then
|
|
269
|
+
active_step=$(cat "$SPRINT_DIR/.researchloop/active_step" 2>/dev/null || true)
|
|
270
|
+
fi
|
|
217
271
|
local step="running"
|
|
218
|
-
if [ -
|
|
219
|
-
|
|
220
|
-
last_step=$(grep -o '>>> Starting step: [a-z_]*' "$LOG_FILE" | tail -1 | sed 's/>>> Starting step: //' || true)
|
|
221
|
-
if [ -n "$last_step" ]; then
|
|
222
|
-
step="running ($last_step)"
|
|
223
|
-
fi
|
|
272
|
+
if [ -n "$active_step" ]; then
|
|
273
|
+
step="running ($active_step)"
|
|
224
274
|
fi
|
|
225
|
-
# STUCK_PIPE watchdog (detect-and-warn
|
|
275
|
+
# STUCK_PIPE watchdog (detect-and-warn).
|
|
226
276
|
local newest_jsonl
|
|
227
277
|
newest_jsonl=$(ls -t "$SPRINT_DIR/.researchloop"/*_output.jsonl 2>/dev/null | head -1)
|
|
228
278
|
if [ -n "$newest_jsonl" ] && [ -f "$newest_jsonl" ]; then
|
|
@@ -239,6 +289,26 @@ _heartbeat_loop() {
|
|
|
239
289
|
stuck_warned=0
|
|
240
290
|
fi
|
|
241
291
|
fi
|
|
292
|
+
# Hung-after-result recovery: kill the claude pgid if the result
|
|
293
|
+
# sentinel is older than the grace period.
|
|
294
|
+
if [ -n "$active_step" ]; then
|
|
295
|
+
local sentinel="$SPRINT_DIR/.researchloop/${active_step}_result_seen"
|
|
296
|
+
local pgid_file="$SPRINT_DIR/.researchloop/${active_step}_pgid"
|
|
297
|
+
if [ -f "$sentinel" ] && [ -f "$pgid_file" ]; then
|
|
298
|
+
local s_mtime now_t s_age pgid
|
|
299
|
+
s_mtime=$(stat -c %Y "$sentinel" 2>/dev/null || stat -f %m "$sentinel" 2>/dev/null || echo 0)
|
|
300
|
+
now_t=$(date +%s)
|
|
301
|
+
s_age=$((now_t - s_mtime))
|
|
302
|
+
pgid=$(cat "$pgid_file" 2>/dev/null || true)
|
|
303
|
+
if [ -n "$pgid" ] && [ "$s_age" -ge "$result_grace_secs" ] && [ "$last_killed_pgid" != "$pgid" ]; then
|
|
304
|
+
echo "[$(date -u +%H:%M:%S)] STUCK_PIPE recovery: $active_step result event seen ${s_age}s ago but pipeline still open; SIGTERM pgid $pgid" >> "$LOG_FILE"
|
|
305
|
+
kill -TERM -"$pgid" 2>/dev/null || true
|
|
306
|
+
# Escalate to SIGKILL in case the group ignores SIGTERM.
|
|
307
|
+
( sleep "$kill_escalate_secs" && kill -KILL -"$pgid" 2>/dev/null || true ) &
|
|
308
|
+
last_killed_pgid="$pgid"
|
|
309
|
+
fi
|
|
310
|
+
fi
|
|
311
|
+
fi
|
|
242
312
|
send_heartbeat "$step"
|
|
243
313
|
# Also write a heartbeat timestamp to the log so it's visible.
|
|
244
314
|
echo "[$(date -u +%H:%M:%S)] ... still running (heartbeat)" >> "$LOG_FILE"
|
|
@@ -48,12 +48,19 @@ After writing report.md, create `report.pdf` in the current directory using Pyth
|
|
|
48
48
|
- Style: remove outer edges, add subtle row shading or gridlines for readability
|
|
49
49
|
- If a table has many columns (>6), use font size 7 and/or landscape orientation for that page
|
|
50
50
|
|
|
51
|
-
**Images:**
|
|
52
|
-
-
|
|
53
|
-
-
|
|
54
|
-
|
|
51
|
+
**Images — embed vector plots as vectors, not rasters:**
|
|
52
|
+
- Plots from `results/` are typically saved as both `.png` (raster) and `.pdf` (vector). Always embed the `.pdf`. **Anything passed through `ax.imshow()` or `fig.figimage()` gets rasterized when PdfPages saves the page** — the source file's DPI is discarded, only the figure's savefig DPI matters, and the result is still raster. This is the dominant cause of blurry plots in the final report
|
|
53
|
+
- Two-pass approach to keep plots vector:
|
|
54
|
+
1. **Matplotlib pass:** for each plot, draw a placeholder rectangle (or leave whitespace) at the intended position and record its bbox in page-fraction coordinates. Render the caption normally
|
|
55
|
+
2. **PyMuPDF overlay pass:** after `PdfPages` writes `report.pdf`, open it with `fitz` and for each placeholder call `page.show_pdf_page(rect, src_doc, 0)` where `src_doc = fitz.open(plot_pdf_path)` and `rect = fitz.Rect(x0, y0, x1, y1)` is the placeholder's bbox converted to points (page-fraction × 72 × page-size-in-inches; remember that PyMuPDF's y-axis runs top-down from the page's top-left corner while matplotlib's figure fractions run bottom-up, so flip y: `y_pt = (1 − y_frac) × page-height-in-points`). Overwrite `report.pdf` with the merged result
|
|
56
|
+
- Only use `ax.imshow(img, extent=..., aspect='auto')` for genuinely raster-only sources (photos, external screenshots) where no `.pdf` companion exists. Source those at dpi=150+
|
|
57
|
+
- If `PyMuPDF` is unavailable, fall back to `pypdf`'s `merge_transformed_page` (same idea, different API) rather than rasterizing
|
|
55
58
|
- Leave space above/below for captions
|
|
56
59
|
|
|
60
|
+
**Verify the result is actually vector:**
|
|
61
|
+
- For each embedded plot's xref, `fitz.open("report.pdf").xref_get_key(xref, "Subtype")` returns a `(type, value)` tuple; it should be `('name', '/Form')`, never `('name', '/Image')`
|
|
62
|
+
- A vector-only report is typically <150 KB; if `report.pdf` is 400 KB+ you almost certainly rasterized something — re-check the overlay pass
|
|
63
|
+
|
|
57
64
|
**Pagination:**
|
|
58
65
|
- Track y_cursor starting at y_top, decrementing after each element
|
|
59
66
|
- When y_cursor < y_bottom + 0.05, save the page and start a new one
|
|
@@ -158,8 +158,36 @@ class TestJobScriptWatchdog:
|
|
|
158
158
|
# Once-per-stuck-episode flag, not log-every-heartbeat.
|
|
159
159
|
assert "stuck_warned" in script
|
|
160
160
|
|
|
161
|
+
def _assert_hung_pipeline_recovery_present(self, script: str) -> None:
|
|
162
|
+
# claude must run in its own session so the watchdog can SIGTERM the
|
|
163
|
+
# whole group (claude + any leaked Bash-tool subprocesses) by pgid.
|
|
164
|
+
assert "setsid " in script
|
|
165
|
+
# Per-step pgid + result sentinel files used by the watchdog.
|
|
166
|
+
assert "_pgid" in script
|
|
167
|
+
assert "_result_seen" in script
|
|
168
|
+
# Stream filter writes the sentinel on the terminal `result` event.
|
|
169
|
+
assert "open('$result_sentinel'" in script
|
|
170
|
+
# The watchdog escalates SIGTERM → SIGKILL against the pgid (negative
|
|
171
|
+
# number argument to kill targets a process group).
|
|
172
|
+
assert "kill -TERM -" in script
|
|
173
|
+
assert "kill -KILL -" in script
|
|
174
|
+
# Grace period between result event and pgid kill.
|
|
175
|
+
assert "result_grace_secs=60" in script
|
|
176
|
+
# Recovery messaging in the log so operators can tell when a kill ran.
|
|
177
|
+
assert "STUCK_PIPE recovery" in script
|
|
178
|
+
# run_step treats nonzero exit as success iff the sentinel exists.
|
|
179
|
+
assert 'if [ ! -f "$result_sentinel" ]; then' in script
|
|
180
|
+
# active_step file replaces the old log-grep step detection.
|
|
181
|
+
assert "ACTIVE_STEP_FILE=" in script
|
|
182
|
+
|
|
161
183
|
def test_slurm_template_includes_watchdog(self):
|
|
162
184
|
self._assert_watchdog_present(_render_job_template("slurm.sh.j2"))
|
|
163
185
|
|
|
164
186
|
def test_sge_template_includes_watchdog(self):
|
|
165
187
|
self._assert_watchdog_present(_render_job_template("sge.sh.j2"))
|
|
188
|
+
|
|
189
|
+
def test_slurm_template_includes_hung_pipeline_recovery(self):
|
|
190
|
+
self._assert_hung_pipeline_recovery_present(_render_job_template("slurm.sh.j2"))
|
|
191
|
+
|
|
192
|
+
def test_sge_template_includes_hung_pipeline_recovery(self):
|
|
193
|
+
self._assert_hung_pipeline_recovery_present(_render_job_template("sge.sh.j2"))
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.3.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/sprint_detail.html
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/study_detail.html
RENAMED
|
File without changes
|
|
File without changes
|
{researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/tweak_detail.html
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/slurm_tweak.sh.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/idea_generator.md.j2
RENAMED
|
File without changes
|
|
File without changes
|
{researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/research_sprint.md.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|