researchloop 0.3.1__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. {researchloop-0.3.1 → researchloop-0.3.3}/PKG-INFO +1 -1
  2. {researchloop-0.3.1 → researchloop-0.3.3}/pyproject.toml +1 -1
  3. researchloop-0.3.3/researchloop/__init__.py +1 -0
  4. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/job_templates/sge.sh.j2 +86 -15
  5. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/job_templates/slurm.sh.j2 +86 -16
  6. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/fix_issues.md.j2 +1 -0
  7. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/report.md.j2 +11 -4
  8. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/research_sprint.md.j2 +11 -0
  9. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_runner.py +46 -0
  10. {researchloop-0.3.1 → researchloop-0.3.3}/uv.lock +1 -1
  11. researchloop-0.3.1/researchloop/__init__.py +0 -1
  12. {researchloop-0.3.1 → researchloop-0.3.3}/.github/workflows/ci.yml +0 -0
  13. {researchloop-0.3.1 → researchloop-0.3.3}/.github/workflows/docs.yml +0 -0
  14. {researchloop-0.3.1 → researchloop-0.3.3}/.github/workflows/release.yml +0 -0
  15. {researchloop-0.3.1 → researchloop-0.3.3}/.gitignore +0 -0
  16. {researchloop-0.3.1 → researchloop-0.3.3}/CLAUDE.md +0 -0
  17. {researchloop-0.3.1 → researchloop-0.3.3}/Dockerfile +0 -0
  18. {researchloop-0.3.1 → researchloop-0.3.3}/LICENSE +0 -0
  19. {researchloop-0.3.1 → researchloop-0.3.3}/README.md +0 -0
  20. {researchloop-0.3.1 → researchloop-0.3.3}/docs/assets/mmlu-combined.gif +0 -0
  21. {researchloop-0.3.1 → researchloop-0.3.3}/docs/assets/mmlu-combined.mp4 +0 -0
  22. {researchloop-0.3.1 → researchloop-0.3.3}/docs/cli.md +0 -0
  23. {researchloop-0.3.1 → researchloop-0.3.3}/docs/configuration.md +0 -0
  24. {researchloop-0.3.1 → researchloop-0.3.3}/docs/dashboard.md +0 -0
  25. {researchloop-0.3.1 → researchloop-0.3.3}/docs/deployment.md +0 -0
  26. {researchloop-0.3.1 → researchloop-0.3.3}/docs/development.md +0 -0
  27. {researchloop-0.3.1 → researchloop-0.3.3}/docs/getting-started.md +0 -0
  28. {researchloop-0.3.1 → researchloop-0.3.3}/docs/index.md +0 -0
  29. {researchloop-0.3.1 → researchloop-0.3.3}/docs/security.md +0 -0
  30. {researchloop-0.3.1 → researchloop-0.3.3}/docs/slack.md +0 -0
  31. {researchloop-0.3.1 → researchloop-0.3.3}/mkdocs.yml +0 -0
  32. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/__main__.py +0 -0
  33. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/cli.py +0 -0
  34. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/clusters/__init__.py +0 -0
  35. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/clusters/monitor.py +0 -0
  36. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/clusters/ssh.py +0 -0
  37. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/comms/__init__.py +0 -0
  38. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/comms/base.py +0 -0
  39. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/comms/ntfy.py +0 -0
  40. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/comms/router.py +0 -0
  41. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/comms/slack.py +0 -0
  42. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/core/__init__.py +0 -0
  43. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/core/config.py +0 -0
  44. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/core/credentials.py +0 -0
  45. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/core/models.py +0 -0
  46. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/core/orchestrator.py +0 -0
  47. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/__init__.py +0 -0
  48. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/app.py +0 -0
  49. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/auth.py +0 -0
  50. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/routes.py +0 -0
  51. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/base.html +0 -0
  52. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/login.html +0 -0
  53. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/loop_detail.html +0 -0
  54. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/loops.html +0 -0
  55. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/search.html +0 -0
  56. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/setup.html +0 -0
  57. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/sprint_detail.html +0 -0
  58. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/sprints.html +0 -0
  59. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/studies.html +0 -0
  60. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/study_detail.html +0 -0
  61. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/study_form.html +0 -0
  62. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/dashboard/templates/tweak_detail.html +0 -0
  63. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/db/__init__.py +0 -0
  64. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/db/database.py +0 -0
  65. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/db/migrations.py +0 -0
  66. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/db/queries.py +0 -0
  67. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/__init__.py +0 -0
  68. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/claude.py +0 -0
  69. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/job_templates/sge_tweak.sh.j2 +0 -0
  70. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/job_templates/slurm_tweak.sh.j2 +0 -0
  71. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/main.py +0 -0
  72. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/pipeline.py +0 -0
  73. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/idea_generator.md.j2 +0 -0
  74. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/red_team.md.j2 +0 -0
  75. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/summarizer.md.j2 +0 -0
  76. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/templates/tweak.md.j2 +0 -0
  77. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/runner/upload.py +0 -0
  78. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/schedulers/__init__.py +0 -0
  79. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/schedulers/base.py +0 -0
  80. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/schedulers/local.py +0 -0
  81. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/schedulers/sge.py +0 -0
  82. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/schedulers/slurm.py +0 -0
  83. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/sprints/__init__.py +0 -0
  84. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/sprints/auto_loop.py +0 -0
  85. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/sprints/manager.py +0 -0
  86. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/studies/__init__.py +0 -0
  87. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/studies/manager.py +0 -0
  88. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/testing/__init__.py +0 -0
  89. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/testing/slack_mock.py +0 -0
  90. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop/testing/slack_simulator.py +0 -0
  91. {researchloop-0.3.1 → researchloop-0.3.3}/researchloop.toml.example +0 -0
  92. {researchloop-0.3.1 → researchloop-0.3.3}/slack-app-manifest.yml +0 -0
  93. {researchloop-0.3.1 → researchloop-0.3.3}/tests/__init__.py +0 -0
  94. {researchloop-0.3.1 → researchloop-0.3.3}/tests/conftest.py +0 -0
  95. {researchloop-0.3.1 → researchloop-0.3.3}/tests/docker/sge/Dockerfile +0 -0
  96. {researchloop-0.3.1 → researchloop-0.3.3}/tests/docker/sge/entrypoint.sh +0 -0
  97. {researchloop-0.3.1 → researchloop-0.3.3}/tests/docker/sge/mock_claude.sh +0 -0
  98. {researchloop-0.3.1 → researchloop-0.3.3}/tests/docker/slurm/Dockerfile +0 -0
  99. {researchloop-0.3.1 → researchloop-0.3.3}/tests/docker/slurm/entrypoint.sh +0 -0
  100. {researchloop-0.3.1 → researchloop-0.3.3}/tests/docker/slurm/mock_claude.sh +0 -0
  101. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/__init__.py +0 -0
  102. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/conftest.py +0 -0
  103. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/test_loop_advancement.py +0 -0
  104. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/test_loop_and_monitor.py +0 -0
  105. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/test_sge_scheduler.py +0 -0
  106. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/test_slurm_scheduler.py +0 -0
  107. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/test_sprint_slurm.py +0 -0
  108. {researchloop-0.3.1 → researchloop-0.3.3}/tests/integration/test_webhook_and_refresh.py +0 -0
  109. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_api.py +0 -0
  110. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_auto_loop.py +0 -0
  111. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_cli.py +0 -0
  112. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_config.py +0 -0
  113. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_dashboard.py +0 -0
  114. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_database.py +0 -0
  115. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_models.py +0 -0
  116. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_notification.py +0 -0
  117. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_queries.py +0 -0
  118. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_schedulers.py +0 -0
  119. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_search.py +0 -0
  120. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_sge.py +0 -0
  121. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_slack.py +0 -0
  122. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_slack_events.py +0 -0
  123. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_slack_mock.py +0 -0
  124. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_slack_simulator.py +0 -0
  125. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_sprint_manager.py +0 -0
  126. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_study_manager.py +0 -0
  127. {researchloop-0.3.1 → researchloop-0.3.3}/tests/test_tweaks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: researchloop
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Automated research sprint platform for HPC clusters
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "researchloop"
3
- version = "0.3.1"
3
+ version = "0.3.3"
4
4
  description = "Automated research sprint platform for HPC clusters"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -0,0 +1 @@
1
+ __version__ = "0.3.3"
@@ -54,6 +54,7 @@ log() {
54
54
 
55
55
  # --- Helper: run claude and extract session ID ---
56
56
  SESSION_ID=""
57
+ ACTIVE_STEP_FILE="$SPRINT_DIR/.researchloop/active_step"
57
58
  run_step() {
58
59
  local prompt_file="$1"
59
60
  local step_name="$2"
@@ -65,12 +66,33 @@ run_step() {
65
66
  cmd+=(--resume "$SESSION_ID")
66
67
  fi
67
68
 
69
+ local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
70
+ local pgid_file="$SPRINT_DIR/.researchloop/${step_name}_pgid"
71
+ local result_sentinel="$SPRINT_DIR/.researchloop/${step_name}_result_seen"
72
+ # Reset per-step watchdog state and announce the active step to the
73
+ # heartbeat loop.
74
+ rm -f "$pgid_file" "$result_sentinel"
75
+ echo "$step_name" > "$ACTIVE_STEP_FILE"
76
+
68
77
  # Run claude with streaming JSON output.
69
78
  # Each line is a JSON event — we pipe through a filter that:
70
79
  # 1. Logs human-readable summaries of what Claude is doing
71
80
  # 2. Saves the full stream for post-processing
72
- local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
73
- "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) | tee "$output_file" | python3 -u -c "
81
+ # 3. Writes ${step}_result_seen when claude emits its terminal `result`
82
+ # event, so the heartbeat watchdog can detect "done but pipeline open"
83
+ #
84
+ # claude is launched under `setsid` so it (and any subprocesses spawned
85
+ # by Bash tool calls) share a dedicated process group. If a Bash-tool
86
+ # subprocess later leaks (e.g. hung network call) and keeps the
87
+ # tee|python3 pipeline open after claude has emitted result, the
88
+ # heartbeat watchdog can SIGTERM the whole group cleanly via the pgid.
89
+ (
90
+ setsid "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) &
91
+ claude_pid=$!
92
+ # setsid makes the child a session/group leader; pgid == pid.
93
+ echo "$claude_pid" > "$pgid_file"
94
+ wait "$claude_pid"
95
+ ) | tee "$output_file" | python3 -u -c "
74
96
  import sys, json, os, time
75
97
  log = open('$LOG_FILE', 'a', buffering=1)
76
98
  last_tool_time = 0
@@ -112,17 +134,32 @@ for line in sys.stdin:
112
134
  sid = evt.get('session_id', '')
113
135
  if sid:
114
136
  open('$SPRINT_DIR/.researchloop/session_id', 'w').write(sid)
137
+ # Mark claude's terminal result so the heartbeat watchdog can kill
138
+ # any leaked subprocesses keeping the pipeline open past this point.
139
+ try:
140
+ open('$result_sentinel', 'w').close()
141
+ except OSError:
142
+ pass
115
143
  result = evt.get('result', '') or ''
116
144
  log.write(f' Done ({tool_count} tool calls, {len(result)} chars output)\n')
117
145
  log.flush()
118
146
  log.close()
119
147
  " || {
120
148
  local rc=$?
121
- log "ERROR: Claude failed on step $step_name (exit $rc)"
122
- tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
123
- return 1
149
+ # Treat as failure only if claude never emitted its result event. If
150
+ # the sentinel exists, the work is done — the nonzero exit is just
151
+ # the watchdog cleaning up a leaked Bash-tool subprocess.
152
+ if [ ! -f "$result_sentinel" ]; then
153
+ log "ERROR: Claude failed on step $step_name (exit $rc)"
154
+ tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
155
+ rm -f "$ACTIVE_STEP_FILE"
156
+ return 1
157
+ fi
158
+ log "NOTE: $step_name pipeline closed via watchdog after result event (exit $rc); treating step as complete"
124
159
  }
125
160
 
161
+ rm -f "$ACTIVE_STEP_FILE"
162
+
126
163
  if [ -f "$SPRINT_DIR/.researchloop/session_id" ]; then
127
164
  SESSION_ID=$(cat "$SPRINT_DIR/.researchloop/session_id")
128
165
  fi
@@ -204,22 +241,36 @@ print(json.dumps({
204
241
  }
205
242
 
206
243
  _heartbeat_loop() {
207
- # Watchdog: warn once if the active step's stream-json file goes silent
208
- # for too long while heartbeats keep firing. Signature of a hung pipeline
209
- # (claude exited but a leaked fd keeps tee|python3 blocked).
244
+ # Watchdog: detect and recover from hung pipelines.
245
+ #
246
+ # STUCK_PIPE warn: the active step's stream-json file goes silent for
247
+ # 5+ minutes while heartbeats keep firing. Signature of claude having
248
+ # exited but a leaked Bash-tool subprocess holding the pipeline's
249
+ # stdout fd open.
250
+ #
251
+ # Hung-after-result kill: claude emits a terminal `result` event when
252
+ # its work is done, which the stream filter records as a sentinel file.
253
+ # If the bash pipeline does not close within result_grace_secs after
254
+ # that, SIGTERM the entire claude process group via its pgid so the
255
+ # leaked subprocesses die and run_step can return. Escalate to SIGKILL
256
+ # if the group ignores SIGTERM.
210
257
  local stuck_warned=0
211
258
  local stuck_threshold_secs=300
259
+ local result_grace_secs=60
260
+ local kill_escalate_secs=15
261
+ local last_killed_pgid=""
212
262
  while true; do
213
263
  sleep 60
264
+ # Detect current step from the active_step file written by run_step.
265
+ local active_step=""
266
+ if [ -f "$SPRINT_DIR/.researchloop/active_step" ]; then
267
+ active_step=$(cat "$SPRINT_DIR/.researchloop/active_step" 2>/dev/null || true)
268
+ fi
214
269
  local step="running"
215
- if [ -f "$LOG_FILE" ]; then
216
- local last_step
217
- last_step=$(grep -o '>>> Starting step: [a-z_]*' "$LOG_FILE" | tail -1 | sed 's/>>> Starting step: //' || true)
218
- if [ -n "$last_step" ]; then
219
- step="running ($last_step)"
220
- fi
270
+ if [ -n "$active_step" ]; then
271
+ step="running ($active_step)"
221
272
  fi
222
- # STUCK_PIPE watchdog (detect-and-warn only).
273
+ # STUCK_PIPE watchdog (detect-and-warn).
223
274
  local newest_jsonl
224
275
  newest_jsonl=$(ls -t "$SPRINT_DIR/.researchloop"/*_output.jsonl 2>/dev/null | head -1)
225
276
  if [ -n "$newest_jsonl" ] && [ -f "$newest_jsonl" ]; then
@@ -236,6 +287,26 @@ _heartbeat_loop() {
236
287
  stuck_warned=0
237
288
  fi
238
289
  fi
290
+ # Hung-after-result recovery: kill the claude pgid if the result
291
+ # sentinel is older than the grace period.
292
+ if [ -n "$active_step" ]; then
293
+ local sentinel="$SPRINT_DIR/.researchloop/${active_step}_result_seen"
294
+ local pgid_file="$SPRINT_DIR/.researchloop/${active_step}_pgid"
295
+ if [ -f "$sentinel" ] && [ -f "$pgid_file" ]; then
296
+ local s_mtime now_t s_age pgid
297
+ s_mtime=$(stat -c %Y "$sentinel" 2>/dev/null || stat -f %m "$sentinel" 2>/dev/null || echo 0)
298
+ now_t=$(date +%s)
299
+ s_age=$((now_t - s_mtime))
300
+ pgid=$(cat "$pgid_file" 2>/dev/null || true)
301
+ if [ -n "$pgid" ] && [ "$s_age" -ge "$result_grace_secs" ] && [ "$last_killed_pgid" != "$pgid" ]; then
302
+ echo "[$(date -u +%H:%M:%S)] STUCK_PIPE recovery: $active_step result event seen ${s_age}s ago but pipeline still open; SIGTERM pgid $pgid" >> "$LOG_FILE"
303
+ kill -TERM -"$pgid" 2>/dev/null || true
304
+ # Escalate to SIGKILL in case the group ignores SIGTERM.
305
+ ( sleep "$kill_escalate_secs" && kill -KILL -"$pgid" 2>/dev/null || true ) &
306
+ last_killed_pgid="$pgid"
307
+ fi
308
+ fi
309
+ fi
239
310
  send_heartbeat "$step"
240
311
  echo "[$(date -u +%H:%M:%S)] ... still running (heartbeat)" >> "$LOG_FILE"
241
312
  done
@@ -53,6 +53,7 @@ log() {
53
53
 
54
54
  # --- Helper: run claude and extract session ID ---
55
55
  SESSION_ID=""
56
+ ACTIVE_STEP_FILE="$SPRINT_DIR/.researchloop/active_step"
56
57
  run_step() {
57
58
  local prompt_file="$1"
58
59
  local step_name="$2"
@@ -64,12 +65,33 @@ run_step() {
64
65
  cmd+=(--resume "$SESSION_ID")
65
66
  fi
66
67
 
68
+ local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
69
+ local pgid_file="$SPRINT_DIR/.researchloop/${step_name}_pgid"
70
+ local result_sentinel="$SPRINT_DIR/.researchloop/${step_name}_result_seen"
71
+ # Reset per-step watchdog state and announce the active step to the
72
+ # heartbeat loop.
73
+ rm -f "$pgid_file" "$result_sentinel"
74
+ echo "$step_name" > "$ACTIVE_STEP_FILE"
75
+
67
76
  # Run claude with streaming JSON output.
68
77
  # Each line is a JSON event — we pipe through a filter that:
69
78
  # 1. Logs human-readable summaries of what Claude is doing
70
79
  # 2. Saves the full stream for post-processing
71
- local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
72
- "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) | tee "$output_file" | python3 -u -c "
80
+ # 3. Writes ${step}_result_seen when claude emits its terminal `result`
81
+ # event, so the heartbeat watchdog can detect "done but pipeline open"
82
+ #
83
+ # claude is launched under `setsid` so it (and any subprocesses spawned
84
+ # by Bash tool calls) share a dedicated process group. If a Bash-tool
85
+ # subprocess later leaks (e.g. hung network call) and keeps the
86
+ # tee|python3 pipeline open after claude has emitted result, the
87
+ # heartbeat watchdog can SIGTERM the whole group cleanly via the pgid.
88
+ (
89
+ setsid "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) &
90
+ claude_pid=$!
91
+ # setsid makes the child a session/group leader; pgid == pid.
92
+ echo "$claude_pid" > "$pgid_file"
93
+ wait "$claude_pid"
94
+ ) | tee "$output_file" | python3 -u -c "
73
95
  import sys, json, os, time
74
96
  log = open('$LOG_FILE', 'a', buffering=1)
75
97
  last_tool_time = 0
@@ -112,17 +134,32 @@ for line in sys.stdin:
112
134
  sid = evt.get('session_id', '')
113
135
  if sid:
114
136
  open('$SPRINT_DIR/.researchloop/session_id', 'w').write(sid)
137
+ # Mark claude's terminal result so the heartbeat watchdog can kill
138
+ # any leaked subprocesses keeping the pipeline open past this point.
139
+ try:
140
+ open('$result_sentinel', 'w').close()
141
+ except OSError:
142
+ pass
115
143
  result = evt.get('result', '') or ''
116
144
  log.write(f' Done ({tool_count} tool calls, {len(result)} chars output)\n')
117
145
  log.flush()
118
146
  log.close()
119
147
  " || {
120
148
  local rc=$?
121
- log "ERROR: Claude failed on step $step_name (exit $rc)"
122
- tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
123
- return 1
149
+ # Treat as failure only if claude never emitted its result event. If
150
+ # the sentinel exists, the work is done — the nonzero exit is just
151
+ # the watchdog cleaning up a leaked Bash-tool subprocess.
152
+ if [ ! -f "$result_sentinel" ]; then
153
+ log "ERROR: Claude failed on step $step_name (exit $rc)"
154
+ tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
155
+ rm -f "$ACTIVE_STEP_FILE"
156
+ return 1
157
+ fi
158
+ log "NOTE: $step_name pipeline closed via watchdog after result event (exit $rc); treating step as complete"
124
159
  }
125
160
 
161
+ rm -f "$ACTIVE_STEP_FILE"
162
+
126
163
  # Read session ID (written by the stream filter above).
127
164
  if [ -f "$SPRINT_DIR/.researchloop/session_id" ]; then
128
165
  SESSION_ID=$(cat "$SPRINT_DIR/.researchloop/session_id")
@@ -206,23 +243,36 @@ print(json.dumps({
206
243
  }
207
244
 
208
245
  _heartbeat_loop() {
209
- # Watchdog: warn once if the active step's stream-json file goes silent
210
- # for too long while heartbeats keep firing. Signature of a hung pipeline
211
- # (claude exited but a leaked fd keeps tee|python3 blocked).
246
+ # Watchdog: detect and recover from hung pipelines.
247
+ #
248
+ # STUCK_PIPE warn: the active step's stream-json file goes silent for
249
+ # 5+ minutes while heartbeats keep firing. Signature of claude having
250
+ # exited but a leaked Bash-tool subprocess holding the pipeline's
251
+ # stdout fd open.
252
+ #
253
+ # Hung-after-result kill: claude emits a terminal `result` event when
254
+ # its work is done, which the stream filter records as a sentinel file.
255
+ # If the bash pipeline does not close within result_grace_secs after
256
+ # that, SIGTERM the entire claude process group via its pgid so the
257
+ # leaked subprocesses die and run_step can return. Escalate to SIGKILL
258
+ # if the group ignores SIGTERM.
212
259
  local stuck_warned=0
213
260
  local stuck_threshold_secs=300
261
+ local result_grace_secs=60
262
+ local kill_escalate_secs=15
263
+ local last_killed_pgid=""
214
264
  while true; do
215
265
  sleep 60
216
- # Detect current step from log.
266
+ # Detect current step from the active_step file written by run_step.
267
+ local active_step=""
268
+ if [ -f "$SPRINT_DIR/.researchloop/active_step" ]; then
269
+ active_step=$(cat "$SPRINT_DIR/.researchloop/active_step" 2>/dev/null || true)
270
+ fi
217
271
  local step="running"
218
- if [ -f "$LOG_FILE" ]; then
219
- local last_step
220
- last_step=$(grep -o '>>> Starting step: [a-z_]*' "$LOG_FILE" | tail -1 | sed 's/>>> Starting step: //' || true)
221
- if [ -n "$last_step" ]; then
222
- step="running ($last_step)"
223
- fi
272
+ if [ -n "$active_step" ]; then
273
+ step="running ($active_step)"
224
274
  fi
225
- # STUCK_PIPE watchdog (detect-and-warn only).
275
+ # STUCK_PIPE watchdog (detect-and-warn).
226
276
  local newest_jsonl
227
277
  newest_jsonl=$(ls -t "$SPRINT_DIR/.researchloop"/*_output.jsonl 2>/dev/null | head -1)
228
278
  if [ -n "$newest_jsonl" ] && [ -f "$newest_jsonl" ]; then
@@ -239,6 +289,26 @@ _heartbeat_loop() {
239
289
  stuck_warned=0
240
290
  fi
241
291
  fi
292
+ # Hung-after-result recovery: kill the claude pgid if the result
293
+ # sentinel is older than the grace period.
294
+ if [ -n "$active_step" ]; then
295
+ local sentinel="$SPRINT_DIR/.researchloop/${active_step}_result_seen"
296
+ local pgid_file="$SPRINT_DIR/.researchloop/${active_step}_pgid"
297
+ if [ -f "$sentinel" ] && [ -f "$pgid_file" ]; then
298
+ local s_mtime now_t s_age pgid
299
+ s_mtime=$(stat -c %Y "$sentinel" 2>/dev/null || stat -f %m "$sentinel" 2>/dev/null || echo 0)
300
+ now_t=$(date +%s)
301
+ s_age=$((now_t - s_mtime))
302
+ pgid=$(cat "$pgid_file" 2>/dev/null || true)
303
+ if [ -n "$pgid" ] && [ "$s_age" -ge "$result_grace_secs" ] && [ "$last_killed_pgid" != "$pgid" ]; then
304
+ echo "[$(date -u +%H:%M:%S)] STUCK_PIPE recovery: $active_step result event seen ${s_age}s ago but pipeline still open; SIGTERM pgid $pgid" >> "$LOG_FILE"
305
+ kill -TERM -"$pgid" 2>/dev/null || true
306
+ # Escalate to SIGKILL in case the group ignores SIGTERM.
307
+ ( sleep "$kill_escalate_secs" && kill -KILL -"$pgid" 2>/dev/null || true ) &
308
+ last_killed_pgid="$pgid"
309
+ fi
310
+ fi
311
+ fi
242
312
  send_heartbeat "$step"
243
313
  # Also write a heartbeat timestamp to the log so it's visible.
244
314
  echo "[$(date -u +%H:%M:%S)] ... still running (heartbeat)" >> "$LOG_FILE"
@@ -9,3 +9,4 @@ Review the file: red_team_round_{{ round_number }}.md
9
9
  - Document what you changed and why in fixes_round_{{ round_number }}.md
10
10
  - Do not break existing working functionality
11
11
  - Update progress.md with what you're fixing and the results
12
+ - Run any re-training or re-evaluation commands synchronously in the foreground. NEVER pass `run_in_background: true` on a Bash tool call, and do NOT use `nohup`, `disown`, `setsid`, or `&` to detach work — this is a one-shot `claude -p` session and any orphaned subprocess will be killed when you end your turn.
@@ -48,12 +48,19 @@ After writing report.md, create `report.pdf` in the current directory using Pyth
48
48
  - Style: remove outer edges, add subtle row shading or gridlines for readability
49
49
  - If a table has many columns (>6), use font size 7 and/or landscape orientation for that page
50
50
 
51
- **Images:**
52
- - When generating plots earlier in the pipeline, save at **dpi=150 or higher** so text in plots is sharp in the PDF
53
- - When embedding in the PDF figure, use `ax.imshow(img, extent=[x_left, x_right, y_low, y_high], aspect='auto')` or insert via `fig.figimage()`. Scale to fit within margins while preserving aspect ratio
54
- - Do NOT downscale high-res images — matplotlib's PdfPages will embed them at their native resolution
51
+ **Images — embed vector plots as vectors, not rasters:**
52
+ - Plots from `results/` are typically saved as both `.png` (raster) and `.pdf` (vector). Always embed the `.pdf`. **Anything passed through `ax.imshow()` or `fig.figimage()` gets rasterized when PdfPages saves the page** — the source file's DPI is discarded, only the figure's savefig DPI matters, and the result is still raster. This is the dominant cause of blurry plots in the final report
53
+ - Two-pass approach to keep plots vector:
54
+ 1. **Matplotlib pass:** for each plot, draw a placeholder rectangle (or leave whitespace) at the intended position and record its bbox in page-fraction coordinates. Render the caption normally
55
+ 2. **PyMuPDF overlay pass:** after `PdfPages` writes `report.pdf`, open it with `fitz` and for each placeholder call `page.show_pdf_page(rect, src_doc, 0)` where `src_doc = fitz.open(plot_pdf_path)` and `rect = fitz.Rect(x0, y0, x1, y1)` is the placeholder's bbox converted to points (page-fraction × 72 × page-size-in-inches; remember that PyMuPDF's y-axis runs top-down while matplotlib's page fractions run bottom-up, so flip y: `y_pt = (1 - y_frac) × page_height_pt`). Overwrite `report.pdf` with the merged result
56
+ - Only use `ax.imshow(img, extent=..., aspect='auto')` for genuinely raster-only sources (photos, external screenshots) where no `.pdf` companion exists. Source those at dpi=150+
57
+ - If `PyMuPDF` is unavailable, fall back to `pypdf`'s `merge_transformed_page` (same idea, different API) rather than rasterizing
55
58
  - Leave space above/below for captions
56
59
 
60
+ **Verify the result is actually vector:**
61
+ - For each embedded plot's xref, `fitz.open("report.pdf").xref_get_key(xref, "Subtype")` should return `/Form`, never `/Image`
62
+ - A vector-only report is typically <150 KB; if `report.pdf` is 400 KB+ you almost certainly rasterized something — re-check the overlay pass
63
+
57
64
  **Pagination:**
58
65
  - Track y_cursor starting at y_top, decrementing after each element
59
66
  - When y_cursor < y_bottom + 0.05, save the page and start a new one
@@ -23,6 +23,17 @@ python train.py 2>&1 | tee -a output.log
23
23
 
24
24
  This lets the team monitor script output remotely. Always use `tee -a` (append mode) so all runs accumulate in the same log file. Do this for every script execution, training run, or evaluation.
25
25
 
26
+ ## Long-running commands — run synchronously, NEVER in the background
27
+ This sprint runs as a single one-shot `claude -p` invocation. There is no notification mechanism, no resumed turn, and no way to "wait and be notified". The moment you end your turn, the session is over and the sprint runner kills any detached subprocesses (training, evaluation, anything you started).
28
+
29
+ Rules:
30
+ - NEVER pass `run_in_background: true` on a Bash tool call. Always run training and evaluation in the foreground.
31
+ - NEVER use `nohup`, `disown`, `setsid`, a trailing `&`, or any other shell-level backgrounding for work you care about. The orphan will be killed. (Plain redirections such as `2>&1` or `&>` are fine — they do not background anything.)
32
+ - If a single Bash call needs to run longer than its default timeout, raise the per-call `timeout` parameter instead of backgrounding it.
33
+ - The `PushNotification`, `Monitor`, `Cron*`, `Task`, `AskUserQuestion`, `EnterPlanMode`, and `EnterWorktree` tools are NOT available in this one-shot mode. Ignore them if they appear in your tool list.
34
+
35
+ Plan training as a series of synchronous foreground commands. Wait for each to finish before ending your turn.
36
+
26
37
  ## Progress Log
27
38
  Maintain a file called `progress.md` in the sprint directory. Update it regularly as you work — it's how the team monitors your progress remotely. Keep it concise and current:
28
39
 
@@ -102,6 +102,24 @@ class TestRenderTemplate:
102
102
  )
103
103
  assert "progress.md" in output
104
104
 
105
+ def test_research_template_forbids_backgrounding(self):
106
+ """claude -p is one-shot — backgrounded subprocesses get orphaned and
107
+ killed by the runner's pipeline-cleanup watchdog. The template must
108
+ tell claude not to use run_in_background or shell-level detach."""
109
+ output = render_template(
110
+ "research_sprint.md.j2",
111
+ study_context="Study context",
112
+ idea="test idea",
113
+ sprint_dir="/tmp/sprint",
114
+ )
115
+ assert "run_in_background" in output
116
+ assert "nohup" in output
117
+ assert "one-shot" in output
118
+
119
+ def test_fix_template_forbids_backgrounding(self):
120
+ output = render_template("fix_issues.md.j2", round_number=1)
121
+ assert "run_in_background" in output
122
+
105
123
  def test_red_team_template(self):
106
124
  output = render_template(
107
125
  "red_team.md.j2",
@@ -158,8 +176,36 @@ class TestJobScriptWatchdog:
158
176
  # Once-per-stuck-episode flag, not log-every-heartbeat.
159
177
  assert "stuck_warned" in script
160
178
 
179
+ def _assert_hung_pipeline_recovery_present(self, script: str) -> None:
180
+ # claude must run in its own session so the watchdog can SIGTERM the
181
+ # whole group (claude + any leaked Bash-tool subprocesses) by pgid.
182
+ assert "setsid " in script
183
+ # Per-step pgid + result sentinel files used by the watchdog.
184
+ assert "_pgid" in script
185
+ assert "_result_seen" in script
186
+ # Stream filter writes the sentinel on the terminal `result` event.
187
+ assert "open('$result_sentinel'" in script
188
+ # The watchdog escalates SIGTERM → SIGKILL against the pgid (negative
189
+ # number argument to kill targets a process group).
190
+ assert "kill -TERM -" in script
191
+ assert "kill -KILL -" in script
192
+ # Grace period between result event and pgid kill.
193
+ assert "result_grace_secs=60" in script
194
+ # Recovery messaging in the log so operators can tell when a kill ran.
195
+ assert "STUCK_PIPE recovery" in script
196
+ # run_step treats nonzero exit as success iff the sentinel exists.
197
+ assert 'if [ ! -f "$result_sentinel" ]; then' in script
198
+ # active_step file replaces the old log-grep step detection.
199
+ assert "ACTIVE_STEP_FILE=" in script
200
+
161
201
  def test_slurm_template_includes_watchdog(self):
162
202
  self._assert_watchdog_present(_render_job_template("slurm.sh.j2"))
163
203
 
164
204
  def test_sge_template_includes_watchdog(self):
165
205
  self._assert_watchdog_present(_render_job_template("sge.sh.j2"))
206
+
207
+ def test_slurm_template_includes_hung_pipeline_recovery(self):
208
+ self._assert_hung_pipeline_recovery_present(_render_job_template("slurm.sh.j2"))
209
+
210
+ def test_sge_template_includes_hung_pipeline_recovery(self):
211
+ self._assert_hung_pipeline_recovery_present(_render_job_template("sge.sh.j2"))
@@ -1152,7 +1152,7 @@ wheels = [
1152
1152
 
1153
1153
  [[package]]
1154
1154
  name = "researchloop"
1155
- version = "0.3.1"
1155
+ version = "0.3.3"
1156
1156
  source = { editable = "." }
1157
1157
  dependencies = [
1158
1158
  { name = "aiosqlite" },
@@ -1 +0,0 @@
1
- __version__ = "0.3.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes