researchloop 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. {researchloop-0.3.1 → researchloop-0.3.2}/PKG-INFO +1 -1
  2. {researchloop-0.3.1 → researchloop-0.3.2}/pyproject.toml +1 -1
  3. researchloop-0.3.2/researchloop/__init__.py +1 -0
  4. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/sge.sh.j2 +86 -15
  5. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/slurm.sh.j2 +86 -16
  6. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/report.md.j2 +11 -4
  7. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_runner.py +28 -0
  8. {researchloop-0.3.1 → researchloop-0.3.2}/uv.lock +1 -1
  9. researchloop-0.3.1/researchloop/__init__.py +0 -1
  10. {researchloop-0.3.1 → researchloop-0.3.2}/.github/workflows/ci.yml +0 -0
  11. {researchloop-0.3.1 → researchloop-0.3.2}/.github/workflows/docs.yml +0 -0
  12. {researchloop-0.3.1 → researchloop-0.3.2}/.github/workflows/release.yml +0 -0
  13. {researchloop-0.3.1 → researchloop-0.3.2}/.gitignore +0 -0
  14. {researchloop-0.3.1 → researchloop-0.3.2}/CLAUDE.md +0 -0
  15. {researchloop-0.3.1 → researchloop-0.3.2}/Dockerfile +0 -0
  16. {researchloop-0.3.1 → researchloop-0.3.2}/LICENSE +0 -0
  17. {researchloop-0.3.1 → researchloop-0.3.2}/README.md +0 -0
  18. {researchloop-0.3.1 → researchloop-0.3.2}/docs/assets/mmlu-combined.gif +0 -0
  19. {researchloop-0.3.1 → researchloop-0.3.2}/docs/assets/mmlu-combined.mp4 +0 -0
  20. {researchloop-0.3.1 → researchloop-0.3.2}/docs/cli.md +0 -0
  21. {researchloop-0.3.1 → researchloop-0.3.2}/docs/configuration.md +0 -0
  22. {researchloop-0.3.1 → researchloop-0.3.2}/docs/dashboard.md +0 -0
  23. {researchloop-0.3.1 → researchloop-0.3.2}/docs/deployment.md +0 -0
  24. {researchloop-0.3.1 → researchloop-0.3.2}/docs/development.md +0 -0
  25. {researchloop-0.3.1 → researchloop-0.3.2}/docs/getting-started.md +0 -0
  26. {researchloop-0.3.1 → researchloop-0.3.2}/docs/index.md +0 -0
  27. {researchloop-0.3.1 → researchloop-0.3.2}/docs/security.md +0 -0
  28. {researchloop-0.3.1 → researchloop-0.3.2}/docs/slack.md +0 -0
  29. {researchloop-0.3.1 → researchloop-0.3.2}/mkdocs.yml +0 -0
  30. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/__main__.py +0 -0
  31. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/cli.py +0 -0
  32. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/clusters/__init__.py +0 -0
  33. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/clusters/monitor.py +0 -0
  34. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/clusters/ssh.py +0 -0
  35. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/__init__.py +0 -0
  36. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/base.py +0 -0
  37. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/ntfy.py +0 -0
  38. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/router.py +0 -0
  39. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/comms/slack.py +0 -0
  40. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/__init__.py +0 -0
  41. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/config.py +0 -0
  42. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/credentials.py +0 -0
  43. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/models.py +0 -0
  44. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/core/orchestrator.py +0 -0
  45. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/__init__.py +0 -0
  46. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/app.py +0 -0
  47. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/auth.py +0 -0
  48. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/routes.py +0 -0
  49. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/base.html +0 -0
  50. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/login.html +0 -0
  51. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/loop_detail.html +0 -0
  52. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/loops.html +0 -0
  53. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/search.html +0 -0
  54. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/setup.html +0 -0
  55. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/sprint_detail.html +0 -0
  56. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/sprints.html +0 -0
  57. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/studies.html +0 -0
  58. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/study_detail.html +0 -0
  59. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/study_form.html +0 -0
  60. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/dashboard/templates/tweak_detail.html +0 -0
  61. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/__init__.py +0 -0
  62. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/database.py +0 -0
  63. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/migrations.py +0 -0
  64. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/db/queries.py +0 -0
  65. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/__init__.py +0 -0
  66. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/claude.py +0 -0
  67. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/sge_tweak.sh.j2 +0 -0
  68. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/job_templates/slurm_tweak.sh.j2 +0 -0
  69. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/main.py +0 -0
  70. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/pipeline.py +0 -0
  71. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/fix_issues.md.j2 +0 -0
  72. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/idea_generator.md.j2 +0 -0
  73. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/red_team.md.j2 +0 -0
  74. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/research_sprint.md.j2 +0 -0
  75. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/summarizer.md.j2 +0 -0
  76. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/templates/tweak.md.j2 +0 -0
  77. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/runner/upload.py +0 -0
  78. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/__init__.py +0 -0
  79. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/base.py +0 -0
  80. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/local.py +0 -0
  81. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/sge.py +0 -0
  82. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/schedulers/slurm.py +0 -0
  83. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/sprints/__init__.py +0 -0
  84. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/sprints/auto_loop.py +0 -0
  85. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/sprints/manager.py +0 -0
  86. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/studies/__init__.py +0 -0
  87. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/studies/manager.py +0 -0
  88. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/testing/__init__.py +0 -0
  89. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/testing/slack_mock.py +0 -0
  90. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop/testing/slack_simulator.py +0 -0
  91. {researchloop-0.3.1 → researchloop-0.3.2}/researchloop.toml.example +0 -0
  92. {researchloop-0.3.1 → researchloop-0.3.2}/slack-app-manifest.yml +0 -0
  93. {researchloop-0.3.1 → researchloop-0.3.2}/tests/__init__.py +0 -0
  94. {researchloop-0.3.1 → researchloop-0.3.2}/tests/conftest.py +0 -0
  95. {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/sge/Dockerfile +0 -0
  96. {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/sge/entrypoint.sh +0 -0
  97. {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/sge/mock_claude.sh +0 -0
  98. {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/slurm/Dockerfile +0 -0
  99. {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/slurm/entrypoint.sh +0 -0
  100. {researchloop-0.3.1 → researchloop-0.3.2}/tests/docker/slurm/mock_claude.sh +0 -0
  101. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/__init__.py +0 -0
  102. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/conftest.py +0 -0
  103. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_loop_advancement.py +0 -0
  104. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_loop_and_monitor.py +0 -0
  105. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_sge_scheduler.py +0 -0
  106. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_slurm_scheduler.py +0 -0
  107. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_sprint_slurm.py +0 -0
  108. {researchloop-0.3.1 → researchloop-0.3.2}/tests/integration/test_webhook_and_refresh.py +0 -0
  109. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_api.py +0 -0
  110. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_auto_loop.py +0 -0
  111. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_cli.py +0 -0
  112. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_config.py +0 -0
  113. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_dashboard.py +0 -0
  114. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_database.py +0 -0
  115. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_models.py +0 -0
  116. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_notification.py +0 -0
  117. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_queries.py +0 -0
  118. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_schedulers.py +0 -0
  119. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_search.py +0 -0
  120. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_sge.py +0 -0
  121. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack.py +0 -0
  122. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack_events.py +0 -0
  123. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack_mock.py +0 -0
  124. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_slack_simulator.py +0 -0
  125. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_sprint_manager.py +0 -0
  126. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_study_manager.py +0 -0
  127. {researchloop-0.3.1 → researchloop-0.3.2}/tests/test_tweaks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: researchloop
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Automated research sprint platform for HPC clusters
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "researchloop"
3
- version = "0.3.1"
3
+ version = "0.3.2"
4
4
  description = "Automated research sprint platform for HPC clusters"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}
@@ -0,0 +1 @@
1
+ __version__ = "0.3.2"
@@ -54,6 +54,7 @@ log() {
54
54
 
55
55
  # --- Helper: run claude and extract session ID ---
56
56
  SESSION_ID=""
57
+ ACTIVE_STEP_FILE="$SPRINT_DIR/.researchloop/active_step"
57
58
  run_step() {
58
59
  local prompt_file="$1"
59
60
  local step_name="$2"
@@ -65,12 +66,33 @@ run_step() {
65
66
  cmd+=(--resume "$SESSION_ID")
66
67
  fi
67
68
 
69
+ local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
70
+ local pgid_file="$SPRINT_DIR/.researchloop/${step_name}_pgid"
71
+ local result_sentinel="$SPRINT_DIR/.researchloop/${step_name}_result_seen"
72
+ # Reset per-step watchdog state and announce the active step to the
73
+ # heartbeat loop.
74
+ rm -f "$pgid_file" "$result_sentinel"
75
+ echo "$step_name" > "$ACTIVE_STEP_FILE"
76
+
68
77
  # Run claude with streaming JSON output.
69
78
  # Each line is a JSON event — we pipe through a filter that:
70
79
  # 1. Logs human-readable summaries of what Claude is doing
71
80
  # 2. Saves the full stream for post-processing
72
- local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
73
- "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) | tee "$output_file" | python3 -u -c "
81
+ # 3. Writes ${step}_result_seen when claude emits its terminal `result`
82
+ # event, so the heartbeat watchdog can detect "done but pipeline open"
83
+ #
84
+ # claude is launched under `setsid` so it (and any subprocesses spawned
85
+ # by Bash tool calls) share a dedicated process group. If a Bash-tool
86
+ # subprocess later leaks (e.g. hung network call) and keeps the
87
+ # tee|python3 pipeline open after claude has emitted result, the
88
+ # heartbeat watchdog can SIGTERM the whole group cleanly via the pgid.
89
+ (
90
+ setsid "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) &
91
+ claude_pid=$!
92
+ # setsid makes the child a session/group leader; pgid == pid.
93
+ echo "$claude_pid" > "$pgid_file"
94
+ wait "$claude_pid"
95
+ ) | tee "$output_file" | python3 -u -c "
74
96
  import sys, json, os, time
75
97
  log = open('$LOG_FILE', 'a', buffering=1)
76
98
  last_tool_time = 0
@@ -112,17 +134,32 @@ for line in sys.stdin:
112
134
  sid = evt.get('session_id', '')
113
135
  if sid:
114
136
  open('$SPRINT_DIR/.researchloop/session_id', 'w').write(sid)
137
+ # Mark claude's terminal result so the heartbeat watchdog can kill
138
+ # any leaked subprocesses keeping the pipeline open past this point.
139
+ try:
140
+ open('$result_sentinel', 'w').close()
141
+ except OSError:
142
+ pass
115
143
  result = evt.get('result', '') or ''
116
144
  log.write(f' Done ({tool_count} tool calls, {len(result)} chars output)\n')
117
145
  log.flush()
118
146
  log.close()
119
147
  " || {
120
148
  local rc=$?
121
- log "ERROR: Claude failed on step $step_name (exit $rc)"
122
- tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
123
- return 1
149
+ # Treat as failure only if claude never emitted its result event. If
150
+ # the sentinel exists, the work is done — the nonzero exit is just
151
+ # the watchdog cleaning up a leaked Bash-tool subprocess.
152
+ if [ ! -f "$result_sentinel" ]; then
153
+ log "ERROR: Claude failed on step $step_name (exit $rc)"
154
+ tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
155
+ rm -f "$ACTIVE_STEP_FILE"
156
+ return 1
157
+ fi
158
+ log "NOTE: $step_name pipeline closed via watchdog after result event (exit $rc); treating step as complete"
124
159
  }
125
160
 
161
+ rm -f "$ACTIVE_STEP_FILE"
162
+
126
163
  if [ -f "$SPRINT_DIR/.researchloop/session_id" ]; then
127
164
  SESSION_ID=$(cat "$SPRINT_DIR/.researchloop/session_id")
128
165
  fi
@@ -204,22 +241,36 @@ print(json.dumps({
204
241
  }
205
242
 
206
243
  _heartbeat_loop() {
207
- # Watchdog: warn once if the active step's stream-json file goes silent
208
- # for too long while heartbeats keep firing. Signature of a hung pipeline
209
- # (claude exited but a leaked fd keeps tee|python3 blocked).
244
+ # Watchdog: detect and recover from hung pipelines.
245
+ #
246
+ # STUCK_PIPE warn: the active step's stream-json file goes silent for
247
+ # 5+ minutes while heartbeats keep firing. Signature of claude having
248
+ # exited but a leaked Bash-tool subprocess holding the pipeline's
249
+ # stdout fd open.
250
+ #
251
+ # Hung-after-result kill: claude emits a terminal `result` event when
252
+ # its work is done, which the stream filter records as a sentinel file.
253
+ # If the bash pipeline does not close within result_grace_secs after
254
+ # that, SIGTERM the entire claude process group via its pgid so the
255
+ # leaked subprocesses die and run_step can return. Escalate to SIGKILL
256
+ # if the group ignores SIGTERM.
210
257
  local stuck_warned=0
211
258
  local stuck_threshold_secs=300
259
+ local result_grace_secs=60
260
+ local kill_escalate_secs=15
261
+ local last_killed_pgid=""
212
262
  while true; do
213
263
  sleep 60
264
+ # Detect current step from the active_step file written by run_step.
265
+ local active_step=""
266
+ if [ -f "$SPRINT_DIR/.researchloop/active_step" ]; then
267
+ active_step=$(cat "$SPRINT_DIR/.researchloop/active_step" 2>/dev/null || true)
268
+ fi
214
269
  local step="running"
215
- if [ -f "$LOG_FILE" ]; then
216
- local last_step
217
- last_step=$(grep -o '>>> Starting step: [a-z_]*' "$LOG_FILE" | tail -1 | sed 's/>>> Starting step: //' || true)
218
- if [ -n "$last_step" ]; then
219
- step="running ($last_step)"
220
- fi
270
+ if [ -n "$active_step" ]; then
271
+ step="running ($active_step)"
221
272
  fi
222
- # STUCK_PIPE watchdog (detect-and-warn only).
273
+ # STUCK_PIPE watchdog (detect-and-warn).
223
274
  local newest_jsonl
224
275
  newest_jsonl=$(ls -t "$SPRINT_DIR/.researchloop"/*_output.jsonl 2>/dev/null | head -1)
225
276
  if [ -n "$newest_jsonl" ] && [ -f "$newest_jsonl" ]; then
@@ -236,6 +287,26 @@ _heartbeat_loop() {
236
287
  stuck_warned=0
237
288
  fi
238
289
  fi
290
+ # Hung-after-result recovery: kill the claude pgid if the result
291
+ # sentinel is older than the grace period.
292
+ if [ -n "$active_step" ]; then
293
+ local sentinel="$SPRINT_DIR/.researchloop/${active_step}_result_seen"
294
+ local pgid_file="$SPRINT_DIR/.researchloop/${active_step}_pgid"
295
+ if [ -f "$sentinel" ] && [ -f "$pgid_file" ]; then
296
+ local s_mtime now_t s_age pgid
297
+ s_mtime=$(stat -c %Y "$sentinel" 2>/dev/null || stat -f %m "$sentinel" 2>/dev/null || echo 0)
298
+ now_t=$(date +%s)
299
+ s_age=$((now_t - s_mtime))
300
+ pgid=$(cat "$pgid_file" 2>/dev/null || true)
301
+ if [ -n "$pgid" ] && [ "$s_age" -ge "$result_grace_secs" ] && [ "$last_killed_pgid" != "$pgid" ]; then
302
+ echo "[$(date -u +%H:%M:%S)] STUCK_PIPE recovery: $active_step result event seen ${s_age}s ago but pipeline still open; SIGTERM pgid $pgid" >> "$LOG_FILE"
303
+ kill -TERM -"$pgid" 2>/dev/null || true
304
+ # Escalate to SIGKILL in case the group ignores SIGTERM.
305
+ ( sleep "$kill_escalate_secs" && kill -KILL -"$pgid" 2>/dev/null || true ) &
306
+ last_killed_pgid="$pgid"
307
+ fi
308
+ fi
309
+ fi
239
310
  send_heartbeat "$step"
240
311
  echo "[$(date -u +%H:%M:%S)] ... still running (heartbeat)" >> "$LOG_FILE"
241
312
  done
@@ -53,6 +53,7 @@ log() {
53
53
 
54
54
  # --- Helper: run claude and extract session ID ---
55
55
  SESSION_ID=""
56
+ ACTIVE_STEP_FILE="$SPRINT_DIR/.researchloop/active_step"
56
57
  run_step() {
57
58
  local prompt_file="$1"
58
59
  local step_name="$2"
@@ -64,12 +65,33 @@ run_step() {
64
65
  cmd+=(--resume "$SESSION_ID")
65
66
  fi
66
67
 
68
+ local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
69
+ local pgid_file="$SPRINT_DIR/.researchloop/${step_name}_pgid"
70
+ local result_sentinel="$SPRINT_DIR/.researchloop/${step_name}_result_seen"
71
+ # Reset per-step watchdog state and announce the active step to the
72
+ # heartbeat loop.
73
+ rm -f "$pgid_file" "$result_sentinel"
74
+ echo "$step_name" > "$ACTIVE_STEP_FILE"
75
+
67
76
  # Run claude with streaming JSON output.
68
77
  # Each line is a JSON event — we pipe through a filter that:
69
78
  # 1. Logs human-readable summaries of what Claude is doing
70
79
  # 2. Saves the full stream for post-processing
71
- local output_file="$SPRINT_DIR/.researchloop/${step_name}_output.jsonl"
72
- "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) | tee "$output_file" | python3 -u -c "
80
+ # 3. Writes ${step}_result_seen when claude emits its terminal `result`
81
+ # event, so the heartbeat watchdog can detect "done but pipeline open"
82
+ #
83
+ # claude is launched under `setsid` so it (and any subprocesses spawned
84
+ # by Bash tool calls) share a dedicated process group. If a Bash-tool
85
+ # subprocess later leaks (e.g. hung network call) and keeps the
86
+ # tee|python3 pipeline open after claude has emitted result, the
87
+ # heartbeat watchdog can SIGTERM the whole group cleanly via the pgid.
88
+ (
89
+ setsid "${cmd[@]}" 2> >(tee -a "$LOG_FILE" >&2) &
90
+ claude_pid=$!
91
+ # setsid makes the child a session/group leader; pgid == pid.
92
+ echo "$claude_pid" > "$pgid_file"
93
+ wait "$claude_pid"
94
+ ) | tee "$output_file" | python3 -u -c "
73
95
  import sys, json, os, time
74
96
  log = open('$LOG_FILE', 'a', buffering=1)
75
97
  last_tool_time = 0
@@ -112,17 +134,32 @@ for line in sys.stdin:
112
134
  sid = evt.get('session_id', '')
113
135
  if sid:
114
136
  open('$SPRINT_DIR/.researchloop/session_id', 'w').write(sid)
137
+ # Mark claude's terminal result so the heartbeat watchdog can kill
138
+ # any leaked subprocesses keeping the pipeline open past this point.
139
+ try:
140
+ open('$result_sentinel', 'w').close()
141
+ except OSError:
142
+ pass
115
143
  result = evt.get('result', '') or ''
116
144
  log.write(f' Done ({tool_count} tool calls, {len(result)} chars output)\n')
117
145
  log.flush()
118
146
  log.close()
119
147
  " || {
120
148
  local rc=$?
121
- log "ERROR: Claude failed on step $step_name (exit $rc)"
122
- tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
123
- return 1
149
+ # Treat as failure only if claude never emitted its result event. If
150
+ # the sentinel exists, the work is done — the nonzero exit is just
151
+ # the watchdog cleaning up a leaked Bash-tool subprocess.
152
+ if [ ! -f "$result_sentinel" ]; then
153
+ log "ERROR: Claude failed on step $step_name (exit $rc)"
154
+ tail -20 "$output_file" >> "$LOG_FILE" 2>/dev/null
155
+ rm -f "$ACTIVE_STEP_FILE"
156
+ return 1
157
+ fi
158
+ log "NOTE: $step_name pipeline closed via watchdog after result event (exit $rc); treating step as complete"
124
159
  }
125
160
 
161
+ rm -f "$ACTIVE_STEP_FILE"
162
+
126
163
  # Read session ID (written by the stream filter above).
127
164
  if [ -f "$SPRINT_DIR/.researchloop/session_id" ]; then
128
165
  SESSION_ID=$(cat "$SPRINT_DIR/.researchloop/session_id")
@@ -206,23 +243,36 @@ print(json.dumps({
206
243
  }
207
244
 
208
245
  _heartbeat_loop() {
209
- # Watchdog: warn once if the active step's stream-json file goes silent
210
- # for too long while heartbeats keep firing. Signature of a hung pipeline
211
- # (claude exited but a leaked fd keeps tee|python3 blocked).
246
+ # Watchdog: detect and recover from hung pipelines.
247
+ #
248
+ # STUCK_PIPE warn: the active step's stream-json file goes silent for
249
+ # 5+ minutes while heartbeats keep firing. Signature of claude having
250
+ # exited but a leaked Bash-tool subprocess holding the pipeline's
251
+ # stdout fd open.
252
+ #
253
+ # Hung-after-result kill: claude emits a terminal `result` event when
254
+ # its work is done, which the stream filter records as a sentinel file.
255
+ # If the bash pipeline does not close within result_grace_secs after
256
+ # that, SIGTERM the entire claude process group via its pgid so the
257
+ # leaked subprocesses die and run_step can return. Escalate to SIGKILL
258
+ # if the group ignores SIGTERM.
212
259
  local stuck_warned=0
213
260
  local stuck_threshold_secs=300
261
+ local result_grace_secs=60
262
+ local kill_escalate_secs=15
263
+ local last_killed_pgid=""
214
264
  while true; do
215
265
  sleep 60
216
- # Detect current step from log.
266
+ # Detect current step from the active_step file written by run_step.
267
+ local active_step=""
268
+ if [ -f "$SPRINT_DIR/.researchloop/active_step" ]; then
269
+ active_step=$(cat "$SPRINT_DIR/.researchloop/active_step" 2>/dev/null || true)
270
+ fi
217
271
  local step="running"
218
- if [ -f "$LOG_FILE" ]; then
219
- local last_step
220
- last_step=$(grep -o '>>> Starting step: [a-z_]*' "$LOG_FILE" | tail -1 | sed 's/>>> Starting step: //' || true)
221
- if [ -n "$last_step" ]; then
222
- step="running ($last_step)"
223
- fi
272
+ if [ -n "$active_step" ]; then
273
+ step="running ($active_step)"
224
274
  fi
225
- # STUCK_PIPE watchdog (detect-and-warn only).
275
+ # STUCK_PIPE watchdog (detect-and-warn).
226
276
  local newest_jsonl
227
277
  newest_jsonl=$(ls -t "$SPRINT_DIR/.researchloop"/*_output.jsonl 2>/dev/null | head -1)
228
278
  if [ -n "$newest_jsonl" ] && [ -f "$newest_jsonl" ]; then
@@ -239,6 +289,26 @@ _heartbeat_loop() {
239
289
  stuck_warned=0
240
290
  fi
241
291
  fi
292
+ # Hung-after-result recovery: kill the claude pgid if the result
293
+ # sentinel is older than the grace period.
294
+ if [ -n "$active_step" ]; then
295
+ local sentinel="$SPRINT_DIR/.researchloop/${active_step}_result_seen"
296
+ local pgid_file="$SPRINT_DIR/.researchloop/${active_step}_pgid"
297
+ if [ -f "$sentinel" ] && [ -f "$pgid_file" ]; then
298
+ local s_mtime now_t s_age pgid
299
+ s_mtime=$(stat -c %Y "$sentinel" 2>/dev/null || stat -f %m "$sentinel" 2>/dev/null || echo 0)
300
+ now_t=$(date +%s)
301
+ s_age=$((now_t - s_mtime))
302
+ pgid=$(cat "$pgid_file" 2>/dev/null || true)
303
+ if [ -n "$pgid" ] && [ "$s_age" -ge "$result_grace_secs" ] && [ "$last_killed_pgid" != "$pgid" ]; then
304
+ echo "[$(date -u +%H:%M:%S)] STUCK_PIPE recovery: $active_step result event seen ${s_age}s ago but pipeline still open; SIGTERM pgid $pgid" >> "$LOG_FILE"
305
+ kill -TERM -"$pgid" 2>/dev/null || true
306
+ # Escalate to SIGKILL in case the group ignores SIGTERM.
307
+ ( sleep "$kill_escalate_secs" && kill -KILL -"$pgid" 2>/dev/null || true ) &
308
+ last_killed_pgid="$pgid"
309
+ fi
310
+ fi
311
+ fi
242
312
  send_heartbeat "$step"
243
313
  # Also write a heartbeat timestamp to the log so it's visible.
244
314
  echo "[$(date -u +%H:%M:%S)] ... still running (heartbeat)" >> "$LOG_FILE"
@@ -48,12 +48,19 @@ After writing report.md, create `report.pdf` in the current directory using Pyth
48
48
  - Style: remove outer edges, add subtle row shading or gridlines for readability
49
49
  - If a table has many columns (>6), use font size 7 and/or landscape orientation for that page
50
50
 
51
- **Images:**
52
- - When generating plots earlier in the pipeline, save at **dpi=150 or higher** so text in plots is sharp in the PDF
53
- - When embedding in the PDF figure, use `ax.imshow(img, extent=[x_left, x_right, y_low, y_high], aspect='auto')` or insert via `fig.figimage()`. Scale to fit within margins while preserving aspect ratio
54
- - Do NOT downscale high-res images matplotlib's PdfPages will embed them at their native resolution
51
+ **Images — embed vector plots as vectors, not rasters:**
52
+ - Plots from `results/` are typically saved as both `.png` (raster) and `.pdf` (vector). Always embed the `.pdf`. **Anything passed through `ax.imshow()` or `fig.figimage()` gets rasterized when PdfPages saves the page** — the source file's DPI is discarded, only the figure's savefig DPI matters, and the result is still raster. This is the dominant cause of blurry plots in the final report
53
+ - Two-pass approach to keep plots vector:
54
+ 1. **Matplotlib pass:** for each plot, draw a placeholder rectangle (or leave whitespace) at the intended position and record its bbox in page-fraction coordinates. Render the caption normally
55
+ 2. **PyMuPDF overlay pass:** after `PdfPages` writes `report.pdf`, open it with `fitz` and for each placeholder call `page.show_pdf_page(rect, src_doc, 0)` where `src_doc = fitz.open(plot_pdf_path)` and `rect = fitz.Rect(x0, y0, x1, y1)` is the placeholder's bbox converted to points (page-fraction × 72 × page-size-in-inches). Remember to flip the y-axis: matplotlib figure fractions measure y from the bottom of the page, whereas PyMuPDF rectangles measure y from the top (`y_pt = (1 - y_frac) × 72 × page-height-in-inches`). Overwrite `report.pdf` with the merged result
56
+ - Only use `ax.imshow(img, extent=..., aspect='auto')` for genuinely raster-only sources (photos, external screenshots) where no `.pdf` companion exists. Source those at dpi=150+
57
+ - If `PyMuPDF` is unavailable, fall back to `pypdf`'s `merge_transformed_page` (same idea, different API) rather than rasterizing
55
58
  - Leave space above/below for captions
56
59
 
60
+ **Verify the result is actually vector:**
61
+ - For each embedded plot's xref, `fitz.open("report.pdf").xref_get_key(xref, "Subtype")` returns a `(type, value)` tuple; it should be `('name', '/Form')`, never `('name', '/Image')`
62
+ - A vector-only report is typically <150 KB; if `report.pdf` is 400 KB+ you almost certainly rasterized something — re-check the overlay pass
63
+
57
64
  **Pagination:**
58
65
  - Track y_cursor starting at y_top, decrementing after each element
59
66
  - When y_cursor < y_bottom + 0.05, save the page and start a new one
@@ -158,8 +158,36 @@ class TestJobScriptWatchdog:
158
158
  # Once-per-stuck-episode flag, not log-every-heartbeat.
159
159
  assert "stuck_warned" in script
160
160
 
161
+ def _assert_hung_pipeline_recovery_present(self, script: str) -> None:
162
+ # claude must run in its own session so the watchdog can SIGTERM the
163
+ # whole group (claude + any leaked Bash-tool subprocesses) by pgid.
164
+ assert "setsid " in script
165
+ # Per-step pgid + result sentinel files used by the watchdog.
166
+ assert "_pgid" in script
167
+ assert "_result_seen" in script
168
+ # Stream filter writes the sentinel on the terminal `result` event.
169
+ assert "open('$result_sentinel'" in script
170
+ # The watchdog escalates SIGTERM → SIGKILL against the pgid (negative
171
+ # number argument to kill targets a process group).
172
+ assert "kill -TERM -" in script
173
+ assert "kill -KILL -" in script
174
+ # Grace period between result event and pgid kill.
175
+ assert "result_grace_secs=60" in script
176
+ # Recovery messaging in the log so operators can tell when a kill ran.
177
+ assert "STUCK_PIPE recovery" in script
178
+ # run_step treats nonzero exit as success iff the sentinel exists.
179
+ assert 'if [ ! -f "$result_sentinel" ]; then' in script
180
+ # active_step file replaces the old log-grep step detection.
181
+ assert "ACTIVE_STEP_FILE=" in script
182
+
161
183
  def test_slurm_template_includes_watchdog(self):
162
184
  self._assert_watchdog_present(_render_job_template("slurm.sh.j2"))
163
185
 
164
186
  def test_sge_template_includes_watchdog(self):
165
187
  self._assert_watchdog_present(_render_job_template("sge.sh.j2"))
188
+
189
+ def test_slurm_template_includes_hung_pipeline_recovery(self):
190
+ self._assert_hung_pipeline_recovery_present(_render_job_template("slurm.sh.j2"))
191
+
192
+ def test_sge_template_includes_hung_pipeline_recovery(self):
193
+ self._assert_hung_pipeline_recovery_present(_render_job_template("sge.sh.j2"))
@@ -1152,7 +1152,7 @@ wheels = [
1152
1152
 
1153
1153
  [[package]]
1154
1154
  name = "researchloop"
1155
- version = "0.3.1"
1155
+ version = "0.3.2"
1156
1156
  source = { editable = "." }
1157
1157
  dependencies = [
1158
1158
  { name = "aiosqlite" },
@@ -1 +0,0 @@
1
- __version__ = "0.3.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes