specweave 0.33.3 → 0.33.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +77 -19
- package/dist/src/cli/cleanup-zombies.js +8 -5
- package/dist/src/cli/cleanup-zombies.js.map +1 -1
- package/dist/src/config/types.d.ts +203 -1208
- package/dist/src/config/types.d.ts.map +1 -1
- package/dist/src/importers/jira-importer.d.ts +10 -0
- package/dist/src/importers/jira-importer.d.ts.map +1 -1
- package/dist/src/importers/jira-importer.js +55 -5
- package/dist/src/importers/jira-importer.js.map +1 -1
- package/dist/src/init/architecture/types.d.ts +33 -140
- package/dist/src/init/architecture/types.d.ts.map +1 -1
- package/dist/src/init/compliance/types.d.ts +30 -27
- package/dist/src/init/compliance/types.d.ts.map +1 -1
- package/dist/src/init/repo/types.d.ts +11 -34
- package/dist/src/init/repo/types.d.ts.map +1 -1
- package/dist/src/init/research/src/config/types.d.ts +15 -82
- package/dist/src/init/research/src/config/types.d.ts.map +1 -1
- package/dist/src/init/research/types.d.ts +38 -93
- package/dist/src/init/research/types.d.ts.map +1 -1
- package/dist/src/init/team/types.d.ts +4 -42
- package/dist/src/init/team/types.d.ts.map +1 -1
- package/dist/src/sync/closure-metrics.d.ts +102 -0
- package/dist/src/sync/closure-metrics.d.ts.map +1 -0
- package/dist/src/sync/closure-metrics.js +267 -0
- package/dist/src/sync/closure-metrics.js.map +1 -0
- package/dist/src/sync/sync-coordinator.d.ts +29 -0
- package/dist/src/sync/sync-coordinator.d.ts.map +1 -1
- package/dist/src/sync/sync-coordinator.js +153 -16
- package/dist/src/sync/sync-coordinator.js.map +1 -1
- package/dist/src/utils/notification-constants.d.ts +85 -0
- package/dist/src/utils/notification-constants.d.ts.map +1 -0
- package/dist/src/utils/notification-constants.js +129 -0
- package/dist/src/utils/notification-constants.js.map +1 -0
- package/dist/src/utils/platform-utils.d.ts +13 -3
- package/dist/src/utils/platform-utils.d.ts.map +1 -1
- package/dist/src/utils/platform-utils.js +17 -6
- package/dist/src/utils/platform-utils.js.map +1 -1
- package/package.json +1 -1
- package/plugins/specweave/commands/specweave-increment.md +46 -0
- package/plugins/specweave/commands/specweave-jobs.md +153 -8
- package/plugins/specweave/hooks/spec-project-validator.sh +24 -2
- package/plugins/specweave/hooks/universal/hook-wrapper.cmd +26 -26
- package/plugins/specweave/hooks/universal/session-start.cmd +16 -16
- package/plugins/specweave/hooks/universal/session-start.ps1 +16 -16
- package/plugins/specweave/scripts/session-watchdog.sh +278 -130
- package/plugins/specweave/skills/increment-planner/SKILL.md +48 -18
- package/plugins/specweave/skills/increment-planner/templates/spec-multi-project.md +27 -14
- package/plugins/specweave/skills/increment-planner/templates/spec-single-project.md +16 -5
- package/plugins/specweave/skills/spec-generator/SKILL.md +74 -15
- package/plugins/specweave-github/hooks/.specweave/logs/hooks-debug.log +0 -738
- package/plugins/specweave-release/hooks/.specweave/logs/dora-tracking.log +0 -1107
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
|
-
# SpecWeave Session Watchdog
|
|
3
|
-
# Monitors Claude Code sessions and alerts when
|
|
4
|
-
#
|
|
2
|
+
# SpecWeave Session Watchdog v2.0
|
|
3
|
+
# Monitors Claude Code sessions and alerts ONLY when something is REALLY wrong
|
|
4
|
+
#
|
|
5
|
+
# FIXES (2025-12-10):
|
|
6
|
+
# - ELIMINATED FALSE POSITIVES: No longer triggers on stale files alone
|
|
7
|
+
# - SMART DETECTION: Verifies actual stuck processes, not just file ages
|
|
8
|
+
# - SEVERITY LEVELS: Only CRITICAL issues trigger notifications
|
|
9
|
+
# - DIAGNOSTICS: Writes detailed logs for /specweave:jobs to display
|
|
10
|
+
#
|
|
11
|
+
# Usage: bash session-watchdog.sh [--daemon] [--quiet] [--interval=60] [--threshold=300]
|
|
5
12
|
|
|
6
13
|
set -euo pipefail
|
|
7
14
|
|
|
@@ -10,16 +17,24 @@ STUCK_THRESHOLD_SECONDS="${STUCK_THRESHOLD:-300}" # 5 minutes
|
|
|
10
17
|
CHECK_INTERVAL="${CHECK_INTERVAL:-60}" # 1 minute
|
|
11
18
|
SPECWEAVE_ROOT="${SPECWEAVE_ROOT:-.specweave}"
|
|
12
19
|
SIGNAL_FILE="${SPECWEAVE_ROOT}/state/.session-stuck"
|
|
13
|
-
|
|
20
|
+
DIAGNOSTICS_FILE="${SPECWEAVE_ROOT}/state/.watchdog-diagnostics.json"
|
|
14
21
|
DAEMON_MODE=false
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
22
|
+
QUIET_MODE=false
|
|
23
|
+
|
|
24
|
+
# Severity levels (only CRITICAL triggers notifications)
|
|
25
|
+
SEVERITY_INFO=0
|
|
26
|
+
SEVERITY_WARNING=1
|
|
27
|
+
SEVERITY_CRITICAL=2
|
|
28
|
+
|
|
29
|
+
# Track consecutive warnings (avoids single-check false positives)
|
|
30
|
+
CONSECUTIVE_WARNINGS=0
|
|
31
|
+
CONSECUTIVE_THRESHOLD=3 # Need 3 consecutive warnings before alerting
|
|
18
32
|
|
|
19
33
|
# Colors
|
|
20
34
|
RED='\033[0;31m'
|
|
21
35
|
YELLOW='\033[1;33m'
|
|
22
36
|
GREEN='\033[0;32m'
|
|
37
|
+
BLUE='\033[0;34m'
|
|
23
38
|
NC='\033[0m'
|
|
24
39
|
|
|
25
40
|
# Parse arguments
|
|
@@ -28,6 +43,9 @@ for arg in "$@"; do
|
|
|
28
43
|
--daemon)
|
|
29
44
|
DAEMON_MODE=true
|
|
30
45
|
;;
|
|
46
|
+
--quiet)
|
|
47
|
+
QUIET_MODE=true
|
|
48
|
+
;;
|
|
31
49
|
--interval=*)
|
|
32
50
|
CHECK_INTERVAL="${arg#*=}"
|
|
33
51
|
;;
|
|
@@ -38,12 +56,26 @@ for arg in "$@"; do
|
|
|
38
56
|
done
|
|
39
57
|
|
|
40
58
|
# Print a timestamped message to stdout unless quiet mode is enabled.
# Globals:   QUIET_MODE (read) - "true" suppresses all output
# Arguments: $1 - message; backslash escapes such as colour codes are expanded
# Outputs:   "[HH:MM:SS] <message>" on stdout
# Returns:   0
log() {
  if [[ "${QUIET_MODE:-false}" == "true" ]]; then
    return 0
  fi
  # %b expands backslash escapes the same way `echo -e` does, without
  # depending on echo's option parsing.
  printf '%b\n' "[$(date '+%H:%M:%S')] ${1-}"
}
|
|
43
62
|
|
|
63
|
+
# Append a timestamped message to the watchdog log file.
# Unlike log(), this ignores quiet mode and never writes to the terminal.
# Globals:   SPECWEAVE_ROOT (read) - log goes to $SPECWEAVE_ROOT/logs/watchdog.log
# Arguments: $1 - message (written literally, no escape expansion)
# Returns:   0 always; logging is best-effort and must never abort a check
log_debug() {
  local log_dir="${SPECWEAVE_ROOT}/logs"

  # The log directory may not exist yet, so create it on demand.
  # Give up silently when that is not possible.
  mkdir -p "$log_dir" 2>/dev/null || return 0

  # The brace group is required: a failing `>>` redirection reports its error
  # before a trailing `2>/dev/null` on the same command would take effect.
  {
    printf '[%s] %s\n' "$(date '+%H:%M:%S')" "${1-}" >> "${log_dir}/watchdog.log"
  } 2>/dev/null || true
}
|
|
67
|
+
|
|
68
|
+
# Send notification ONLY for critical issues
|
|
44
69
|
send_notification() {
|
|
45
|
-
local
|
|
46
|
-
local
|
|
70
|
+
local severity="$1"
|
|
71
|
+
local title="$2"
|
|
72
|
+
local message="$3"
|
|
73
|
+
|
|
74
|
+
# Only notify for CRITICAL severity
|
|
75
|
+
if [[ "$severity" -lt "$SEVERITY_CRITICAL" ]]; then
|
|
76
|
+
log_debug "Skipping notification (severity=$severity, need=$SEVERITY_CRITICAL): $message"
|
|
77
|
+
return
|
|
78
|
+
fi
|
|
47
79
|
|
|
48
80
|
# macOS notification
|
|
49
81
|
if command -v osascript &> /dev/null; then
|
|
@@ -59,109 +91,275 @@ send_notification() {
|
|
|
59
91
|
get_file_age_seconds() {
|
|
60
92
|
local file="$1"
|
|
61
93
|
if [[ ! -f "$file" ]]; then
|
|
62
|
-
echo "
|
|
94
|
+
echo "-1" # -1 means file doesn't exist (not an error)
|
|
63
95
|
return
|
|
64
96
|
fi
|
|
65
97
|
|
|
66
|
-
local now
|
|
67
|
-
local mtime
|
|
98
|
+
local now mtime
|
|
68
99
|
now=$(date +%s)
|
|
69
100
|
|
|
70
101
|
if [[ "$(uname)" == "Darwin" ]]; then
|
|
71
|
-
mtime=$(stat -f %m "$file")
|
|
102
|
+
mtime=$(stat -f %m "$file" 2>/dev/null || echo "$now")
|
|
72
103
|
else
|
|
73
|
-
mtime=$(stat -c %Y "$file")
|
|
104
|
+
mtime=$(stat -c %Y "$file" 2>/dev/null || echo "$now")
|
|
74
105
|
fi
|
|
75
106
|
|
|
76
107
|
echo $((now - mtime))
|
|
77
108
|
}
|
|
78
109
|
|
|
110
|
+
# Check if a process is actually running
|
|
111
|
+
# Report whether a process with the given PID currently exists.
# Arguments: $1 - process ID read from a pid file (may be empty or garbage)
# Returns:   0 if signal 0 can be delivered to the PID, 1 otherwise
# NOTE(review): `kill -0` also fails for processes owned by another user,
# so such processes are reported as not running - confirm this is acceptable.
is_process_running() {
  local pid="${1-}"

  # Accept only positive decimal integers. Zero and negative values address
  # process groups (or every process) in kill(1), so passing them through
  # would report "running" for a pid file that holds no real PID.
  if [[ ! "$pid" =~ ^[0-9]+$ || "$pid" =~ ^0+$ ]]; then
    return 1
  fi

  kill -0 "$pid" 2>/dev/null
}
|
|
118
|
+
|
|
119
|
+
# SMART lock file check - for v2 lock directories, verifies the owning PROCESS is still alive instead of relying on file age alone
|
|
79
120
|
check_lock_file() {
|
|
121
|
+
local lock_dir="${SPECWEAVE_ROOT}/state/.processor.lock.d"
|
|
80
122
|
local lock_file="${SPECWEAVE_ROOT}/state/.processor.lock"
|
|
81
|
-
|
|
123
|
+
local result_severity=$SEVERITY_INFO
|
|
124
|
+
local result_message=""
|
|
125
|
+
|
|
126
|
+
# Check new lock directory format first (v2)
|
|
127
|
+
if [[ -d "$lock_dir" ]] && [[ -f "$lock_dir/pid" ]]; then
|
|
128
|
+
local lock_pid
|
|
129
|
+
lock_pid=$(cat "$lock_dir/pid" 2>/dev/null || echo "")
|
|
130
|
+
|
|
131
|
+
if [[ -n "$lock_pid" ]]; then
|
|
132
|
+
if is_process_running "$lock_pid"; then
|
|
133
|
+
# Process is actually running - check how long
|
|
134
|
+
local age
|
|
135
|
+
age=$(get_file_age_seconds "$lock_dir/pid")
|
|
136
|
+
if [[ "$age" -gt "$STUCK_THRESHOLD_SECONDS" ]]; then
|
|
137
|
+
result_severity=$SEVERITY_WARNING
|
|
138
|
+
result_message="Processor PID $lock_pid running for ${age}s (might be legitimate long operation)"
|
|
139
|
+
else
|
|
140
|
+
result_message="Processor PID $lock_pid active (${age}s)"
|
|
141
|
+
fi
|
|
142
|
+
else
|
|
143
|
+
# PID file exists but process is dead = STALE LOCK (cleanup needed, not stuck)
|
|
144
|
+
result_severity=$SEVERITY_WARNING
|
|
145
|
+
result_message="Stale lock: PID $lock_pid no longer running (auto-cleanup will handle)"
|
|
146
|
+
# Don't trigger alert - processor will clean this up on next run
|
|
147
|
+
fi
|
|
148
|
+
fi
|
|
149
|
+
# Check old lock file format (legacy)
|
|
150
|
+
elif [[ -f "$lock_file" ]]; then
|
|
82
151
|
local age
|
|
83
152
|
age=$(get_file_age_seconds "$lock_file")
|
|
84
153
|
if [[ "$age" -gt "$STUCK_THRESHOLD_SECONDS" ]]; then
|
|
85
|
-
|
|
86
|
-
|
|
154
|
+
result_severity=$SEVERITY_WARNING
|
|
155
|
+
result_message="Legacy lock file age: ${age}s (consider running cleanup-state.sh)"
|
|
87
156
|
fi
|
|
88
157
|
fi
|
|
89
|
-
|
|
158
|
+
|
|
159
|
+
# Write diagnostic
|
|
160
|
+
echo "lock_status=$result_severity" >> "$DIAGNOSTICS_FILE.tmp"
|
|
161
|
+
echo "lock_message=$result_message" >> "$DIAGNOSTICS_FILE.tmp"
|
|
162
|
+
|
|
163
|
+
if [[ -n "$result_message" ]]; then
|
|
164
|
+
log_debug "Lock check: $result_message"
|
|
165
|
+
fi
|
|
166
|
+
|
|
167
|
+
return $result_severity
|
|
90
168
|
}
|
|
91
169
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
170
|
+
# Check for ACTUAL zombie heredoc processes (CRITICAL - this is a real stuck indicator)
|
|
171
|
+
check_zombie_processes() {
|
|
172
|
+
local result_severity=$SEVERITY_INFO
|
|
173
|
+
local result_message=""
|
|
174
|
+
|
|
175
|
+
# Look for cat processes waiting for EOF (heredoc stuck)
|
|
176
|
+
local cat_zombies
|
|
177
|
+
cat_zombies=$(pgrep -f "cat.*EOF" 2>/dev/null | wc -l | tr -d ' \n' || echo "0")
|
|
178
|
+
cat_zombies="${cat_zombies:-0}"
|
|
179
|
+
[[ ! "$cat_zombies" =~ ^[0-9]+$ ]] && cat_zombies=0
|
|
180
|
+
|
|
181
|
+
# Look for bash processes that seem stuck on heredoc
|
|
182
|
+
local bash_heredocs
|
|
183
|
+
bash_heredocs=$(pgrep -f "bash.*<<" 2>/dev/null | wc -l | tr -d ' \n' || echo "0")
|
|
184
|
+
bash_heredocs="${bash_heredocs:-0}"
|
|
185
|
+
[[ ! "$bash_heredocs" =~ ^[0-9]+$ ]] && bash_heredocs=0
|
|
186
|
+
|
|
187
|
+
local total_zombies=$((cat_zombies + bash_heredocs))
|
|
188
|
+
|
|
189
|
+
if [[ "$total_zombies" -gt 0 ]]; then
|
|
190
|
+
result_severity=$SEVERITY_CRITICAL # This is DEFINITELY stuck!
|
|
191
|
+
result_message="$total_zombies zombie heredoc processes detected (cat=$cat_zombies, bash=$bash_heredocs)"
|
|
192
|
+
log "${RED}🚨 CRITICAL: $result_message${NC}"
|
|
100
193
|
fi
|
|
101
|
-
|
|
194
|
+
|
|
195
|
+
echo "zombie_count=$total_zombies" >> "$DIAGNOSTICS_FILE.tmp"
|
|
196
|
+
echo "zombie_message=$result_message" >> "$DIAGNOSTICS_FILE.tmp"
|
|
197
|
+
|
|
198
|
+
return $result_severity
|
|
102
199
|
}
|
|
103
200
|
|
|
104
|
-
|
|
201
|
+
# Check MCP connection health (WARNING level, not critical)
|
|
202
|
+
check_mcp_health() {
|
|
203
|
+
local result_severity=$SEVERITY_INFO
|
|
204
|
+
local result_message=""
|
|
205
|
+
local drops=0
|
|
105
206
|
local debug_log="$HOME/.claude/debug/latest"
|
|
207
|
+
|
|
106
208
|
if [[ -f "$debug_log" ]]; then
|
|
107
|
-
|
|
108
|
-
drops=$(grep -c "WS-IDE connection dropped"
|
|
209
|
+
# Count MCP drops in last 500 lines
|
|
210
|
+
drops=$(tail -500 "$debug_log" 2>/dev/null | grep -c "WS-IDE connection dropped" 2>/dev/null || echo "0")
|
|
109
211
|
drops="${drops//[^0-9]/}"
|
|
110
212
|
drops="${drops:-0}"
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
213
|
+
[[ ! "$drops" =~ ^[0-9]+$ ]] && drops=0
|
|
214
|
+
|
|
215
|
+
if [[ "$drops" -gt 10 ]]; then
|
|
216
|
+
result_severity=$SEVERITY_WARNING
|
|
217
|
+
result_message="MCP instability: $drops connection drops (consider restarting VS Code Extension Host)"
|
|
218
|
+
elif [[ "$drops" -gt 3 ]]; then
|
|
219
|
+
result_message="MCP: $drops drops detected (minor instability)"
|
|
114
220
|
fi
|
|
115
221
|
fi
|
|
116
|
-
return 0
|
|
117
|
-
}
|
|
118
222
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
zombies=$(pgrep -f "cat.*EOF" 2>/dev/null | wc -l | tr -d ' ')
|
|
122
|
-
if [[ "$zombies" -gt 0 ]]; then
|
|
123
|
-
log "${RED}⚠️ STUCK DETECTED: $zombies zombie heredoc processes${NC}"
|
|
124
|
-
return 1
|
|
125
|
-
fi
|
|
126
|
-
return 0
|
|
127
|
-
}
|
|
223
|
+
echo "mcp_drops=$drops" >> "$DIAGNOSTICS_FILE.tmp"
|
|
224
|
+
echo "mcp_message=$result_message" >> "$DIAGNOSTICS_FILE.tmp"
|
|
128
225
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
local reasons=()
|
|
226
|
+
return $result_severity
|
|
227
|
+
}
|
|
132
228
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
229
|
+
# Check for orphaned background jobs (informational, not critical)
|
|
230
|
+
check_orphaned_jobs() {
|
|
231
|
+
local result_severity=$SEVERITY_INFO
|
|
232
|
+
local result_message=""
|
|
233
|
+
local jobs_dir="${SPECWEAVE_ROOT}/state/jobs"
|
|
234
|
+
local orphaned_count=0
|
|
235
|
+
|
|
236
|
+
if [[ -d "$jobs_dir" ]]; then
|
|
237
|
+
for job_dir in "$jobs_dir"/*/; do
|
|
238
|
+
[[ ! -d "$job_dir" ]] && continue
|
|
239
|
+
|
|
240
|
+
local pid_file="${job_dir}worker.pid"
|
|
241
|
+
local config_file="${job_dir}config.json"
|
|
242
|
+
|
|
243
|
+
if [[ -f "$pid_file" ]] && [[ -f "$config_file" ]]; then
|
|
244
|
+
local pid
|
|
245
|
+
pid=$(cat "$pid_file" 2>/dev/null || echo "")
|
|
246
|
+
if [[ -n "$pid" ]] && ! is_process_running "$pid"; then
|
|
247
|
+
orphaned_count=$((orphaned_count + 1))
|
|
248
|
+
fi
|
|
249
|
+
fi
|
|
250
|
+
done
|
|
136
251
|
fi
|
|
137
252
|
|
|
138
|
-
if
|
|
139
|
-
|
|
140
|
-
reasons+=("No heartbeat")
|
|
253
|
+
if [[ "$orphaned_count" -gt 0 ]]; then
|
|
254
|
+
result_message="$orphaned_count orphaned job(s) found (run /specweave:jobs to see details)"
|
|
141
255
|
fi
|
|
142
256
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
257
|
+
echo "orphaned_jobs=$orphaned_count" >> "$DIAGNOSTICS_FILE.tmp"
|
|
258
|
+
echo "orphaned_message=$result_message" >> "$DIAGNOSTICS_FILE.tmp"
|
|
259
|
+
|
|
260
|
+
return $result_severity
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
# Write diagnostics file for /specweave:jobs to read
|
|
264
|
+
# Escape a string so it can be embedded in a JSON double-quoted value.
# Handles backslash, double quote, tab, CR and LF; other control characters
# are passed through unchanged.
_watchdog_json_escape() {
  local s="${1-}"
  s=${s//\\/\\\\}   # backslashes first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\t'/\\t}
  s=${s//$'\r'/\\r}
  s=${s//$'\n'/\\n}
  printf '%s' "$s"
}

# Normalise a value to a non-negative decimal integer; anything else becomes 0.
# The 10# prefix stops values such as "00" or "08" being parsed as octal.
_watchdog_uint() {
  local v="${1-}"
  if [[ "$v" =~ ^[0-9]+$ ]]; then
    printf '%s' "$((10#$v))"
  else
    printf '0'
  fi
}

# Assemble the JSON diagnostics file from the key=value lines collected in
# "$DIAGNOSTICS_FILE.tmp", then delete that temp file.
# Globals:   DIAGNOSTICS_FILE (read) - output path; "<path>.tmp" is the input
#            CONSECUTIVE_WARNINGS, STUCK_THRESHOLD_SECONDS, CHECK_INTERVAL (read)
# Arguments: $1 - overall severity (number)
#            $2 - overall status string
# Outputs:   writes $DIAGNOSTICS_FILE (replaced atomically via rename)
# Returns:   0 on success, 1 if the output file could not be written
write_diagnostics() {
  local overall_severity="${1-0}"
  local overall_status="${2-}"
  local checks_file="${DIAGNOSTICS_FILE}.tmp"
  local staging_file="${DIAGNOSTICS_FILE}.new.$$"
  local lock_status="" lock_message="" zombie_count="" zombie_message=""
  local mcp_drops="" mcp_message="" orphaned_jobs="" orphaned_message=""
  local line key value

  # Parse the temp file. Split on the FIRST '=' only, so messages that contain
  # or end with '=' survive intact. A missing temp file means all defaults.
  if [[ -r "$checks_file" ]]; then
    while IFS= read -r line || [[ -n "$line" ]]; do
      [[ "$line" == *=* ]] || continue
      key=${line%%=*}
      value=${line#*=}
      case "$key" in
        lock_status) lock_status="$value" ;;
        lock_message) lock_message="$value" ;;
        zombie_count) zombie_count="$value" ;;
        zombie_message) zombie_message="$value" ;;
        mcp_drops) mcp_drops="$value" ;;
        mcp_message) mcp_message="$value" ;;
        orphaned_jobs) orphaned_jobs="$value" ;;
        orphaned_message) orphaned_message="$value" ;;
      esac
    done < "$checks_file"
  fi

  # Numbers are forced to plain integers and strings are JSON-escaped, so the
  # file stays valid JSON whatever the checks recorded.
  {
    printf '{\n'
    printf '  "timestamp": "%s",\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    printf '  "severity": %s,\n' "$(_watchdog_uint "$overall_severity")"
    printf '  "status": "%s",\n' "$(_watchdog_json_escape "$overall_status")"
    printf '  "checks": {\n'
    printf '    "lock": { "severity": %s, "message": "%s" },\n' \
      "$(_watchdog_uint "$lock_status")" \
      "$(_watchdog_json_escape "${lock_message:-ok}")"
    printf '    "zombies": { "count": %s, "message": "%s" },\n' \
      "$(_watchdog_uint "$zombie_count")" \
      "$(_watchdog_json_escape "${zombie_message:-none}")"
    printf '    "mcp": { "drops": %s, "message": "%s" },\n' \
      "$(_watchdog_uint "$mcp_drops")" \
      "$(_watchdog_json_escape "${mcp_message:-stable}")"
    printf '    "orphanedJobs": { "count": %s, "message": "%s" }\n' \
      "$(_watchdog_uint "$orphaned_jobs")" \
      "$(_watchdog_json_escape "${orphaned_message:-none}")"
    printf '  },\n'
    printf '  "consecutiveWarnings": %s,\n' "$(_watchdog_uint "${CONSECUTIVE_WARNINGS:-0}")"
    printf '  "thresholdSeconds": %s,\n' "$(_watchdog_uint "${STUCK_THRESHOLD_SECONDS:-0}")"
    printf '  "checkIntervalSeconds": %s\n' "$(_watchdog_uint "${CHECK_INTERVAL:-0}")"
    printf '}\n'
  } > "$staging_file" || return 1

  # Rename into place so a concurrent reader never sees a half-written file.
  if ! mv -f "$staging_file" "$DIAGNOSTICS_FILE"; then
    rm -f "$staging_file"
    return 1
  fi

  rm -f "$checks_file"
}
|
|
146
311
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
312
|
+
check_session_health() {
|
|
313
|
+
local max_severity=$SEVERITY_INFO
|
|
314
|
+
local issues=()
|
|
315
|
+
|
|
316
|
+
# Initialize temp diagnostics file
|
|
317
|
+
mkdir -p "$(dirname "$DIAGNOSTICS_FILE")"
|
|
318
|
+
: > "$DIAGNOSTICS_FILE.tmp"
|
|
319
|
+
|
|
320
|
+
# Run all checks
|
|
321
|
+
# Each check reports its severity through its return status. `cmd || true`
# always leaves $? at 0, so the status must be captured in the `||` branch.
# Using `||` also keeps a non-zero severity from tripping `set -e`.
local lock_sev=0
check_lock_file || lock_sev=$?
[[ $lock_sev -gt $max_severity ]] && max_severity=$lock_sev

local zombie_sev=0
check_zombie_processes || zombie_sev=$?
[[ $zombie_sev -gt $max_severity ]] && max_severity=$zombie_sev

local mcp_sev=0
check_mcp_health || mcp_sev=$?
[[ $mcp_sev -gt $max_severity ]] && max_severity=$mcp_sev

# Orphaned jobs are informational only; the status is deliberately ignored.
check_orphaned_jobs || true
|
|
334
|
+
|
|
335
|
+
# Determine overall status
|
|
336
|
+
local overall_status="healthy"
|
|
337
|
+
if [[ $max_severity -eq $SEVERITY_CRITICAL ]]; then
|
|
338
|
+
overall_status="critical"
|
|
339
|
+
CONSECUTIVE_WARNINGS=$((CONSECUTIVE_WARNINGS + 1))
|
|
340
|
+
elif [[ $max_severity -eq $SEVERITY_WARNING ]]; then
|
|
341
|
+
overall_status="warning"
|
|
342
|
+
CONSECUTIVE_WARNINGS=$((CONSECUTIVE_WARNINGS + 1))
|
|
343
|
+
else
|
|
344
|
+
overall_status="healthy"
|
|
345
|
+
CONSECUTIVE_WARNINGS=0 # Reset on healthy check
|
|
150
346
|
fi
|
|
151
347
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
reason_str=$(IFS=", "; echo "${reasons[*]}")
|
|
348
|
+
# Write diagnostics for /specweave:jobs
|
|
349
|
+
write_diagnostics "$max_severity" "$overall_status"
|
|
155
350
|
|
|
351
|
+
# Only alert if CRITICAL and seen multiple consecutive times
|
|
352
|
+
if [[ $max_severity -eq $SEVERITY_CRITICAL ]] && [[ $CONSECUTIVE_WARNINGS -ge $CONSECUTIVE_THRESHOLD ]]; then
|
|
156
353
|
# Create signal file
|
|
157
354
|
echo "stuck_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$SIGNAL_FILE"
|
|
158
|
-
echo "
|
|
355
|
+
echo "severity=critical" >> "$SIGNAL_FILE"
|
|
356
|
+
echo "consecutive_warnings=$CONSECUTIVE_WARNINGS" >> "$SIGNAL_FILE"
|
|
159
357
|
|
|
160
|
-
send_notification "🚨 Claude Code
|
|
358
|
+
send_notification $SEVERITY_CRITICAL "🚨 Claude Code STUCK" "Zombie processes detected - Run cleanup-state.sh"
|
|
161
359
|
|
|
162
360
|
log "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
|
163
|
-
log "${RED}SESSION STUCK DETECTED${NC}"
|
|
164
|
-
log "${RED}
|
|
361
|
+
log "${RED}CRITICAL: SESSION STUCK DETECTED${NC}"
|
|
362
|
+
log "${RED}Consecutive warnings: $CONSECUTIVE_WARNINGS${NC}"
|
|
165
363
|
log ""
|
|
166
364
|
log "Recovery steps:"
|
|
167
365
|
log " 1. Press Ctrl+C multiple times in Claude Code terminal"
|
|
@@ -171,85 +369,35 @@ check_session_health() {
|
|
|
171
369
|
log "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
|
172
370
|
|
|
173
371
|
return 1
|
|
372
|
+
|
|
373
|
+
elif [[ $max_severity -eq $SEVERITY_WARNING ]]; then
|
|
374
|
+
# Log warning but don't notify (might be false positive)
|
|
375
|
+
log "${YELLOW}⚠️ Warning detected (${CONSECUTIVE_WARNINGS}/${CONSECUTIVE_THRESHOLD} before alert)${NC}"
|
|
376
|
+
return 0
|
|
377
|
+
|
|
174
378
|
else
|
|
175
|
-
#
|
|
379
|
+
# Healthy - remove signal file if exists
|
|
176
380
|
rm -f "$SIGNAL_FILE"
|
|
177
381
|
log "${GREEN}✓ Session healthy${NC}"
|
|
178
382
|
return 0
|
|
179
383
|
fi
|
|
180
384
|
}
|
|
181
385
|
|
|
182
|
-
# Coordination Functions
|
|
183
|
-
check_active_watchdog() {
|
|
184
|
-
# Check if another watchdog is running via session registry
|
|
185
|
-
node "${PROJECT_ROOT}/dist/src/cli/check-watchdog.js" 2>/dev/null || echo ""
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
register_watchdog() {
|
|
189
|
-
# Register this watchdog in session registry
|
|
190
|
-
node "${PROJECT_ROOT}/dist/src/cli/register-session.js" "$SESSION_ID" $$ "watchdog" 2>&1 | \
|
|
191
|
-
grep -v "^$" | head -3
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
update_watchdog_heartbeat() {
|
|
195
|
-
# Update watchdog heartbeat
|
|
196
|
-
node "${PROJECT_ROOT}/dist/src/cli/update-heartbeat.js" "$SESSION_ID" 2>/dev/null || true
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
cleanup_watchdog() {
|
|
200
|
-
# Remove watchdog from registry on exit
|
|
201
|
-
node "${PROJECT_ROOT}/dist/src/cli/remove-session.js" "$SESSION_ID" 2>&1 | \
|
|
202
|
-
grep -v "^$" | head -3
|
|
203
|
-
log "Watchdog cleanup complete"
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
run_cleanup_service() {
|
|
207
|
-
# Run zombie process cleanup
|
|
208
|
-
node "${PROJECT_ROOT}/dist/src/cli/cleanup-zombies.js" 60 2>&1 | \
|
|
209
|
-
grep -v "^$" | head -10 || true
|
|
210
|
-
}
|
|
211
|
-
|
|
212
386
|
# Main execution
|
|
213
387
|
if [[ "$DAEMON_MODE" == "true" ]]; then
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
log "${YELLOW}Watchdog already active (PID: $active_watchdog)${NC}"
|
|
219
|
-
log "Exiting to avoid duplicate watchdogs"
|
|
220
|
-
exit 0
|
|
221
|
-
fi
|
|
222
|
-
|
|
223
|
-
# Register as watchdog
|
|
224
|
-
register_watchdog
|
|
225
|
-
|
|
226
|
-
# Trap signals for graceful shutdown
|
|
227
|
-
trap cleanup_watchdog SIGTERM SIGINT EXIT
|
|
228
|
-
|
|
229
|
-
log "Starting session watchdog daemon (interval: ${CHECK_INTERVAL}s, threshold: ${STUCK_THRESHOLD_SECONDS}s)"
|
|
230
|
-
log "Watchdog session: $SESSION_ID (PID: $$)"
|
|
388
|
+
log "${BLUE}Starting session watchdog v2.0 (smart detection, CRITICAL-only alerts)${NC}"
|
|
389
|
+
log " Interval: ${CHECK_INTERVAL}s | Threshold: ${STUCK_THRESHOLD_SECONDS}s"
|
|
390
|
+
log " Consecutive warnings needed: ${CONSECUTIVE_THRESHOLD}"
|
|
391
|
+
log " Diagnostics: ${DIAGNOSTICS_FILE}"
|
|
231
392
|
log "Press Ctrl+C to stop"
|
|
232
393
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
update_watchdog_heartbeat
|
|
394
|
+
# Create log directory
|
|
395
|
+
mkdir -p "${SPECWEAVE_ROOT}/logs"
|
|
236
396
|
|
|
237
|
-
|
|
397
|
+
while true; do
|
|
238
398
|
check_session_health || true
|
|
239
|
-
|
|
240
|
-
# Run cleanup service
|
|
241
|
-
run_cleanup_service
|
|
242
|
-
|
|
243
|
-
# Check if parent process still exists (if we have a parent session)
|
|
244
|
-
if ! kill -0 $PPID 2>/dev/null; then
|
|
245
|
-
log "${YELLOW}Parent process died, exiting watchdog${NC}"
|
|
246
|
-
break
|
|
247
|
-
fi
|
|
248
|
-
|
|
249
399
|
sleep "$CHECK_INTERVAL"
|
|
250
400
|
done
|
|
251
|
-
|
|
252
|
-
cleanup_watchdog
|
|
253
401
|
else
|
|
254
402
|
log "Running single health check..."
|
|
255
403
|
check_session_health
|
|
@@ -178,26 +178,29 @@ echo "Using coverageTarget: $coverageTarget"
|
|
|
178
178
|
|
|
179
179
|
### STEP 0B: Get Project Context (MANDATORY - BLOCKING!)
|
|
180
180
|
|
|
181
|
-
**⛔
|
|
181
|
+
**⛔ THIS IS A HARD BLOCK - YOU CANNOT PROCEED WITHOUT PROJECT CONTEXT!**
|
|
182
182
|
|
|
183
|
-
|
|
183
|
+
**🚨 FAILURE TO COMPLETE THIS STEP = spec.md WILL BE BLOCKED BY VALIDATION HOOK!**
|
|
184
184
|
|
|
185
|
+
Before generating ANY spec.md content, you MUST:
|
|
186
|
+
|
|
187
|
+
**1. RUN THE CONTEXT API (via Bash tool):**
|
|
185
188
|
```bash
|
|
186
189
|
specweave context projects
|
|
187
190
|
```
|
|
188
191
|
|
|
189
|
-
|
|
192
|
+
**2. CAPTURE AND STORE THE OUTPUT:**
|
|
190
193
|
|
|
194
|
+
For 1-level structures:
|
|
191
195
|
```json
|
|
192
196
|
{
|
|
193
197
|
"level": 1,
|
|
194
198
|
"projects": [{"id": "my-app", "name": "My App"}],
|
|
195
|
-
"detectionReason": "multiProject configuration"
|
|
196
|
-
"source": "multi-project"
|
|
199
|
+
"detectionReason": "multiProject configuration"
|
|
197
200
|
}
|
|
198
201
|
```
|
|
199
202
|
|
|
200
|
-
|
|
203
|
+
For 2-level structures (ADO/JIRA boards):
|
|
201
204
|
```json
|
|
202
205
|
{
|
|
203
206
|
"level": 2,
|
|
@@ -207,27 +210,54 @@ This returns JSON with available projects and structure level:
|
|
|
207
210
|
{"id": "digital-ops", "name": "Digital Operations"},
|
|
208
211
|
{"id": "mobile-team", "name": "Mobile Team"}
|
|
209
212
|
]
|
|
210
|
-
}
|
|
211
|
-
"detectionReason": "ADO area path mapping configured",
|
|
212
|
-
"source": "ado-area-path"
|
|
213
|
+
}
|
|
213
214
|
}
|
|
214
215
|
```
|
|
215
216
|
|
|
216
|
-
**
|
|
217
|
+
**3. RESOLVE PROJECT/BOARD FOR EACH USER STORY:**
|
|
218
|
+
|
|
219
|
+
```
|
|
220
|
+
CONTEXT_OUTPUT = <output from specweave context projects>
|
|
221
|
+
|
|
222
|
+
For each US you will generate:
|
|
223
|
+
IF CONTEXT_OUTPUT.level == 1:
|
|
224
|
+
US.project = select from CONTEXT_OUTPUT.projects[].id
|
|
225
|
+
|
|
226
|
+
IF CONTEXT_OUTPUT.level == 2:
|
|
227
|
+
US.project = select from CONTEXT_OUTPUT.projects[].id
|
|
228
|
+
US.board = select from CONTEXT_OUTPUT.boardsByProject[project][].id
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
**4. NOW PROCEED TO STEP 1 (with resolved values stored)**
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
**VALIDATION RULES (ENFORCED BY HOOK):**
|
|
217
236
|
|
|
218
237
|
```
|
|
219
|
-
✅ REQUIRED:
|
|
220
|
-
✅ REQUIRED:
|
|
221
|
-
✅ REQUIRED:
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
238
|
+
✅ REQUIRED: Actually RUN "specweave context projects" command
|
|
239
|
+
✅ REQUIRED: Parse the JSON and extract project IDs
|
|
240
|
+
✅ REQUIRED: project field MUST match one of projects[].id from output
|
|
241
|
+
✅ REQUIRED: board field (2-level) MUST match one of boardsByProject[project][].id
|
|
242
|
+
✅ REQUIRED: Each US has **Project**: and **Board**: (2-level) with RESOLVED values
|
|
243
|
+
|
|
244
|
+
❌ FORBIDDEN: Skipping this step and generating spec.md directly
|
|
245
|
+
❌ FORBIDDEN: Inventing project names not in the API output
|
|
246
|
+
❌ FORBIDDEN: Using folder names as project (e.g., "sw-olysense")
|
|
247
|
+
❌ FORBIDDEN: Using {{PROJECT_ID}} or {{BOARD_ID}} placeholders
|
|
225
248
|
❌ FORBIDDEN: Creating spec.md for 2-level without board: field
|
|
249
|
+
❌ FORBIDDEN: Generating spec.md without running context API first
|
|
226
250
|
```
|
|
227
251
|
|
|
252
|
+
**WHY THIS IS BLOCKING:**
|
|
253
|
+
- Hook `spec-project-validator.sh` BLOCKS spec.md with placeholders or invalid projects
|
|
254
|
+
- Without resolved project/board, living docs sync FAILS
|
|
255
|
+
- Without resolved project/board, external tool sync (GitHub/JIRA/ADO) FAILS
|
|
256
|
+
- The user gets a blocking error and must fix the spec manually - BAD UX!
|
|
257
|
+
|
|
228
258
|
**Structure Levels:**
|
|
229
|
-
- **1-Level**: `internal/specs/{project}/FS-XXX/` - requires `project`
|
|
230
|
-
- **2-Level**: `internal/specs/{project}/{board}/FS-XXX/` - requires
|
|
259
|
+
- **1-Level**: `internal/specs/{project}/FS-XXX/` - requires `project` per US
|
|
260
|
+
- **2-Level**: `internal/specs/{project}/{board}/FS-XXX/` - requires `project` AND `board` per US
|
|
231
261
|
|
|
232
262
|
**Alternative: Interactive Selection:**
|
|
233
263
|
```bash
|