npm - specweave - Versions diffs - 1.0.28 → 1.0.30 - Mend

specweave 1.0.28 → 1.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/plugins/specweave/hooks/lib/semaphore.sh ADDED Viewed

@@ -0,0 +1,216 @@
+#!/bin/bash
+# semaphore.sh - File-based semaphore for limiting concurrent hook execution
+#
+# PROBLEM SOLVED:
+#   Process storms occur when many hooks spawn simultaneously, overwhelming the system.
+#   Instead of detecting storms and blocking EVERYTHING (current behavior), this semaphore
+#   limits concurrency properly - excess requests wait or timeout gracefully.
+#
+# USAGE:
+#   source semaphore.sh
+#   if acquire_semaphore "hook-name" 10 5000; then
+#     # Do work
+#     release_semaphore "hook-name"
+#   else
+#     # Timeout - return safe default
+#   fi
+#
+# DESIGN:
+#   - Uses file-based locks (portable, no dependencies)
+#   - Configurable max concurrent slots
+#   - Configurable timeout with exponential backoff
+#   - Auto-cleanup of stale locks (older than 30s)
+#   - Waiters poll with jittered exponential backoff (no FIFO ordering is enforced)
+#
+# v1.0.0 - Initial implementation (2025-12-17)
+set -o pipefail
+# === Configuration ===
+# Root directory for lock files, derived from SPECWEAVE_STATE_DIR when it is set.
+SEMAPHORE_DIR="${SPECWEAVE_STATE_DIR:-.specweave/state}/semaphores"
+# Stale lock threshold in seconds. Overridable from the environment, like
+# SEMAPHORE_DEBUG below; the default stays at 30.
+SEMAPHORE_MAX_AGE_SECONDS="${SEMAPHORE_MAX_AGE_SECONDS:-30}"
+# Set to 1 to have _sem_log write trace lines to stderr.
+SEMAPHORE_DEBUG="${SEMAPHORE_DEBUG:-0}"
+# === Initialization ===
+# Create the semaphore root directory (and parents) if it is missing.
+# Best-effort: mkdir's diagnostics are discarded and a failure is never
+# reported to the caller - the function always returns 0.
+_init_semaphore_dir() {
+  if ! mkdir -p "$SEMAPHORE_DIR" 2>/dev/null; then
+    return 0
+  fi
+}
+# === Logging ===
+# Write a timestamped debug line to stderr when SEMAPHORE_DEBUG is "1".
+# Args:    message words (joined with single spaces)
+# Returns: always 0, so a disabled logger never becomes the failing status
+#          of a caller that happens to log as its last command.
+_sem_log() {
+  [[ "${SEMAPHORE_DEBUG:-0}" == "1" ]] || return 0
+  local stamp
+  stamp=$(date +%H:%M:%S.%3N 2>/dev/null)
+  # %N is a GNU extension; other date implementations leave the directive
+  # unexpanded, so fall back to whole seconds when the result looks wrong.
+  [[ "$stamp" =~ ^[0-9:.]+$ ]] || stamp=$(date +%H:%M:%S)
+  printf '[SEM %s] %s\n' "$stamp" "$*" >&2
+  return 0
+}
+# === Cleanup stale locks ===
+# Remove lock files of the named semaphore whose modification time lies more
+# than SEMAPHORE_MAX_AGE_SECONDS in the past; such locks are considered abandoned.
+# Args:    name - semaphore name (subdirectory of SEMAPHORE_DIR)
+# Returns: 0
+cleanup_stale_locks() {
+  local name="$1"
+  local lock_dir="$SEMAPHORE_DIR/$name"
+  [[ -d "$lock_dir" ]] || return 0
+  # Bash scoping is dynamic: without this declaration the loop variable would
+  # overwrite a caller's own `lock_file` (acquire_semaphore declares one).
+  local lock_file mtime age now
+  now=$(date +%s)
+  # stat's format option differs between BSD (-f %m) and GNU (-c %Y); decide
+  # once instead of running uname for every lock file.
+  local stat_flag="-c" stat_fmt="%Y"
+  if [[ "$(uname)" == "Darwin" ]]; then
+    stat_flag="-f"
+    stat_fmt="%m"
+  fi
+  for lock_file in "$lock_dir"/*.lock; do
+    # An unmatched glob stays literal; skip anything that is not a regular file.
+    [[ -f "$lock_file" ]] || continue
+    # When stat fails the mtime falls back to 0, which makes the entry look
+    # older than the threshold so it is removed below.
+    mtime=$(stat "$stat_flag" "$stat_fmt" "$lock_file" 2>/dev/null || echo "0")
+    age=$((now - mtime))
+    if [[ "$age" -gt "$SEMAPHORE_MAX_AGE_SECONDS" ]]; then
+      _sem_log "Cleaning stale lock: $lock_file (age: ${age}s)"
+      rm -f "$lock_file" 2>/dev/null || true
+    fi
+  done
+  return 0
+}
+# === Count active slots ===
+# Print the number of *.lock files currently present for the named semaphore.
+# Args:   name - semaphore name (subdirectory of SEMAPHORE_DIR)
+# Output: integer on stdout ("0" when the semaphore directory does not exist)
+count_active_slots() {
+  local name="$1"
+  local lock_dir="$SEMAPHORE_DIR/$name"
+  if [[ ! -d "$lock_dir" ]]; then
+    echo "0"
+    return
+  fi
+  local count=0
+  # Declared local so the loop cannot overwrite a variable of the same name in
+  # the caller when this function is invoked outside a command substitution.
+  local lock_file
+  for lock_file in "$lock_dir"/*.lock; do
+    # An unmatched glob stays literal and fails the -f test, so it is not counted.
+    [[ -f "$lock_file" ]] && count=$((count + 1))
+  done
+  echo "$count"
+}
+# === Acquire semaphore slot ===
+# Args: name, max_slots, timeout_ms
+# Returns: 0 if acquired, 1 if timeout
+#
+# Records the slot in _SEMAPHORE_SLOT_ID / _SEMAPHORE_NAME / _SEMAPHORE_LOCK_FILE
+# (exported) so that release_semaphore can find the lock file afterwards.
+
+# Print the current time in milliseconds since the epoch. Uses gdate when it
+# is installed, otherwise tries date's %3N; if the result is not purely
+# numeric (the directive is unsupported) it degrades to seconds * 1000.
+_sem_now_ms() {
+  local stamp
+  if command -v gdate &>/dev/null; then
+    stamp=$(gdate +%s%3N 2>/dev/null)
+  else
+    stamp=$(date +%s%3N 2>/dev/null)
+  fi
+  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
+    echo "$stamp"
+  else
+    echo $(($(date +%s) * 1000))
+  fi
+}
+
+acquire_semaphore() {
+  local name="${1:-default}"
+  local max_slots="${2:-10}"
+  local timeout_ms="${3:-5000}"
+  # Non-numeric limits make the [[ -lt ]] / [[ -ge ]] tests below fail with an
+  # error, which would keep the wait loop from ever timing out; use the defaults.
+  [[ "$max_slots" =~ ^[0-9]+$ ]] || max_slots=10
+  [[ "$timeout_ms" =~ ^[0-9]+$ ]] || timeout_ms=5000
+  _init_semaphore_dir
+  local lock_dir="$SEMAPHORE_DIR/$name"
+  mkdir -p "$lock_dir" 2>/dev/null || true
+  # Generate unique slot ID (use random instead of %N which doesn't work on macOS)
+  local slot_id="$$-$RANDOM$RANDOM"
+  # Deliberately not called `lock_file`: cleanup_stale_locks loops over a
+  # variable of that name, and bash's dynamic scoping lets a callee's loop
+  # variable land in a caller's local. A distinct name keeps the write below
+  # pointed at this process's own path.
+  local slot_path="$lock_dir/${slot_id}.lock"
+  # Store slot ID for release
+  export _SEMAPHORE_SLOT_ID="$slot_id"
+  export _SEMAPHORE_NAME="$name"
+  export _SEMAPHORE_LOCK_FILE="$slot_path"
+  local start_time now elapsed current_slots jitter sleep_ms sleep_arg
+  start_time=$(_sem_now_ms)
+  local attempt=0
+  local backoff_ms=10  # Start with 10ms backoff
+  local max_backoff_ms=200
+  while true; do
+    # Cleanup stale locks periodically (every 5 attempts)
+    if (( attempt % 5 == 0 )); then
+      cleanup_stale_locks "$name"
+    fi
+    current_slots=$(count_active_slots "$name")
+    if [[ "$current_slots" -lt "$max_slots" ]]; then
+      # noclobber makes the redirect fail instead of overwriting an existing file.
+      # NOTE(review): counting and creating are two separate steps, so several
+      # processes can pass the check together and briefly exceed max_slots.
+      if (set -o noclobber; echo "$$" > "$slot_path") 2>/dev/null; then
+        _sem_log "Acquired slot $slot_id for $name (slots: $((current_slots + 1))/$max_slots)"
+        return 0
+      fi
+    fi
+    # Check timeout
+    now=$(_sem_now_ms)
+    elapsed=$((now - start_time))
+    if [[ "$elapsed" -ge "$timeout_ms" ]]; then
+      _sem_log "Timeout acquiring $name after ${elapsed}ms (slots: $current_slots/$max_slots)"
+      return 1
+    fi
+    # Exponential backoff with jitter
+    jitter=$((RANDOM % 20))
+    sleep_ms=$((backoff_ms + jitter))
+    _sem_log "Waiting for slot $name (attempt $attempt, backoff ${sleep_ms}ms, slots: $current_slots/$max_slots)"
+    # sleep_ms never reaches 1000 (backoff capped at 200, jitter below 20), so
+    # a zero-padded three-digit fraction of a second is sufficient.
+    printf -v sleep_arg '0.%03d' "$sleep_ms"
+    sleep "$sleep_arg" 2>/dev/null || sleep 0.1
+    # Increase backoff (exponential with cap)
+    backoff_ms=$((backoff_ms * 2))
+    if (( backoff_ms > max_backoff_ms )); then
+      backoff_ms=$max_backoff_ms
+    fi
+    attempt=$((attempt + 1))
+  done
+}
+# === Release semaphore slot ===
+# Delete the lock file recorded in _SEMAPHORE_LOCK_FILE (if it still exists)
+# and clear the three _SEMAPHORE_* bookkeeping variables.
+# Args: name - optional; used only in the debug log message
+release_semaphore() {
+  # The ${var:-} forms keep this callable from scripts running under `set -u`,
+  # including a repeat call after the variables have already been unset.
+  local name="${1:-${_SEMAPHORE_NAME:-}}"
+  local lock_file="${_SEMAPHORE_LOCK_FILE:-}"
+  if [[ -n "$lock_file" && -f "$lock_file" ]]; then
+    rm -f "$lock_file" 2>/dev/null || true
+    _sem_log "Released slot for $name"
+  fi
+  unset _SEMAPHORE_SLOT_ID
+  unset _SEMAPHORE_NAME
+  unset _SEMAPHORE_LOCK_FILE
+}
+# === Get semaphore status ===
+# Prune stale locks, then print a one-line JSON summary of the semaphore:
+#   {"name":NAME,"active":N,"max":M,"available":M-N}
+# Args: name (default "default"), max_slots (default 10)
+get_semaphore_status() {
+  local sem="${1:-default}"
+  local limit="${2:-10}"
+  _init_semaphore_dir
+  cleanup_stale_locks "$sem"
+  local in_use
+  in_use=$(count_active_slots "$sem")
+  local free=$((limit - in_use))
+  printf '{"name":"%s","active":%s,"max":%s,"available":%s}\n' \
+    "$sem" "$in_use" "$limit" "$free"
+}
+# === Force release all slots (emergency) ===
+# Delete every *.lock file of the named semaphore, regardless of age or owner.
+# Args:    name (default "default")
+# Returns: always 0
+force_release_all() {
+  local name="${1:-default}"
+  local lock_dir="$SEMAPHORE_DIR/$name"
+  if [[ -d "$lock_dir" ]]; then
+    rm -f "$lock_dir"/*.lock 2>/dev/null || true
+    _sem_log "Force released all slots for $name"
+  fi
+  # Do not let the logger's exit status become this function's status.
+  return 0
+}
+# === Trap handler for automatic cleanup ===
+# Release the slot held by this shell; errors and diagnostics are discarded.
+_semaphore_cleanup_trap() {
+  release_semaphore 2>/dev/null || true
+}
+# Install the cleanup handlers.
+# A trap on INT/TERM replaces the default "terminate" action, so a handler
+# that only cleans up would let the script keep running after the signal.
+# The signal handlers therefore exit with the conventional 128+signal status;
+# the EXIT trap then performs the release.
+_semaphore_register_traps() {
+  trap _semaphore_cleanup_trap EXIT
+  trap 'exit 130' INT
+  trap 'exit 143' TERM
+}
+# Register cleanup trap if sourced
+if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
+  _semaphore_register_traps
+fi

package/plugins/specweave/hooks/universal/fail-fast-wrapper.sh CHANGED Viewed

@@ -1,37 +1,70 @@
 #!/bin/bash
-# fail-fast-wrapper.sh - HARD TIMEOUT wrapper for all hooks
+# fail-fast-wrapper.sh - HARD TIMEOUT wrapper for all hooks with proper concurrency control
 # If ANY hook takes longer than HOOK_TIMEOUT, it gets KILLED.
 #
 # Usage: bash fail-fast-wrapper.sh <hook-script> [args...]
 #
 # Environment:
-#   HOOK_TIMEOUT - max seconds (default: 5)
-#   HOOK_DEBUG   - set to 1 for verbose logging
+#   HOOK_TIMEOUT        - max seconds (default: 5)
+#   HOOK_DEBUG          - set to 1 for verbose logging
+#   HOOK_MAX_CONCURRENT - max concurrent hooks (default: 15)
+#   HOOK_ACQUIRE_TIMEOUT_MS - semaphore acquire timeout (default: 3000)
 #
 # Exit behavior:
 #   - Returns hook output on success
 #   - Returns safe JSON on timeout ({"continue":true} or {"decision":"approve"})
 #   - NEVER hangs - timeout is enforced with SIGKILL
 #
-# CRASH PREVENTION:
-#   - Integrates with crash-prevention.sh for process storm detection
-#   - Auto-kills zombie processes on timeout
-#   - Records failures for circuit breaker
+# CONCURRENCY CONTROL (v1.0.30):
+#   - Semaphore-based concurrency limiting (NOT process storm detection)
+#   - Proper circuit breaker with CLOSED/OPEN/HALF_OPEN states
+#   - Structured logging with request tracing
+#   - Metrics collection for observability
 #
 # v0.33.0 - Enhanced with crash prevention integration
+# v1.0.30 - Complete rewrite with proper concurrency primitives
 set -o pipefail
 # === Configuration ===
 HOOK_TIMEOUT="${HOOK_TIMEOUT:-5}"  # 5 seconds - more than enough for any hook
 HOOK_DEBUG="${HOOK_DEBUG:-0}"
+HOOK_MAX_CONCURRENT="${HOOK_MAX_CONCURRENT:-15}"  # Max concurrent hooks
+HOOK_ACQUIRE_TIMEOUT_MS="${HOOK_ACQUIRE_TIMEOUT_MS:-3000}"  # 3 seconds to acquire semaphore
 LOG_FILE="${HOME}/.claude/hook-failures.log"
-# === Crash Prevention Integration ===
+# === Library paths ===
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-CRASH_PREVENTION="${SCRIPT_DIR}/../lib/crash-prevention.sh"
+LIB_DIR="${SCRIPT_DIR}/../lib"
-# Source crash prevention if available (non-blocking)
+# === Source libraries (fail gracefully if missing) ===
+SEMAPHORE_LOADED=false
+CIRCUIT_BREAKER_LOADED=false
+LOGGING_LOADED=false
+METRICS_LOADED=false
+# Set state dir for all libraries
+export SPECWEAVE_STATE_DIR="${SPECWEAVE_STATE_DIR:-.specweave/state}"
+export SPECWEAVE_LOG_DIR="${SPECWEAVE_LOG_DIR:-.specweave/logs/hooks}"
+if [[ -f "$LIB_DIR/semaphore.sh" ]]; then
+  source "$LIB_DIR/semaphore.sh" 2>/dev/null && SEMAPHORE_LOADED=true
+fi
+if [[ -f "$LIB_DIR/circuit-breaker.sh" ]]; then
+  source "$LIB_DIR/circuit-breaker.sh" 2>/dev/null && CIRCUIT_BREAKER_LOADED=true
+fi
+if [[ -f "$LIB_DIR/logging.sh" ]]; then
+  source "$LIB_DIR/logging.sh" 2>/dev/null && LOGGING_LOADED=true
+fi
+if [[ -f "$LIB_DIR/metrics.sh" ]]; then
+  source "$LIB_DIR/metrics.sh" 2>/dev/null && METRICS_LOADED=true
+fi
+# Legacy crash prevention (fallback)
+CRASH_PREVENTION="${LIB_DIR}/crash-prevention.sh"
 if [[ -f "$CRASH_PREVENTION" ]]; then
   source "$CRASH_PREVENTION" 2>/dev/null || true
 fi
@@ -44,7 +77,7 @@ log_debug() {
 log_failure() {
   local msg="$1"
   mkdir -p "$(dirname "$LOG_FILE")"
-  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] HOOK TIMEOUT: $msg" >> "$LOG_FILE"
+  echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] HOOK FAILURE: $msg" >> "$LOG_FILE"
 }
 # === Safe JSON output based on hook type ===
@@ -58,6 +91,12 @@ get_safe_output() {
   fi
 }
+# === Get hook name from script path ===
+# Print the script's base name with a trailing ".sh" removed; any "/" left in
+# the result is replaced by "-".
+# Args:    script - path to the hook script
+# Returns: basename's exit status if it fails, otherwise 0
+get_hook_name() {
+  local script="$1"
+  local base
+  # "--" stops option parsing so a path beginning with "-" is treated as a name.
+  base=$(basename -- "$script" .sh) || return
+  printf '%s\n' "${base//\//-}"
+}
 # === Read stdin with timeout ===
 # Critical: stdin can block forever if not handled properly
 read_stdin_with_timeout() {
@@ -92,18 +131,68 @@ main() {
     exit 0
   fi
-  # === CRASH PREVENTION: Process Storm Detection ===
-  # If too many hooks are running, skip this one to prevent cascade
-  if type detect_process_storm &>/dev/null; then
-    local storm_status
-    storm_status=$(detect_process_storm 25)
-    if [[ "$storm_status" == STORM* ]]; then
-      log_failure "$script - BLOCKED due to process storm: $storm_status"
+  local hook_name
+  hook_name=$(get_hook_name "$script")
+  # === Initialize libraries ===
+  if [[ "$LOGGING_LOADED" == "true" ]]; then
+    log_init "$hook_name"
+    log_debug "Starting hook execution" "script=$script"
+  fi
+  if [[ "$METRICS_LOADED" == "true" ]]; then
+    metrics_init "$hook_name"
+    metrics_start_request
+  fi
+  # === CIRCUIT BREAKER CHECK ===
+  # If circuit is open for this hook, fail fast
+  if [[ "$CIRCUIT_BREAKER_LOADED" == "true" ]]; then
+    if ! cb_allow_request "$hook_name"; then
+      local cb_status
+      cb_status=$(cb_get_status "$hook_name")
+      log_debug "Circuit breaker OPEN for $hook_name: $cb_status"
+      if [[ "$LOGGING_LOADED" == "true" ]]; then
+        log_warn "Circuit breaker open - failing fast" "hook=$hook_name"
+      fi
+      if [[ "$METRICS_LOADED" == "true" ]]; then
+        metrics_end_request "skipped"
+      fi
+      get_safe_output "$script"
+      exit 0
+    fi
+  fi
+  # === SEMAPHORE: Acquire concurrency slot ===
+  local semaphore_acquired=false
+  if [[ "$SEMAPHORE_LOADED" == "true" ]]; then
+    if acquire_semaphore "hooks" "$HOOK_MAX_CONCURRENT" "$HOOK_ACQUIRE_TIMEOUT_MS"; then
+      semaphore_acquired=true
+      log_debug "Acquired semaphore slot for $hook_name"
+    else
+      # Could not acquire semaphore in time - graceful degradation
+      log_debug "Semaphore timeout for $hook_name - graceful skip"
+      if [[ "$LOGGING_LOADED" == "true" ]]; then
+        log_warn "Semaphore acquisition timeout - graceful degradation" "hook=$hook_name" "max_concurrent=$HOOK_MAX_CONCURRENT"
+      fi
+      if [[ "$METRICS_LOADED" == "true" ]]; then
+        metrics_end_request "skipped"
+      fi
+      # DON'T record as failure - this is graceful degradation, not an error
       get_safe_output "$script"
       exit 0
     fi
   fi
+  # Ensure semaphore is released on exit
+  trap 'release_semaphore 2>/dev/null || true' EXIT INT TERM
   log_debug "Executing: $script (timeout: ${HOOK_TIMEOUT}s)"
   # Read stdin first (with its own timeout)
@@ -111,11 +200,8 @@ main() {
   stdin_content=$(read_stdin_with_timeout)
   # Execute the hook with hard timeout
-  # Using timeout with --kill-after to ensure SIGKILL if SIGTERM doesn't work
   local output
   local exit_code
-  # Create temp file for output (avoid subshell issues)
   local tmp_out
   tmp_out=$(mktemp)
@@ -156,12 +242,30 @@ main() {
   output=$(cat "$tmp_out" 2>/dev/null)
   rm -f "$tmp_out"
+  # Release semaphore immediately after execution
+  if [[ "$semaphore_acquired" == "true" ]]; then
+    release_semaphore
+  fi
   # Handle timeout (exit code 124 or 137)
   if [[ $exit_code -eq 124 ]] || [[ $exit_code -eq 137 ]]; then
     log_failure "$script - killed after ${HOOK_TIMEOUT}s"
     log_debug "TIMEOUT: $script killed after ${HOOK_TIMEOUT}s"
-    # === CRASH PREVENTION: Clean up potential zombie processes ===
+    if [[ "$LOGGING_LOADED" == "true" ]]; then
+      log_error "Hook timeout - killed after ${HOOK_TIMEOUT}s" "hook=$hook_name"
+    fi
+    if [[ "$METRICS_LOADED" == "true" ]]; then
+      metrics_end_request "timeout"
+    fi
+    # Record failure for circuit breaker
+    if [[ "$CIRCUIT_BREAKER_LOADED" == "true" ]]; then
+      cb_record_failure "$hook_name"
+    fi
+    # Clean up potential zombie processes (legacy)
     if type kill_zombie_heredocs &>/dev/null; then
       kill_zombie_heredocs 2>/dev/null || true
     fi
@@ -170,6 +274,36 @@ main() {
     exit 0
   fi
+  # Handle hook errors (non-zero exit, excluding block exit code 2)
+  if [[ $exit_code -ne 0 ]] && [[ $exit_code -ne 2 ]]; then
+    log_debug "Hook error: $script exited with $exit_code"
+    if [[ "$LOGGING_LOADED" == "true" ]]; then
+      log_warn "Hook exited with error" "hook=$hook_name" "exit_code=$exit_code"
+    fi
+    if [[ "$METRICS_LOADED" == "true" ]]; then
+      metrics_end_request "failure"
+    fi
+    if [[ "$CIRCUIT_BREAKER_LOADED" == "true" ]]; then
+      cb_record_failure "$hook_name"
+    fi
+  else
+    # Success!
+    if [[ "$METRICS_LOADED" == "true" ]]; then
+      metrics_end_request "success"
+    fi
+    if [[ "$CIRCUIT_BREAKER_LOADED" == "true" ]]; then
+      cb_record_success "$hook_name"
+    fi
+    if [[ "$LOGGING_LOADED" == "true" ]]; then
+      log_debug "Hook completed successfully" "hook=$hook_name"
+    fi
+  fi
   # Return output or safe default
   if [[ -n "$output" ]]; then
     echo "$output"