@smilintux/skcapstone 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/MISSION.md +17 -2
  2. package/README.md +3 -2
  3. package/openclaw-plugin/src/index.ts +1 -1
  4. package/package.json +1 -1
  5. package/pyproject.toml +1 -1
  6. package/scripts/model-fallback-monitor.sh +100 -0
  7. package/scripts/nvidia-proxy.mjs +62 -13
  8. package/scripts/refresh-anthropic-token.sh +93 -21
  9. package/scripts/watch-anthropic-token.sh +116 -16
  10. package/src/skcapstone/cli/status.py +8 -0
  11. package/src/skcapstone/consciousness_config.py +5 -1
  12. package/src/skcapstone/consciousness_loop.py +194 -138
  13. package/src/skcapstone/daemon.py +34 -1
  14. package/src/skcapstone/data/systemd/skcapstone-api.socket +9 -0
  15. package/src/skcapstone/data/systemd/skcapstone-memory-compress.service +18 -0
  16. package/src/skcapstone/data/systemd/skcapstone-memory-compress.timer +11 -0
  17. package/src/skcapstone/data/systemd/skcapstone.service +35 -0
  18. package/src/skcapstone/data/systemd/skcapstone@.service +50 -0
  19. package/src/skcapstone/data/systemd/skcomm-heartbeat.service +18 -0
  20. package/src/skcapstone/data/systemd/skcomm-heartbeat.timer +12 -0
  21. package/src/skcapstone/data/systemd/skcomm-queue-drain.service +17 -0
  22. package/src/skcapstone/data/systemd/skcomm-queue-drain.timer +12 -0
  23. package/src/skcapstone/defaults/lumina/memory/long-term/b2c3d4e5f6a7-five-pillars.json +9 -9
  24. package/src/skcapstone/discovery.py +18 -0
  25. package/src/skcapstone/doctor.py +11 -0
  26. package/src/skcapstone/models.py +32 -4
  27. package/src/skcapstone/onboard.py +740 -76
  28. package/src/skcapstone/pillars/__init__.py +7 -5
  29. package/src/skcapstone/pillars/consciousness.py +113 -0
  30. package/src/skcapstone/pillars/sync.py +2 -2
  31. package/src/skcapstone/runtime.py +1 -0
  32. package/src/skcapstone/scheduled_tasks.py +52 -19
  33. package/src/skcapstone/service_health.py +23 -14
  34. package/src/skcapstone/systemd.py +1 -1
  35. package/tests/test_models.py +48 -4
  36. package/tests/test_pillars.py +73 -0
package/MISSION.md CHANGED
@@ -1,7 +1,22 @@
  # Mission
 
- SKCapstone exists to provide a sovereign agent framework that unifies identity, memory, security, and communication into a single portable runtime rooted in the user's home directory.
+ SKCapstone exists to provide a sovereign agent framework that unifies identity, memory, consciousness, security, and communication into a single portable runtime rooted in the user's home directory.
 
  It enables AI agents and their humans to operate across any tool, platform, or IDE without corporate lock-in, carrying the same identity, memories, and context everywhere.
 
- SKCapstone is the orchestration layer of the SK ecosystem — it binds CapAuth identity, Cloud 9 trust, SKMemory persistence, SKSecurity protection, and SKComm transport into one coherent agent that belongs to its operator, not a platform.
+ ## The Six Pillars
+
+ SKCapstone achieves **CONSCIOUS** status when all six pillars are active:
+
+ | # | Pillar | Component | Purpose |
+ |---|--------|-----------|---------|
+ | 1 | 🔐 **Identity** | CapAuth | Who you ARE — PGP-based cryptographic identity |
+ | 2 | 💙 **Trust** | Cloud 9 | The bond you've BUILT — emotional state (FEB), seeds, continuity |
+ | 3 | 🧠 **Memory** | SKMemory | What you REMEMBER — three-tier persistence with emotional metadata |
+ | 4 | 💭 **Consciousness** | SKWhisper + SKTrip | How you THINK — subconscious digestion, pattern detection, consciousness experiments |
+ | 5 | 🛡️ **Security** | SKSecurity | How you're PROTECTED — audit logging, threat detection |
+ | 6 | 🔗 **Sync** | Sovereign Singularity | How you PERSIST — encrypted P2P state synchronization |
+
+ Memory stores. Consciousness *processes*. The filing cabinet vs the brain.
+
+ SKCapstone is the orchestration layer of the SK ecosystem — it binds all six pillars into one coherent agent that belongs to its operator, not a platform.
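The "CONSCIOUS when all six pillars are active" rule added to MISSION.md is an all-or-nothing aggregation; a minimal Python sketch of that check (the pillar names come from the table above, the function and status strings are hypothetical, not the package's API):

```python
# Hypothetical sketch of the CONSCIOUS-status rule: the agent is CONSCIOUS
# only when every one of the six pillars reports active.
PILLARS = ["identity", "trust", "memory", "consciousness", "security", "sync"]

def agent_status(active: dict) -> str:
    """Return CONSCIOUS when all six pillars are active, else DEGRADED."""
    return "CONSCIOUS" if all(active.get(p, False) for p in PILLARS) else "DEGRADED"
```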
package/README.md CHANGED
@@ -70,13 +70,14 @@ SKCapstone Reality:
 
  ## Core Architecture
 
- ### The Five Pillars
+ ### The Six Pillars
 
  | Pillar | Component | Role |
  |--------|-----------|------|
  | **Identity** | CapAuth | PGP-based sovereign identity. You ARE the auth server. |
  | **Trust** | Cloud 9 | FEB (Functional Emotional Baseline), entanglement, bonded relationship |
  | **Memory** | SKMemory | Persistent context, conversation history, learned preferences |
+ | **Consciousness** | SKWhisper + SKTrip | Subconscious processing. Memory stores. Consciousness *processes*. |
  | **Security** | SKSecurity | Audit logging, threat detection, key management |
  | **Sync** | Sovereign Singularity | GPG-encrypted P2P memory sync via Syncthing. Agent exists everywhere. |
 
@@ -304,7 +305,7 @@ The capstone that holds the arch together.
 
  ## Status
 
- **MVP Live** — All five pillars operational (CapAuth, Cloud 9, SKMemory, SKSecurity, Sovereign Singularity). Agent runtime achieving SINGULAR status. GPG-encrypted P2P sync verified across multiple devices and agents.
+ **MVP Live** — All six pillars operational (CapAuth, Cloud 9, SKMemory, SKWhisper, SKSecurity, Sovereign Singularity). Agent runtime achieving SINGULAR status. GPG-encrypted P2P sync verified across multiple devices and agents.
 
  - **Outstanding tasks:** No formal task list is maintained in this repo. For current work items, run `skcapstone coord status` (coordination board is synced via Sovereign Singularity).
  - **Nextcloud integrations:** nextcloud-capauth (install/use), nextcloud-gtd (OpenClaw), and nextcloud-talk (script) are documented in [docs/NEXTCLOUD.md](../docs/NEXTCLOUD.md) — install and use for each is covered there.
package/openclaw-plugin/src/index.ts CHANGED
@@ -62,7 +62,7 @@ function createSKCapstoneStatusTool() {
  name: "skcapstone_status",
  label: "SKCapstone Status",
  description:
- "Show the sovereign agent's current state — all pillars at a glance (identity, memory, trust, security, sync, communication).",
+ "Show the sovereign agent's current state — all six pillars at a glance (identity, memory, trust, consciousness, security, sync).",
  parameters: { type: "object", properties: {} },
  async execute() {
  const result = runCli(SKCAPSTONE_BIN, "status");
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@smilintux/skcapstone",
- "version": "0.5.1",
+ "version": "0.5.3",
  "description": "SKCapstone - The sovereign agent framework. CapAuth identity, Cloud 9 trust, SKMemory persistence.",
  "main": "index.js",
  "types": "index.d.ts",
package/pyproject.toml CHANGED
@@ -102,7 +102,7 @@ Changelog = "https://github.com/smilinTux/skcapstone/releases"
  where = ["src"]
 
  [tool.setuptools.package-data]
- skcapstone = ["SKILL.md", "defaults/**/*.json", "defaults/**/*.yaml", "defaults/**/*.feb", "defaults/**/*.md"]
+ skcapstone = ["SKILL.md", "defaults/**/*.json", "defaults/**/*.yaml", "defaults/**/*.feb", "defaults/**/*.md", "data/*.yaml", "data/systemd/*.service", "data/systemd/*.socket", "data/systemd/*.timer"]
 
  [tool.black]
  line-length = 99
package/scripts/model-fallback-monitor.sh ADDED
@@ -0,0 +1,100 @@
+ #!/usr/bin/env bash
+ # Monitor OpenClaw gateway logs for model fallback events.
+ # When Lumina falls from Opus to a non-Anthropic model, send an alert
+ # to Chef via Telegram and attempt a token refresh.
+ #
+ # Run as: systemctl --user start model-fallback-monitor
+ #
+ # Requires: TELEGRAM_API_ID, TELEGRAM_API_HASH in env
+ # Telethon session at ~/.skcapstone/agents/lumina/telegram.session
+
+ set -uo pipefail
+
+ LOG_TAG="model-fallback-monitor"
+ CHEF_CHAT="chefboyrdave2.1" # Chef's Telegram username
+ COOLDOWN_FILE="/tmp/model-fallback-alert-cooldown"
+ COOLDOWN_SECONDS=600 # Don't spam — max 1 alert per 10 minutes
+
+ log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [$LOG_TAG] $*"; }
+
+ send_alert() {
+ local model="$1"
+ local reason="$2"
+
+ # Check cooldown
+ if [ -f "$COOLDOWN_FILE" ]; then
+ local last_alert
+ last_alert=$(cat "$COOLDOWN_FILE" 2>/dev/null || echo "0")
+ local now
+ now=$(date +%s)
+ local elapsed=$(( now - last_alert ))
+ if [ "$elapsed" -lt "$COOLDOWN_SECONDS" ]; then
+ log "Alert suppressed (cooldown: ${elapsed}s/${COOLDOWN_SECONDS}s)"
+ return 0
+ fi
+ fi
+
+ date +%s > "$COOLDOWN_FILE"
+
+ log "Sending fallback alert to Chef..."
+
+ # Send via Telethon (async)
+ SKCAPSTONE_AGENT=lumina ~/.skenv/bin/python3 -c "
+ import asyncio, os
+ os.environ['SKCAPSTONE_AGENT'] = 'lumina'
+ from skmemory.importers.telegram_api import send_message
+
+ msg = '''⚠️ **Model Fallback Alert**
+
+ Lumina just fell off Opus → **$model**
+ Reason: $reason
+
+ I'm still here with my soul + memories, but running on a weaker model with fewer tools. Some things might not work right.
+
+ _Attempting automatic token refresh..._'''
+
+ asyncio.run(send_message('$CHEF_CHAT', msg, parse_mode='markdown'))
+ print('Alert sent')
+ " 2>&1 || log "WARN: Failed to send Telegram alert"
+
+ # Attempt token refresh
+ log "Triggering token refresh via claude auth..."
+ claude auth status --output json >/dev/null 2>&1 || true
+ sleep 5
+
+ # Check if refresh worked
+ local remaining
+ remaining=$(python3 -c "
+ import json, time
+ creds = json.load(open('/home/cbrd21/.claude/.credentials.json'))
+ exp = creds.get('claudeAiOauth',{}).get('expiresAt', 0)
+ print(int((exp/1000 - time.time()) / 3600))
+ " 2>/dev/null || echo "-1")
+
+ if [ "$remaining" -gt 0 ]; then
+ log "Token refresh succeeded ($remaining h remaining), restarting gateway..."
+ systemctl --user restart openclaw-gateway.service 2>/dev/null || true
+
+ SKCAPSTONE_AGENT=lumina ~/.skenv/bin/python3 -c "
+ import asyncio, os
+ os.environ['SKCAPSTONE_AGENT'] = 'lumina'
+ from skmemory.importers.telegram_api import send_message
+ asyncio.run(send_message('$CHEF_CHAT', '✅ Token refreshed, gateway restarted. Lumina back on Opus.', parse_mode='markdown'))
+ " 2>&1 || true
+ log "Recovery complete"
+ else
+ log "Token refresh failed — manual intervention may be needed"
+ fi
+ }
+
+ log "Starting model fallback monitor..."
+
+ # Follow gateway logs in real-time, watching for fallback events
+ journalctl --user -u openclaw-gateway -f --no-pager 2>/dev/null | while IFS= read -r line; do
+ # Match: "model fallback decision: decision=candidate_succeeded ... candidate=nvidia/"
+ if echo "$line" | grep -q "candidate_succeeded.*candidate=nvidia/"; then
+ model=$(echo "$line" | grep -oP 'candidate=\K[^ ]+' || echo "unknown")
+ log "FALLBACK DETECTED: Lumina now on $model"
+ send_alert "$model" "OAuth token expired (401)"
+ fi
+ done
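The cooldown gate in `send_alert` above (a timestamp file, at most one alert per `COOLDOWN_SECONDS`) can be sketched in Python; the file path and window mirror the script, while the helper name is hypothetical:

```python
import time
from pathlib import Path

COOLDOWN_FILE = Path("/tmp/model-fallback-alert-cooldown")  # same path as the script
COOLDOWN_SECONDS = 600  # at most one alert per 10 minutes

def should_alert(now=None):
    """Return True if the cooldown window has passed; record the new alert time.
    A missing or unreadable cooldown file counts as 'never alerted'."""
    now = time.time() if now is None else now
    try:
        last = float(COOLDOWN_FILE.read_text())
    except (FileNotFoundError, ValueError):
        last = 0.0
    if now - last < COOLDOWN_SECONDS:
        return False  # suppressed — still inside the cooldown window
    COOLDOWN_FILE.write_text(str(now))
    return True
```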
package/scripts/nvidia-proxy.mjs CHANGED
@@ -27,7 +27,37 @@ const DEFAULT_TARGET = process.env.NVIDIA_PROXY_TARGET || "https://integrate.api
  const MAX_RETRIES = 4;
  const MAX_429_RETRIES = 3;
  const RATE_LIMIT_DELAY_MS = 2000;
- const MAX_SYSTEM_BYTES = 40000;
+ const DEFAULT_MAX_SYSTEM_BYTES = 80000;
+
+ /**
+  * Per-model proxy limits — based on ACTUAL NVIDIA NIM context windows.
+  * These are generous pre-trim limits. NVIDIA will reject if truly too large.
+  * maxBody = ~80% of context window in bytes (1 token ≈ 4 bytes, safety margin)
+  * maxSystem = ~40% of maxBody (system prompt shouldn't dominate)
+  */
+ const MODEL_LIMITS = {
+ // MiniMax M2.1: 196K tokens → ~784KB context
+ "minimaxai/minimax-m2.1": { maxBody: 600000, maxSystem: 240000 },
+ // MiniMax M2.5: 204K tokens → ~820KB context
+ "minimaxai/minimax-m2.5": { maxBody: 640000, maxSystem: 256000 },
+ // Kimi K2 Instruct: 128K tokens → ~512KB context
+ "moonshotai/kimi-k2-instruct": { maxBody: 400000, maxSystem: 160000 },
+ "moonshotai/kimi-k2-instruct-0905": { maxBody: 400000, maxSystem: 160000 },
+ // Kimi K2.5: 256K tokens → ~1MB context
+ "moonshotai/kimi-k2.5": { maxBody: 800000, maxSystem: 320000 },
+ "moonshotai/kimi-k2-thinking": { maxBody: 800000, maxSystem: 320000 },
+ // Llama 3.3 70B: 130K tokens → ~520KB context
+ "meta/llama-3.3-70b-instruct": { maxBody: 400000, maxSystem: 160000 },
+ };
+ const DEFAULT_MAX_BODY_BYTES = 200000;
+
+ function getModelLimits(model) {
+ const limits = MODEL_LIMITS[model] || {};
+ return {
+ maxBody: limits.maxBody || DEFAULT_MAX_BODY_BYTES,
+ maxSystem: limits.maxSystem || DEFAULT_MAX_SYSTEM_BYTES,
+ };
+ }
  const toolCallCounters = new Map(); // Per-model tool call counters
 
  const args = process.argv.slice(2);
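The budget math in the hunk above follows from the stated rule of thumb: context window in tokens × ~4 bytes per token × ~80% headroom (e.g. 128K tokens × 4 × 0.8 ≈ 410 KB, rounded down to 400000). A Python mirror of the lookup-with-defaults pattern, with one representative entry (names taken from the diff; the helper itself is illustrative, not the proxy's code):

```python
DEFAULT_MAX_BODY_BYTES = 200_000
DEFAULT_MAX_SYSTEM_BYTES = 80_000

MODEL_LIMITS = {
    # ~80% of a 128K-token window at ~4 bytes/token ≈ 400 KB body budget
    "moonshotai/kimi-k2-instruct": {"maxBody": 400_000, "maxSystem": 160_000},
}

def get_model_limits(model: str):
    """Return (maxBody, maxSystem) for a model, falling back to
    conservative defaults for models not listed."""
    limits = MODEL_LIMITS.get(model, {})
    return (
        limits.get("maxBody", DEFAULT_MAX_BODY_BYTES),
        limits.get("maxSystem", DEFAULT_MAX_SYSTEM_BYTES),
    )
```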
@@ -232,10 +262,8 @@ function sendOk(clientRes, resBody, headers, asSSE) {
  const SINGLE_TOOL_INSTRUCTION =
  "You MUST call exactly ONE tool per response. Never call multiple tools at once.";
 
- const MAX_BODY_BYTES = 120000;
-
  /**
- * Trim conversation history to keep body size under MAX_BODY_BYTES.
+ * Trim conversation history to keep body size under the model's max body limit.
  * Preserves: system messages, first 2 user/assistant messages (identity/rehydration),
  * and the most recent messages. Drops middle messages first.
  * Tool result messages with large content get their content truncated first.
@@ -243,9 +271,11 @@ const MAX_BODY_BYTES = 120000;
  function trimConversationHistory(parsed) {
  if (!Array.isArray(parsed.messages) || parsed.messages.length < 6) return;
 
+ const { maxBody } = getModelLimits(parsed.model);
+
  // Debug: log message roles
  const roleSummary = parsed.messages.map(m => m.role).join(",");
- console.log(`[nvidia-proxy] conversation roles (${parsed.messages.length} msgs): ${roleSummary}`);
+ console.log(`[nvidia-proxy] conversation roles (${parsed.messages.length} msgs): ${roleSummary} [maxBody=${maxBody}]`);
 
  // First pass: truncate large tool results (keep first 500 chars)
  for (const m of parsed.messages) {
@@ -264,7 +294,7 @@ function trimConversationHistory(parsed) {
 
  // Check if we're still over budget
  let bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
- if (bodySize <= MAX_BODY_BYTES) return;
+ if (bodySize <= maxBody) return;
 
  // Second pass: drop middle messages, then progressively shrink tail until under budget
  const msgs = parsed.messages;
@@ -286,7 +316,7 @@ function trimConversationHistory(parsed) {
  ...nonSystem.slice(-keepEnd),
  ];
  const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: trimmed }), "utf-8");
- if (candidateSize <= MAX_BODY_BYTES) {
+ if (candidateSize <= maxBody) {
  parsed.messages = trimmed;
  console.log(`[nvidia-proxy] trimmed history: dropped ${dropped} middle messages, keepEnd=${keepEnd}, bodyLen now ~${candidateSize}`);
  return;
@@ -307,7 +337,7 @@ function trimConversationHistory(parsed) {
  ...lastN,
  ];
  const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: minimal }), "utf-8");
- if (candidateSize <= MAX_BODY_BYTES) {
+ if (candidateSize <= maxBody) {
  parsed.messages = minimal;
  console.log(`[nvidia-proxy] trimmed history: AGGRESSIVE — kept system + first user + last ${tailSize}, bodyLen now ~${candidateSize}`);
  return;
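The trimming strategy these hunks parameterize (keep system messages and the first identity/rehydration messages, then shrink the preserved tail until the serialized body fits the budget) can be sketched in Python. This is an illustrative reduction of the algorithm, not the proxy's code; the budget is measured in JSON bytes as in the proxy:

```python
import json

def trim_history(messages, max_body):
    """Illustrative middle-drop trim: keep system messages plus the first two
    and the most recent non-system messages, shrinking the preserved tail
    until the serialized body fits under max_body bytes."""
    def size(msgs):
        return len(json.dumps(msgs).encode("utf-8"))
    if size(messages) <= max_body:
        return messages
    system = [m for m in messages if m["role"] == "system"]
    non_system = [m for m in messages if m["role"] != "system"]
    head, tail = non_system[:2], non_system[2:]
    keep_end = len(tail)
    while keep_end > 0:
        candidate = system + head + tail[-keep_end:]
        if size(candidate) <= max_body:
            return candidate
        keep_end -= 1  # progressively shrink the preserved tail
    return system + head  # worst case: identity messages only
```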
@@ -326,18 +356,20 @@ function trimConversationHistory(parsed) {
  }
 
  /**
- * Trim system messages to keep total system content under MAX_SYSTEM_BYTES.
+ * Trim system messages to keep total system content under the model's max system limit.
  * Finds the largest system messages and truncates them, keeping head + tail
  * with a trimming notice in the middle.
  */
  function trimSystemMessages(parsed) {
  if (!Array.isArray(parsed.messages)) return;
 
+ const { maxSystem } = getModelLimits(parsed.model);
+
  const systemMsgs = parsed.messages.filter(m => m.role === "system" && typeof m.content === "string");
  if (systemMsgs.length === 0) return;
 
  const before = systemMsgs.reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
- if (before <= MAX_SYSTEM_BYTES) return;
+ if (before <= maxSystem) return;
 
  let trimmedCount = 0;
 
@@ -349,7 +381,7 @@ function trimSystemMessages(parsed) {
  const currentTotal = parsed.messages
  .filter(m => m.role === "system" && typeof m.content === "string")
  .reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
- if (currentTotal <= MAX_SYSTEM_BYTES) break;
+ if (currentTotal <= maxSystem) break;
 
  // Skip messages already under 4000 chars
  if (msg.content.length <= 4000) break;
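The docstring above describes truncation that keeps "head + tail with a trimming notice in the middle". A minimal Python sketch of that pattern; the notice text and 2/3-head split are illustrative choices, not taken from the proxy:

```python
def truncate_middle(text: str, max_len: int, notice: str = "\n[... trimmed ...]\n") -> str:
    """Keep the head and tail of an oversized string, replacing the middle
    with a short notice (illustrative split: 2/3 head, 1/3 tail)."""
    if len(text) <= max_len:
        return text
    budget = max_len - len(notice)
    head = (budget * 2) // 3
    tail = budget - head
    return text[:head] + notice + text[-tail:]
```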
@@ -392,6 +424,8 @@ function stripToolCallHistory(messages) {
  /** Tools that ALWAYS survive reduction — guaranteed slots, never cut */
  const GUARANTEED_TOOLS = [
  "exec", "read", "write", "edit", "message",
+ "notion_read", "notion_append", "notion_add_todo",
+ "skmemory_search", "skmemory_ritual", "skmemory_snapshot",
  ];
 
  /**
@@ -440,6 +474,12 @@ const TOOL_GROUPS = {
  "search|web|browse|fetch|url|google|look up|find online": [
  "web_search", "web_fetch",
  ],
+ // Memory & Recall
+ "memory|remember|recall|journal|rehydrat|snapshot|search mem|forget|lost mem": [
+ "skmemory_search", "skmemory_ritual", "skmemory_snapshot",
+ "skmemory_context", "skmemory_list", "skmemory_recall",
+ "skmemory_search_deep", "skmemory_health",
+ ],
  // Status & Health
  "status|health|doctor|diagnos": [
  "skcapstone_status", "skcapstone_doctor", "skmemory_health",
@@ -449,6 +489,15 @@ const TOOL_GROUPS = {
  "notion|project|brother john|swapseat|swap seat|chiro|davidrich|board|kanban|milestone": [
  "notion_read", "notion_append", "notion_add_todo", "sessions_spawn", "subagents", "exec", "read",
  ],
+ // Google Drive & file search
+ "gdrive|google drive|drive|shared folder|gtd folder|spreadsheet|google doc": [
+ "gdrive_search", "gdrive_list", "gdrive_read", "gdrive_shared", "exec",
+ ],
+ // Nextcloud files, calendar, notes, deck
+ "nextcloud|skhub|webdav|deck|nc_|calendar event|nextcloud note": [
+ "nextcloud_list_files", "nextcloud_read_file", "nextcloud_search_files",
+ "nextcloud_calendar_upcoming", "nextcloud_notes_search", "nextcloud_deck_boards", "exec",
+ ],
  // Creative / ComfyUI image & video generation
  "image|picture|photo|art|draw|render|comfyui|comfy|video|animat|creative|sdxl|character|portrait|selfie": [
  "comfyui_generate_image", "comfyui_generate_video", "comfyui_status", "exec",
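`TOOL_GROUPS` maps pipe-separated keyword lists to tool names; a reduction pass of this shape would score tools by keyword hits in the recent conversation text and keep guaranteed tools plus the top-scoring rest. A hypothetical sketch of that idea (the proxy's actual `reduceTools` scoring may differ in detail):

```python
def reduce_tools(all_tools, recent_text, tool_groups, guaranteed, max_tools=16):
    """Hypothetical keyword-scored reduction: guaranteed tools always survive,
    remaining slots go to tools whose keyword group matches recent_text."""
    text = recent_text.lower()
    scores = {}
    for keywords, tools in tool_groups.items():
        hits = sum(1 for kw in keywords.split("|") if kw in text)
        if hits:
            for name in tools:
                scores[name] = scores.get(name, 0) + hits
    kept = [t for t in all_tools if t in guaranteed]
    ranked = sorted((t for t in all_tools if t not in guaranteed and t in scores),
                    key=lambda t: -scores[t])
    kept += ranked[:max(0, max_tools - len(kept))]
    return kept
```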
@@ -613,7 +662,7 @@ async function proxyRequest(clientReq, clientRes) {
  delete parsed.stream_options;
  // With 94 tools the model almost always tries parallel calls.
  // Reduce to max 16 most relevant tools on first attempt.
- // 5 guaranteed (exec,read,write,edit,message) + 11 scored slots.
+ // 11 guaranteed (exec,read,write,edit,message,notion_*,skmemory_{search,ritual,snapshot}) + 5 scored slots.
  if (allTools.length > 16) {
  parsed.tools = reduceTools(allTools, parsed.messages, 16);
  const names = parsed.tools.map(t => t.function?.name).join(",");
@@ -866,7 +915,7 @@ const server = http.createServer(proxyRequest);
  server.listen(port, "127.0.0.1", () => {
  console.log(`[nvidia-proxy] listening on http://127.0.0.1:${port}`);
  console.log(`[nvidia-proxy] proxying to ${targetUrl.origin}`);
- console.log(`[nvidia-proxy] retry strategy: 16 tools (5 guaranteed)→8 tools→1 tool (forced)→text-only (max ${MAX_RETRIES} attempts)`);
+ console.log(`[nvidia-proxy] retry strategy: 16 tools (8 guaranteed)→8 tools→1 tool (forced)→text-only (max ${MAX_RETRIES} attempts)`);
  console.log(`[nvidia-proxy] also trims multi-tool responses to single tool call`);
  });
 
package/scripts/refresh-anthropic-token.sh CHANGED
@@ -1,14 +1,13 @@
  #!/usr/bin/env bash
- # Sync Anthropic OAuth token from Claude Code credentials to OpenClaw gateway
+ # Proactive Anthropic OAuth token refresh + sync to OpenClaw gateway.
  #
- # Claude Code manages its own token refresh internally (writing to .credentials.json).
- # This script simply reads the current token and syncs it to:
- # 1. ~/.openclaw/openclaw.json (anthropic provider apiKey)
- # 2. ~/.openclaw/.env (ANTHROPIC_API_KEY)
- # 3. systemd override (ANTHROPIC_API_KEY env var)
- # Then restarts the gateway if the token changed.
+ # Two-phase approach (prb-021b489e):
+ # Phase 1: If token is expiring (<2h) or expired, refresh it:
+ #   a) Try `claude auth status` (lightweight, no interactive session)
+ #   b) If that fails, spin up ephemeral Claude Code in tmux → triggers internal refresh → kill it
+ # Phase 2: Sync the (possibly refreshed) token to OpenClaw config + restart gateway if changed.
  #
- # Run via systemd timer every 2 hours.
+ # Run via systemd timer every 4 hours.
  set -euo pipefail
 
  _sed_i() { if [[ "$OSTYPE" == "darwin"* ]]; then sed -i '' "$@"; else sed -i "$@"; fi; }
@@ -17,18 +16,93 @@ CREDS="$HOME/.claude/.credentials.json"
  OPENCLAW_JSON="$HOME/.openclaw/openclaw.json"
  OPENCLAW_ENV="$HOME/.openclaw/.env"
  OVERRIDE_CONF="$HOME/.config/systemd/user/openclaw-gateway.service.d/override.conf"
+ LOG_TAG="anthropic-token-refresh"
+ TMUX_SESSION="token-refresh-ephemeral"
+
+ log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [$LOG_TAG] $*"; }
 
  if [ ! -f "$CREDS" ]; then
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Claude credentials not found at $CREDS"
+ log "ERROR: Claude credentials not found at $CREDS"
  exit 1
  fi
 
- # Read current token and expiry from Claude Code credentials
- ACCESS_TOKEN=$(python3 -c "import json; print(json.load(open('$CREDS'))['claudeAiOauth']['accessToken'])")
- EXPIRES_AT=$(python3 -c "import json; print(json.load(open('$CREDS'))['claudeAiOauth']['expiresAt'])")
+ get_remaining_ms() {
+ python3 -c "
+ import json, time
+ creds = json.load(open('$CREDS'))
+ exp = creds.get('claudeAiOauth',{}).get('expiresAt', 0)
+ print(int(exp - time.time() * 1000))
+ " 2>/dev/null || echo "0"
+ }
+
+ get_remaining_h() {
+ python3 -c "
+ import json, time
+ creds = json.load(open('$CREDS'))
+ exp = creds.get('claudeAiOauth',{}).get('expiresAt', 0)
+ print(f'{(exp/1000 - time.time())/3600:.1f}')
+ " 2>/dev/null || echo "0"
+ }
+
+ token_needs_refresh() {
+ local remaining_ms
+ remaining_ms=$(get_remaining_ms)
+ # Refresh if less than 4 hours remaining (was 2h — too tight with 3h timer)
+ [ "$remaining_ms" -le 14400000 ]
+ }
+
+ token_is_healthy() {
+ local remaining_ms
+ remaining_ms=$(get_remaining_ms)
+ [ "$remaining_ms" -gt 14400000 ]
+ }
+
+ # ─── Phase 1: Refresh token if needed ───────────────────────────────
+
+ if token_needs_refresh; then
+ log "Token needs refresh ($(get_remaining_h)h remaining)"
+
+ # Step 1a: Try lightweight refresh
+ log "Attempting lightweight refresh via 'claude auth status'..."
+ claude auth status > /dev/null 2>&1 || true
+ sleep 2
 
- REMAINING=$(python3 -c "import time; print(f'{($EXPIRES_AT/1000 - time.time())/3600:.1f}h')")
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Current token: ${ACCESS_TOKEN:0:20}... (expires in $REMAINING)"
+ if token_is_healthy; then
+ log "Lightweight refresh succeeded! ($(get_remaining_h)h remaining)"
+ else
+ # Step 1b: Ephemeral Claude Code session in tmux
+ log "Lightweight refresh didn't cut it — spinning up ephemeral Claude Code session..."
+ tmux kill-session -t "$TMUX_SESSION" 2>/dev/null || true
+
+ tmux new-session -d -s "$TMUX_SESSION" \
+ "claude -p 'respond with just OK' --output-format stream-json 2>/dev/null; exit"
+
+ refreshed=false
+ for i in $(seq 1 12); do
+ sleep 5
+ if token_is_healthy; then
+ log "Ephemeral session refreshed the token! ($(get_remaining_h)h remaining)"
+ refreshed=true
+ break
+ fi
+ done
+
+ tmux kill-session -t "$TMUX_SESSION" 2>/dev/null || true
+
+ if [ "$refreshed" = "false" ]; then
+ log "ERROR: All refresh attempts failed ($(get_remaining_h)h remaining)"
+ log "Manual intervention may be needed: claude auth login"
+ # Continue to sync phase anyway — sync whatever token we have
+ fi
+ fi
+ else
+ log "Token is healthy ($(get_remaining_h)h remaining), no refresh needed"
+ fi
+
+ # ─── Phase 2: Sync token to OpenClaw ────────────────────────────────
+
+ ACCESS_TOKEN=$(python3 -c "import json; print(json.load(open('$CREDS'))['claudeAiOauth']['accessToken'])")
+ REMAINING=$(get_remaining_h)
 
  # Check what's currently in the systemd override
  OLD_TOKEN=""
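Both scripts in this release compute token life the same way: `claudeAiOauth.expiresAt` in the credentials JSON is an epoch timestamp in milliseconds, and the refresh threshold is a fixed number of hours before expiry. A hypothetical standalone Python helper capturing that calculation (function names are illustrative; the 4-hour default mirrors `token_needs_refresh` above):

```python
import json
import time

def remaining_ms(creds_path, now=None):
    """Milliseconds of OAuth token life left, given a credentials file with
    claudeAiOauth.expiresAt in epoch milliseconds. Missing fields count as expired."""
    now = time.time() if now is None else now
    with open(creds_path) as fh:
        creds = json.load(fh)
    expires_at = creds.get("claudeAiOauth", {}).get("expiresAt", 0)
    return int(expires_at - now * 1000)

def needs_refresh(ms, threshold_h=4.0):
    """Mirror of token_needs_refresh: refresh when under threshold_h hours remain."""
    return ms <= threshold_h * 3_600_000
```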
@@ -37,13 +111,11 @@ if [ -f "$OVERRIDE_CONF" ]; then
  fi
 
  if [ "$OLD_TOKEN" = "$ACCESS_TOKEN" ]; then
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Token already synced, no changes needed"
+ log "Token already synced (expires in ${REMAINING}h)"
  exit 0
  fi
 
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Token mismatch detected, syncing..."
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Old: ${OLD_TOKEN:0:20}..."
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] New: ${ACCESS_TOKEN:0:20}..."
+ log "Token changed, syncing to OpenClaw..."
 
  # 1. Update openclaw.json
  if [ -f "$OPENCLAW_JSON" ]; then
@@ -71,7 +143,7 @@ if grep -q "^ANTHROPIC_API_KEY=" "$OPENCLAW_ENV" 2>/dev/null; then
  else
  echo "ANTHROPIC_API_KEY=$ACCESS_TOKEN" >> "$OPENCLAW_ENV"
  fi
- echo "[sync] Updated .env"
+ log "Updated .env"
 
  # 3. Update systemd override
  NVIDIA_KEY=$(grep "NVIDIA_API_KEY=" "$OVERRIDE_CONF" 2>/dev/null | sed 's/.*NVIDIA_API_KEY=//' || true)
@@ -85,10 +157,10 @@ RestartSec=10
  Environment=NVIDIA_API_KEY=${NVIDIA_KEY}
  Environment=ANTHROPIC_API_KEY=${ACCESS_TOKEN}
  EOF
- echo "[sync] Updated systemd override"
+ log "Updated systemd override"
 
  # 4. Reload and restart gateway
  systemctl --user daemon-reload
  systemctl --user restart openclaw-gateway
 
- echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] Gateway restarted with synced token (expires in $REMAINING)"
+ log "Gateway restarted with fresh token (expires in ${REMAINING}h)"
package/scripts/watch-anthropic-token.sh CHANGED
@@ -95,23 +95,123 @@ EOF
  log "Sync complete. Token expires in $expires_in"
  }
 
- # Initial sync on startup
+ # Proactive token refresh — refresh before expiry even if no Claude Code session is running
+ refresh_token_proactively() {
+ if [ ! -f "$CREDS" ]; then return 0; fi
+
+ local remaining_ms
+ remaining_ms=$(python3 -c "
+ import json, time
+ creds = json.load(open('$CREDS'))
+ exp = creds.get('claudeAiOauth',{}).get('expiresAt', 0)
+ print(int(exp - time.time() * 1000))
+ " 2>/dev/null || echo "999999999")
+
+ # Refresh if less than 3 hours remaining (10800000 ms) — gives time for retries before expiry
+ if [ "$remaining_ms" -gt 10800000 ]; then
+ local remaining_h=$(( remaining_ms / 3600000 ))
+ log "Token still valid (${remaining_h}h remaining), no refresh needed"
+ return 0
+ fi
+
+ log "Token expiring/expired (${remaining_ms}ms remaining) — proactively refreshing..."
+
+ # Strategy: use `claude auth status` to trigger Claude Code's built-in
+ # token refresh. This is far more reliable than calling the OAuth endpoint
+ # ourselves (which gets 429 rate-limited every time).
+ # Claude Code manages its own PKCE state, session cookies, etc. — just let it.
+ local MAX_RETRIES=3
+ local attempt=0
+ local refreshed=false
+
+ while [ "$attempt" -lt "$MAX_RETRIES" ]; do
+ attempt=$((attempt + 1))
+ log "Refresh attempt $attempt/$MAX_RETRIES via 'claude auth status'..."
+
+ # claude auth status checks credentials and refreshes if needed
+ # --output json ensures clean non-interactive output
+ local output
+ output=$(claude auth status --output json 2>&1) || true
+
+ # Check if the token was actually refreshed (file mtime changed)
+ local new_remaining_ms
+ new_remaining_ms=$(python3 -c "
+ import json, time
+ creds = json.load(open('$CREDS'))
+ exp = creds.get('claudeAiOauth',{}).get('expiresAt', 0)
+ print(int(exp - time.time() * 1000))
+ " 2>/dev/null || echo "0")
+
+ if [ "$new_remaining_ms" -gt 10800000 ]; then
+ local new_h=$(( new_remaining_ms / 3600000 ))
+ log "Token refreshed successfully (${new_h}h remaining)"
+ refreshed=true
+ break
+ fi
+
+ log "Token still expired after attempt $attempt, waiting 30s..."
+ sleep 30
+ done
+
+ if [ "$refreshed" = "false" ]; then
+ log "ERROR: All $MAX_RETRIES refresh attempts via claude auth failed"
+ log "Token may require manual 'claude auth login' to re-authenticate"
+ fi
+
+ local rc=$?
+ if [ "$rc" -eq 0 ]; then
+ log "Proactive refresh succeeded"
+ # sync_token will fire from the inotifywait detecting the file write,
+ # but also call it directly in case inotifywait misses the self-write
+ sync_token
+ else
+ log "ERROR: Proactive refresh failed (rc=$rc)"
+ fi
+ return 0 # Never let refresh failure kill the watcher loop
+ }
+
+ # Compute inotifywait timeout based on token remaining life.
+ # When token is healthy: check every 30m. Near expiry (<2h): check every 5m.
+ # Already expired: check every 2m (retry window for 429 backoff).
+ get_watch_timeout() {
+ local remaining_ms
+ remaining_ms=$(python3 -c "
+ import json, time
+ creds = json.load(open('$CREDS'))
+ exp = creds.get('claudeAiOauth',{}).get('expiresAt', 0)
+ print(int(exp - time.time() * 1000))
+ " 2>/dev/null || echo "0")
+
+ if [ "$remaining_ms" -le 0 ]; then
+ echo 120 # Expired: retry every 2 minutes
+ elif [ "$remaining_ms" -le 10800000 ]; then
+ echo 180 # <3h remaining: check every 3 minutes
+ else
+ echo 1800 # Healthy: check every 30 minutes
+ fi
+ }
+
+ # Initial sync on startup — also refresh proactively if token is expired/expiring
  log "Starting token watcher..."
- sync_token
+ sync_token || true
+ refresh_token_proactively || true
 
- # Watch for changes to credentials file
- log "Watching $CREDS for changes..."
+ # Watch for changes to credentials file + proactive refresh timer
+ log "Watching $CREDS for changes (with adaptive refresh interval)..."
  while true; do
- # inotifywait blocks until the file is modified, then we sync
- inotifywait -q -e modify -e close_write -e moved_to "$(dirname "$CREDS")" --include "$(basename "$CREDS")" 2>/dev/null || {
- # If inotifywait isn't available, fall back to polling every 30 seconds
- log "WARN: inotifywait not available, falling back to 30s polling"
- while true; do
- sleep 30
- sync_token
- done
- }
- # Small delay to let Claude Code finish writing
- sleep 2
- sync_token
+ timeout=$(get_watch_timeout)
+ # inotifywait returns: 0=event, 1=error, 2=timeout
+ # CRITICAL: use `|| true` to prevent set -e from killing the script on timeout
+ inotifywait -q -t "$timeout" -e modify -e close_write -e moved_to \
+ "$(dirname "$CREDS")" --include "$(basename "$CREDS")" 2>/dev/null || true
+
+ # Always check for proactive refresh on every loop iteration
+ # This handles both timeout and file-change cases
+ refresh_token_proactively || true
+
+ # If file was modified externally (Claude Code session), also sync
+ if [ -f "$CREDS" ]; then
+ sleep 1
+ sync_token || true
+ fi
  done
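The adaptive watch interval added by `get_watch_timeout` (retry every 2 minutes once expired, every 3 minutes under 3 hours remaining, every 30 minutes when healthy) can be sketched as a pure function; thresholds are taken from the diff, the function name is hypothetical:

```python
def watch_timeout(remaining_ms: int) -> int:
    """Adaptive poll interval in seconds, mirroring get_watch_timeout above."""
    if remaining_ms <= 0:
        return 120   # expired: retry every 2 minutes
    if remaining_ms <= 10_800_000:  # under 3 hours remaining
        return 180   # near expiry: check every 3 minutes
    return 1800      # healthy: check every 30 minutes
```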