npm - bluera-knowledge - Versions diffs - 0.31.0 → 0.33.0 - Mend

bluera-knowledge 0.31.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70)

package/.claude-plugin/plugin.json +23 -0
package/.mcp.json +13 -0
package/CHANGELOG.md +42 -0
package/NOTICE +47 -0
package/README.md +2 -2
package/bun.lock +1978 -0
package/dist/{chunk-B335UOU7.js → chunk-3TB7TDVF.js} +24 -3
package/dist/chunk-3TB7TDVF.js.map +1 -0
package/dist/{chunk-KCI4U6FH.js → chunk-KDZDLJUY.js} +2 -2
package/dist/{chunk-AEXFPA57.js → chunk-YDTTD53Y.js} +158 -26
package/dist/chunk-YDTTD53Y.js.map +1 -0
package/dist/index.js +3 -3
package/dist/mcp/bootstrap.js +10 -0
package/dist/mcp/bootstrap.js.map +1 -1
package/dist/mcp/server.d.ts +5 -3
package/dist/mcp/server.js +2 -2
package/dist/workers/background-worker-cli.js +2 -2
package/hooks/check-ready.sh +109 -0
package/hooks/hooks.json +97 -0
package/hooks/job-status-hook.sh +51 -0
package/hooks/posttooluse-bk-reminder.py +126 -0
package/hooks/posttooluse-web-research.py +209 -0
package/hooks/posttooluse-websearch-bk.py +158 -0
package/hooks/pretooluse-bk-suggest.py +296 -0
package/hooks/skill-activation.py +221 -0
package/hooks/skill-rules.json +131 -0
package/package.json +9 -2
package/scripts/CLAUDE.md +65 -0
package/scripts/auto-setup.sh +65 -0
package/scripts/bench-regression.sh +345 -0
package/scripts/dev.sh +16 -0
package/scripts/doctor.sh +103 -0
package/scripts/download-models.ts +188 -0
package/scripts/export-web-store.ts +142 -0
package/scripts/lib/mock-server.sh +70 -0
package/scripts/mcp-wrapper.sh +91 -0
package/scripts/setup.sh +224 -0
package/scripts/statusline-module.sh +29 -0
package/scripts/test-mcp-dev.js +260 -0
package/scripts/validate-local.sh +412 -0
package/scripts/validate-npm-release.sh +406 -0
package/skills/add-folder/SKILL.md +48 -0
package/skills/add-repo/SKILL.md +50 -0
package/skills/advanced-workflows/SKILL.md +273 -0
package/skills/cancel/SKILL.md +63 -0
package/skills/check-status/SKILL.md +130 -0
package/skills/crawl/SKILL.md +61 -0
package/skills/doctor/SKILL.md +27 -0
package/skills/eval/SKILL.md +222 -0
package/skills/health/SKILL.md +72 -0
package/skills/index/SKILL.md +48 -0
package/skills/knowledge-search/SKILL.md +110 -0
package/skills/remove-store/SKILL.md +52 -0
package/skills/search/SKILL.md +80 -0
package/skills/search/search.sh +63 -0
package/skills/search-optimization/SKILL.md +199 -0
package/skills/search-optimization/references/mistakes.md +21 -0
package/skills/search-optimization/references/strategies.md +80 -0
package/skills/skill-activation/SKILL.md +131 -0
package/skills/statusline/SKILL.md +19 -0
package/skills/store-lifecycle/SKILL.md +470 -0
package/skills/stores/SKILL.md +54 -0
package/skills/suggest/SKILL.md +118 -0
package/skills/sync/SKILL.md +96 -0
package/skills/test-plugin/SKILL.md +547 -0
package/skills/uninstall/SKILL.md +65 -0
package/skills/when-to-query/SKILL.md +160 -0
package/dist/chunk-AEXFPA57.js.map +0 -1
package/dist/chunk-B335UOU7.js.map +0 -1
package/dist/{chunk-KCI4U6FH.js.map → chunk-KDZDLJUY.js.map} +0 -0

package/hooks/skill-rules.json ADDED Viewed

@@ -0,0 +1,131 @@
+{
+  "description": "bluera-knowledge skill activation rules - technology-agnostic patterns for development scenarios",
+  "version": 2,
+  "globalExclusions": [
+    { "keyword": "bluera-knowledge" },
+    { "keyword": "bluera knowledge" },
+    { "keyword": "/bluera-knowledge:" },
+    { "regex": "mcp__.*bluera" }
+  ],
+  "threshold": 2,
+  "skills": [
+    {
+      "name": "knowledge-search",
+      "description": "How to query Bluera Knowledge for library/dependency questions",
+      "triggers": [
+        { "regex": "the\\s+\\w+(-\\w+)*\\s+(package|library|module|framework|dependency)", "weight": 3 },
+        { "regex": "\\w+(-\\w+)*\\s+(package|library|module)\\s+(is|does|keeps|isn't|won't|doesn't)", "weight": 3 },
+        { "regex": "\\w+(-\\w+)*\\s+(documentation|docs)\\b", "weight": 2 },
+        { "regex": "error\\s+(from|in|with)\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|module)", "weight": 3 },
+        { "regex": "(package|library|dependency|module)\\s+(is\\s+)?(throwing|throws|error|failing)", "weight": 3 },
+        { "regex": "how\\s+does\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|module|framework)\\s+(handle|work|process)", "weight": 3 },
+        { "regex": "what\\s+does\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library)\\s+(do|return|accept)", "weight": 3 },
+        { "regex": "why\\s+does\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|module)", "weight": 3 },
+        { "regex": "(configure|config|settings)\\s+(for\\s+)?(the\\s+)?\\w+(-\\w+)*\\s+(package|library)", "weight": 3 },
+        { "regex": "(upgraded?|updated?)\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|dependency)", "weight": 2 },
+        { "regex": "integrate\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library)", "weight": 2 },
+        { "regex": "dependency\\s+(is|keeps|isn't|won't|error|issue|problem)", "weight": 2 },
+        { "regex": "third[- ]party\\s+(library|package|code)", "weight": 2 },
+        { "regex": "look\\s+(at|in|into)\\s+(the\\s+)?\\w+(-\\w+)*\\s+(source|code|internals)", "weight": 3 },
+        { "regex": "\\w+(-\\w+)*\\s+source\\s+code", "weight": 2 },
+        { "regex": "(find|show|get)\\s+(me\\s+)?(the\\s+)?\\w+(-\\w+)*\\s+(implementation|source|code)", "weight": 3 },
+        { "regex": "TypeError|ImportError|ModuleNotFoundError|AttributeError", "weight": 2 },
+        { "regex": "\\w+(-\\w+)*\\s+is\\s+(deprecated|removed|changed)", "weight": 2 },
+        { "regex": "breaking\\s+change.*(\\w+(-\\w+)*\\s+)?(package|library|dependency)", "weight": 3 },
+        { "regex": "migrate\\s+(from|to)\\s+\\w+(-\\w+)*", "weight": 2 }
+      ],
+      "exclusions": [
+        { "keyword": "my code" },
+        { "keyword": "our code" },
+        { "keyword": "this function" },
+        { "keyword": "this file" },
+        { "keyword": "this component" },
+        { "keyword": "this class" },
+        { "keyword": "this project" },
+        { "keyword": "I wrote" },
+        { "keyword": "we wrote" },
+        { "keyword": "my implementation" },
+        { "keyword": "store" },
+        { "keyword": "index" },
+        { "keyword": "in my" },
+        { "keyword": "in our" }
+      ]
+    },
+    {
+      "name": "when-to-query",
+      "description": "Decision guide for Bluera Knowledge vs Grep/Read",
+      "triggers": [
+        { "keyword": "should i grep", "weight": 2 },
+        { "keyword": "where should i look", "weight": 2 },
+        { "keyword": "grep or search", "weight": 2 },
+        { "keyword": "search or grep", "weight": 2 },
+        { "regex": "where\\s+(should|do)\\s+i\\s+(find|look)\\s+.*(library|package|dependency)", "weight": 3 },
+        { "regex": "is\\s+there\\s+a\\s+better\\s+way\\s+to\\s+(search|find)", "weight": 2 },
+        { "regex": "should\\s+i\\s+(use\\s+)?grep\\s+(for|to)", "weight": 2 },
+        { "regex": "how\\s+(do|should)\\s+i\\s+find.*(in|from)\\s+(a\\s+)?(library|package|dependency)", "weight": 3 }
+      ],
+      "exclusions": [
+        { "keyword": "store" },
+        { "keyword": "index" }
+      ]
+    },
+    {
+      "name": "search-optimization",
+      "description": "Optimizing search parameters and token usage",
+      "triggers": [
+        { "keyword": "too many results", "weight": 2 },
+        { "keyword": "too few results", "weight": 2 },
+        { "keyword": "limit results", "weight": 2 },
+        { "keyword": "reduce tokens", "weight": 2 },
+        { "keyword": "token usage", "weight": 2 },
+        { "keyword": "optimize search", "weight": 2 },
+        { "keyword": "detail level", "weight": 2 },
+        { "regex": "\\b(minimal|contextual|full)\\s+detail", "weight": 2 },
+        { "regex": "\\b(vector|fts|hybrid)\\s+(search|mode)", "weight": 2 },
+        { "regex": "narrow\\s+(down\\s+)?(the\\s+)?results", "weight": 2 },
+        { "regex": "search\\s+(is\\s+)?(returning|giving)\\s+(too\\s+)?(many|few)", "weight": 2 }
+      ],
+      "exclusions": [
+        { "regex": "--?(limit|detail|mode|threshold)\\s*=" }
+      ]
+    },
+    {
+      "name": "advanced-workflows",
+      "description": "Multi-tool orchestration patterns",
+      "triggers": [
+        { "keyword": "multi-step", "weight": 2 },
+        { "keyword": "orchestration", "weight": 2 },
+        { "keyword": "job monitoring", "weight": 2 },
+        { "keyword": "background job", "weight": 2 },
+        { "keyword": "combine tools", "weight": 2 },
+        { "keyword": "chain operations", "weight": 2 },
+        { "regex": "chain.*searches", "weight": 2 },
+        { "regex": "multiple.*searches", "weight": 2 },
+        { "regex": "search.*then\\s+(summarize|extract|filter)", "weight": 2 },
+        { "regex": "for\\s+each\\s+(search\\s+)?(result|match)", "weight": 2 }
+      ],
+      "exclusions": []
+    },
+    {
+      "name": "store-lifecycle",
+      "description": "Managing knowledge stores",
+      "triggers": [
+        { "keyword": "add store", "weight": 2 },
+        { "keyword": "create store", "weight": 2 },
+        { "keyword": "delete store", "weight": 2 },
+        { "keyword": "remove store", "weight": 2 },
+        { "keyword": "index store", "weight": 2 },
+        { "keyword": "re-index", "weight": 2 },
+        { "keyword": "reindex", "weight": 2 },
+        { "keyword": "knowledge store", "weight": 2 },
+        { "regex": "add\\s+(a\\s+)?(repo|repository|folder|directory)\\s+(to|for)\\s+(knowledge|indexing|search)", "weight": 3 },
+        { "regex": "index\\s+(a|the|my)\\s+(repo|repository|folder|directory|library|package)", "weight": 2 },
+        { "regex": "set\\s+up.*(knowledge|search)\\s*(store|index)", "weight": 2 },
+        { "regex": "(backup|snapshot|archive).*(knowledge|search)\\s*(store|index)", "weight": 2 }
+      ],
+      "exclusions": [
+        { "regex": "/bluera-knowledge:(add-repo|add-folder|remove-store|index)" }
+      ]
+    }
+  ]
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "bluera-knowledge",
-  "version": "0.31.0",
+  "version": "0.33.0",
   "description": "CLI tool for managing knowledge stores with semantic search",
   "type": "module",
   "bin": {
@@ -76,9 +76,16 @@
   "files": [
     "dist/",
     "python/",
+    "skills/",
+    "hooks/",
+    "scripts/",
+    ".claude-plugin/",
+    ".mcp.json",
+    "bun.lock",
     "README.md",
     "CHANGELOG.md",
-    "LICENSE"
+    "LICENSE",
+    "NOTICE"
   ],
   "devDependencies": {
     "@anthropic-ai/sdk": "^0.72.1",

package/scripts/CLAUDE.md ADDED Viewed

@@ -0,0 +1,65 @@
+# Scripts Directory
+Shell scripts for plugin setup, diagnostics, and MCP server.
+---
+## MCP Wrapper (`mcp-wrapper.sh`)
+Entry point for MCP server. Called by Claude Code when starting the plugin.
+**CRITICAL:** Uses `sort -V -r` to get LATEST cached version, not first alphabetically.
+- Bug: Alphabetical sort put 0.20.0 before 0.22.x
+- Fix: Version sort ensures latest cached version with all fixes
+**Flow:**
+1. Check `installed_plugins.json` for explicit path
+2. Fallback: Scan cache dirs, sort by version (descending), use first valid
+3. Run `bootstrap.js` which handles deps and starts MCP
+---
+## Setup (`setup.sh`)
+Manual and auto setup script. Installs dependencies.
+**CRITICAL:** Uses `npm install --legacy-peer-deps`
+- Required because tree-sitter-go@0.25 and tree-sitter-rust@0.24 have incompatible peer deps
+- Without this flag, npm fails even though tree-sitter is optional
+**Installs:**
+- MCP wrapper to `~/.local/bin/bluera-knowledge-mcp`
+- node_modules (bun or npm)
+- Playwright Chromium browser
+---
+## Auto-Setup (`auto-setup.sh`)
+Runs on SessionStart (async, non-blocking). Calls setup.sh if needed.
+**Fast exit:** If node_modules AND wrapper exist, exits immediately.
+---
+## Doctor (`doctor.sh`)
+Diagnostic tool for MCP failures. **Use this first when MCP breaks.**
+Invoked via: `/bluera-knowledge:doctor`
+**Checks:**
+- Build tools (make) - REQUIRED for native modules
+- Node.js version - WARNS on v24+ (native module issues)
+- node_modules - installation status
+- MCP wrapper - installation status
+- Python 3 - optional, for embeddings
+- Playwright - optional, for web crawling
+---
+## Check-Ready (`check-ready.sh`)
+Fast validation on SessionStart (sync, blocking with timeout).
+Verifies prerequisites without full setup. Exits 2 for blocking errors.

package/scripts/auto-setup.sh ADDED Viewed

@@ -0,0 +1,65 @@
+#!/bin/bash
+# Bluera Knowledge Plugin - Auto Setup
+# Runs on: SessionStart (async) - automatically sets up plugin if needed
+#
+# This script runs in the background on every session start.
+# It exits quickly (0) if already set up, or runs full setup if needed.
+# Non-interactive: cannot prompt for user input (no TTY).
+PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$(dirname "$0")")}"
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+# Debug logging
+LOG_DIR="${PROJECT_ROOT:-.}/.bluera/bluera-knowledge/logs"
+LOG_FILE="$LOG_DIR/app.log"
+log_debug() {
+    local msg="$1"
+    mkdir -p "$LOG_DIR" 2>/dev/null || true
+    local timestamp
+    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%S.000Z" 2>/dev/null || date -u +"%Y-%m-%dT%H:%M:%SZ")
+    echo "{\"time\":\"$timestamp\",\"level\":\"debug\",\"module\":\"auto-setup.sh\",\"msg\":\"$msg\"}" >> "$LOG_FILE" 2>/dev/null || true
+}
+log_debug "Auto-setup starting, PLUGIN_ROOT=$PLUGIN_ROOT"
+# Probe both install artifacts once; the results drive the fast path and the log line.
+WRAPPER_PATH="$HOME/.local/bin/bluera-knowledge-mcp"
+modules_state='missing'
+wrapper_state='missing'
+if [ -d "$PLUGIN_ROOT/node_modules" ]; then
+    modules_state='exists'
+fi
+if [ -f "$WRAPPER_PATH" ]; then
+    wrapper_state='exists'
+fi
+# Fast path: nothing to do when both are already in place.
+if [ "$modules_state" = 'exists' ] && [ "$wrapper_state" = 'exists' ]; then
+    log_debug "Already set up (node_modules and wrapper exist), exiting"
+    exit 0
+fi
+log_debug "Setup needed - node_modules: $modules_state, wrapper: $wrapper_state"
+# Build tools cannot be auto-installed here (that needs sudo, i.e. a TTY),
+# so when make is missing the instructions are printed and the script stops.
+if ! command -v make >/dev/null 2>&1; then
+    log_debug "Build tools (make) not found, printing instructions"
+    printf '%s\n' \
+        "[bluera-knowledge] ERROR: Build tools (make) not found - required for native modules." \
+        "" \
+        "Install build tools, then restart Claude Code:" \
+        "  Debian/Ubuntu: sudo apt install build-essential" \
+        "  Fedora/RHEL:   sudo dnf groupinstall 'Development Tools'" \
+        "  macOS:         xcode-select --install" >&2
+    # Exit 2 = blocking error, stderr shown to user
+    exit 2
+fi
+# Hand off to setup.sh; NONINTERACTIVE=1 is exported into its environment.
+log_debug "Running setup.sh with NONINTERACTIVE=1"
+echo -e "${YELLOW}[bluera-knowledge] Running first-time setup (this may take a moment)...${NC}"
+export NONINTERACTIVE=1
+if ! "$PLUGIN_ROOT/scripts/setup.sh"; then
+    log_debug "Setup failed"
+    echo -e "${YELLOW}[bluera-knowledge] Setup failed. Run manually: $PLUGIN_ROOT/scripts/setup.sh${NC}"
+else
+    log_debug "Setup completed successfully"
+    echo -e "${GREEN}[bluera-knowledge] Setup complete ✓${NC}"
+    echo -e "${GREEN}[bluera-knowledge] Restart Claude Code to enable MCP server.${NC}"
+fi

package/scripts/bench-regression.sh ADDED Viewed

@@ -0,0 +1,345 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# bench-regression.sh — Hard merge gate for model changes
+#
+# Runs candidate vs champion on real-v1-test (3x alternating),
+# writes comparison.json, exits non-zero on failure.
+#
+# Usage:
+#   BK_FINETUNED_MODEL=./models/bge-small-finetuned-onnx scripts/bench-regression.sh
+#
+# Requires: jq, bun, benchmarks/search/baselines/champion-mean-v1-test.json
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$PROJECT_ROOT"
+CHAMPION_BASELINE="benchmarks/search/baselines/champion-mean-v1-test.json"
+SPLIT_FILE="benchmarks/search/splits/test-cases.json"
+TIMESTAMP="$(date -u +%Y%m%dT%H%M%S)"
+RESULTS_DIR="benchmarks/search/regression-results/${TIMESTAMP}"
+# Validate prerequisites
+if [ ! -f "$CHAMPION_BASELINE" ]; then
+  echo "ERROR: Champion baseline not found: $CHAMPION_BASELINE"
+  echo "Run Step 8A first."
+  exit 1
+fi
+if [ ! -f "$SPLIT_FILE" ]; then
+  echo "ERROR: Test split not found: $SPLIT_FILE"
+  echo "Run split-dataset.ts first."
+  exit 1
+fi
+mkdir -p "$RESULTS_DIR"
+echo "============================================================"
+echo "REGRESSION GATE — bench-regression.sh"
+echo "============================================================"
+echo "Timestamp: $TIMESTAMP"
+echo "Results:   $RESULTS_DIR"
+echo "Champion:  $CHAMPION_BASELINE"
+echo ""
+# --- Run 3 alternating champion/candidate pairs ---
+run_champion() {
+  local run_num=$1
+  local artifact="$RESULTS_DIR/champion-run-${run_num}.json"
+  echo "[Run ${run_num}/3] Champion (baseline)..."
+  (
+    unset BK_FINETUNED_MODEL BK_QUERY_PREFIX
+    bun run bench:search --dataset real-v1-test --reuse-index \
+      --artifacts "$artifact" 2>&1 | grep -E "Hit@1|SUMMARY|Artifact"
+  )
+  echo "  -> $artifact"
+}
+run_candidate() {
+  local run_num=$1
+  local artifact="$RESULTS_DIR/candidate-run-${run_num}.json"
+  echo "[Run ${run_num}/3] Candidate (finetuned)..."
+  # Finetuned BGE models still need the BGE query prefix
+  BK_QUERY_PREFIX='Represent this sentence for searching relevant passages: ' \
+    bun run bench:search --dataset real-v1-test --setup --force \
+    --artifacts "$artifact" 2>&1 | grep -E "Hit@1|SUMMARY|Artifact"
+  echo "  -> $artifact"
+}
+# Alternating: C1 -> X1 -> C2 -> X2 -> C3 -> X3
+for i in 1 2 3; do
+  run_champion "$i"
+  run_candidate "$i"
+done
+echo ""
+echo "All 6 runs complete. Analyzing..."
+echo ""
+# --- Fingerprint parity check ---
+# The run-1 config fingerprints must match once the two model-specific
+# fields are removed; otherwise a failed comparison.json is written and the gate stops.
+echo "Fingerprint parity check..."
+CHAMPION_FP=$(jq -S '.configFingerprint | del(.embedding.model, .embedding.modelSha256)' "$RESULTS_DIR/champion-run-1.json")
+CANDIDATE_FP=$(jq -S '.configFingerprint | del(.embedding.model, .embedding.modelSha256)' "$RESULTS_DIR/candidate-run-1.json")
+if [ "$CHAMPION_FP" != "$CANDIDATE_FP" ]; then
+  echo "FAIL: Fingerprint mismatch (non-model fields differ)"
+  diff <(echo "$CHAMPION_FP") <(echo "$CANDIDATE_FP") || true
+  echo ""
+  echo "Only embedding.model and embedding.modelSha256 may differ."
+  # Write failed comparison
+  # Hash with sha256sum, falling back to shasum where only that tool exists,
+  # so splitHash is not silently left empty. If neither works, keep it empty
+  # rather than skipping the comparison file.
+  fp_split_hash=$(sha256sum "$SPLIT_FILE" 2>/dev/null | cut -d' ' -f1 || shasum -a 256 "$SPLIT_FILE" | cut -d' ' -f1) || fp_split_hash=""
+  jq -n \
+    --arg splitHash "$fp_split_hash" \
+    --arg gitSha "$(git rev-parse --short HEAD)" \
+    --arg verdict "FAIL: fingerprint mismatch" \
+    '{splitHash: $splitHash, gitSha: $gitSha, verdict: $verdict}' \
+    > "$RESULTS_DIR/comparison.json"
+  exit 1
+fi
+echo "  PASS: all non-model fields identical"
+# --- Compute integer hits ---
+# Weighted integer hits = sum of weight for cases where hitAt1 == true
+# Matches aggregation in metrics.ts:216
+compute_hits() {
+  local artifact=$1
+  jq '[.results[] | select(.metrics.hitAt1 == true) | .weight // 1] | add // 0' "$artifact"
+}
+printf '\n%s\n' "Integer hit counts (weighted):"
+# One weighted hit total per run, kept in run order for the checks below.
+CHAMPION_HITS=()
+CANDIDATE_HITS=()
+for pair in 1 2 3; do
+  champion_hits=$(compute_hits "$RESULTS_DIR/champion-run-${pair}.json")
+  candidate_hits=$(compute_hits "$RESULTS_DIR/candidate-run-${pair}.json")
+  CHAMPION_HITS+=("$champion_hits")
+  CANDIDATE_HITS+=("$candidate_hits")
+  pair_delta=$((candidate_hits - champion_hits))
+  echo "  Pair $pair: champion=$champion_hits, candidate=$candidate_hits (delta=$pair_delta)"
+done
+# --- Stability check ---
+# Champion variance only warns; candidate variance also clears STABLE.
+printf '\n%s\n' "Stability check..."
+if [ "${CHAMPION_HITS[0]}" != "${CHAMPION_HITS[1]}" ] || [ "${CHAMPION_HITS[0]}" != "${CHAMPION_HITS[2]}" ]; then
+  echo "  WARNING: Champion hits vary across runs: ${CHAMPION_HITS[*]}"
+fi
+STABLE="true"
+if [ "${CANDIDATE_HITS[0]}" != "${CANDIDATE_HITS[1]}" ] || [ "${CANDIDATE_HITS[0]}" != "${CANDIDATE_HITS[2]}" ]; then
+  echo "  WARNING: Candidate hits vary across runs: ${CANDIDATE_HITS[*]}"
+  STABLE="false"
+fi
+echo "  Candidate stable: $STABLE"
+# --- Per-query win/loss (using run-1 as reference) ---
+echo ""
+echo "Per-query analysis (run 1)..."
+# Extract hitAt1 per case ID for champion and candidate
+CHAMPION_CASES=$(jq -r '[.results[] | {id, hit: .metrics.hitAt1, weight: (.weight // 1)}]' "$RESULTS_DIR/champion-run-1.json")
+CANDIDATE_CASES=$(jq -r '[.results[] | {id, hit: .metrics.hitAt1, weight: (.weight // 1)}]' "$RESULTS_DIR/candidate-run-1.json")
+WINS=0
+LOSSES=0
+TIES=0
+# Classify every champion case in a single jq pass instead of spawning two jq
+# processes per case:
+#   win  = champion missed, candidate hit
+#   loss = champion hit, candidate missed
+#   tie  = anything else (including a case absent from the candidate run)
+# An empty case list now yields 0/0/0 rather than one phantom tie.
+# Case ids are used as object keys, so they are assumed to be strings.
+if ! read -r WINS LOSSES TIES < <(
+  jq -n -r \
+    --argjson champion "$CHAMPION_CASES" \
+    --argjson candidate "$CANDIDATE_CASES" '
+    (reduce $candidate[] as $r ({}; .[$r.id] = $r.hit)) as $cx_hit
+    | reduce $champion[] as $c ({wins: 0, losses: 0, ties: 0};
+        $cx_hit[$c.id] as $x
+        | if $c.hit == false and $x == true then .wins += 1
+          elif $c.hit == true and $x == false then .losses += 1
+          else .ties += 1 end)
+    | "\(.wins) \(.losses) \(.ties)"'
+); then
+  echo "ERROR: per-query comparison failed" >&2
+  exit 1
+fi
+echo "  Wins: $WINS, Losses: $LOSSES, Ties: $TIES"
+# --- Category analysis ---
+echo ""
+echo "Category analysis (run 1)..."
+# Per-category integer hit deltas
+CATEGORY_DELTAS=$(jq -n \
+  --argjson champion "$CHAMPION_CASES" \
+  --argjson candidate "$CANDIDATE_CASES" \
+  '[($champion | group_by(.id | split("-")[0:2] | join("-")) | .[] |
+    {category: (.[0].id | split("-")[0:2] | join("-")),
+     champion_hits: [.[] | select(.hit) | .weight // 1] | add // 0}) ] as $ch_cats |
+   [($candidate | group_by(.id | split("-")[0:2] | join("-")) | .[] |
+    {category: (.[0].id | split("-")[0:2] | join("-")),
+     candidate_hits: [.[] | select(.hit) | .weight // 1] | add // 0}) ] as $cx_cats |
+   [$ch_cats[] as $c | {category: $c.category,
+     champion: $c.champion_hits,
+     candidate: ([$cx_cats[] | select(.category == $c.category) | .candidate_hits][0] // 0),
+     delta: (([$cx_cats[] | select(.category == $c.category) | .candidate_hits][0] // 0) - $c.champion_hits)}]')
+echo "$CATEGORY_DELTAS" | jq -r '.[] | "  \(.category): champion=\(.champion) candidate=\(.candidate) delta=\(.delta)"'
+# Check category guardrail: no category loses more than 1 weighted hit
+CATEGORY_FAIL=$(echo "$CATEGORY_DELTAS" | jq '[.[] | select(.delta < -1)] | length')
+# --- P95 latency ---
+echo ""
+echo "P95 latency:"
+# All arithmetic uses jq, which this script already requires, instead of bc:
+# bc is not listed under "Requires" and its absence was tolerated for the
+# per-pair delta but fatal for the limit check.
+P95_DELTAS=()
+for i in 1 2 3; do
+  ch_p95=$(jq '.summary.latency.p95' "$RESULTS_DIR/champion-run-${i}.json")
+  cx_p95=$(jq '.summary.latency.p95' "$RESULTS_DIR/candidate-run-${i}.json")
+  # Informational only, so fall back to 0 if the subtraction cannot be done.
+  delta=$(jq -n --argjson cx "$cx_p95" --argjson ch "$ch_p95" '$cx - $ch' 2>/dev/null || echo "0")
+  P95_DELTAS+=("$delta")
+  printf "  Pair %d: champion=%.1fms candidate=%.1fms delta=%.1fms\n" "$i" "$ch_p95" "$cx_p95" "$delta"
+done
+# Median P95 check (baseline + 20ms); with three runs the median is the
+# second value after a numeric sort.
+CHAMPION_P95_MEDIAN=$(for i in 1 2 3; do jq '.summary.latency.p95' "$RESULTS_DIR/champion-run-${i}.json"; done | sort -n | sed -n '2p')
+CANDIDATE_P95_MEDIAN=$(for i in 1 2 3; do jq '.summary.latency.p95' "$RESULTS_DIR/candidate-run-${i}.json"; done | sort -n | sed -n '2p')
+LATENCY_LIMIT=$(jq -n --argjson base "$CHAMPION_P95_MEDIAN" '$base + 20')
+# 1 when the candidate median is within the limit, 0 otherwise.
+LATENCY_OK=$(jq -n --argjson cand "$CANDIDATE_P95_MEDIAN" --argjson limit "$LATENCY_LIMIT" 'if $cand <= $limit then 1 else 0 end')
+printf "  Median P95: champion=%.1fms candidate=%.1fms limit=%.1fms\n" "$CHAMPION_P95_MEDIAN" "$CANDIDATE_P95_MEDIAN" "$LATENCY_LIMIT"
+# --- Gate decisions ---
+# Five independent gates; any failure flips VERDICT to FAIL and records a reason.
+printf '\n'
+echo "============================================================"
+echo "GATE EVALUATION"
+echo "============================================================"
+VERDICT="PASS"
+REASONS=()
+# Report a failed gate.
+#   $1 - text printed after "  FAIL: "
+#   $2 - entry appended to REASONS
+gate_fail() {
+  echo "  FAIL: $1"
+  VERDICT="FAIL"
+  REASONS+=("$2")
+}
+# Gate 1: Integer-hit gate (raised bar), judged on the first run pair
+HIT_DELTA=$((CANDIDATE_HITS[0] - CHAMPION_HITS[0]))
+echo "Gate 1 — Integer-hit gate: delta=$HIT_DELTA weighted hits"
+GATE1_PASS="true"
+if [ "$HIT_DELTA" -ge 2 ]; then
+  echo "  PASS (path a): gain >= 2 weighted hits"
+elif [ "$HIT_DELTA" -ge 1 ] && [ "$LOSSES" -eq 0 ]; then
+  echo "  PASS (path b): gain >= 1 AND zero losses"
+else
+  GATE1_PASS="false"
+  gate_fail "delta=$HIT_DELTA, losses=$LOSSES (need >=2 hits, or >=1 with 0 losses)" \
+    "integer-hit gate: delta=$HIT_DELTA losses=$LOSSES"
+fi
+# Gate 2: Category guardrail
+echo "Gate 2 — Category guardrail: $CATEGORY_FAIL categories with >1 hit loss"
+if [ "$CATEGORY_FAIL" -gt 0 ]; then
+  gate_fail "category regression detected" \
+    "category guardrail: $CATEGORY_FAIL categories regressed >1 hit"
+else
+  echo "  PASS"
+fi
+# Gate 3: Per-query win/loss
+echo "Gate 3 — Per-query: wins=$WINS losses=$LOSSES"
+if [ "$WINS" -gt "$LOSSES" ]; then
+  echo "  PASS"
+else
+  gate_fail "wins must exceed losses" "per-query: wins=$WINS <= losses=$LOSSES"
+fi
+# Gate 4: Latency
+echo "Gate 4 — Latency: candidate median P95 within baseline + 20ms"
+if [ "$LATENCY_OK" -eq 1 ]; then
+  echo "  PASS"
+else
+  gate_fail "candidate P95 exceeds limit" \
+    "latency: candidate=$CANDIDATE_P95_MEDIAN > limit=$LATENCY_LIMIT"
+fi
+# Gate 5: Stability
+echo "Gate 5 — Stability: candidate hits identical across 3 runs"
+if [ "$STABLE" = "true" ]; then
+  echo "  PASS"
+else
+  gate_fail "candidate hit count varies" "stability: hits vary ${CANDIDATE_HITS[*]}"
+fi
+# --- Build comparison.json ---
+# Collect provenance (split hash, git SHA, model SHA, training manifest) and
+# the measured results, write them to comparison.json, then exit with the verdict.
+SPLIT_HASH=$(sha256sum "$SPLIT_FILE" 2>/dev/null | cut -d' ' -f1 || shasum -a 256 "$SPLIT_FILE" | cut -d' ' -f1)
+GIT_SHA=$(git rev-parse --short HEAD)
+# Model SHA (try ONNX model if BK_FINETUNED_MODEL is set)
+MODEL_SHA="base-model"
+if [ -n "${BK_FINETUNED_MODEL:-}" ] && [ -f "${BK_FINETUNED_MODEL}/onnx/model.onnx" ]; then
+  MODEL_SHA=$(sha256sum "${BK_FINETUNED_MODEL}/onnx/model.onnx" 2>/dev/null | cut -d' ' -f1 || shasum -a 256 "${BK_FINETUNED_MODEL}/onnx/model.onnx" | cut -d' ' -f1)
+fi
+# Training manifest (if exists)
+TRAINING_MANIFEST="{}"
+if [ -f "training/data/training-manifest.json" ]; then
+  TRAINING_MANIFEST=$(cat "training/data/training-manifest.json")
+fi
+REASON_STR=""
+if [ ${#REASONS[@]} -gt 0 ]; then
+  REASON_STR=$(printf '%s; ' "${REASONS[@]}")
+fi
+# Per-pair P95 latency deltas (candidate - champion). They are recomputed from
+# the artifacts with jq so the values are valid JSON numbers; previously the
+# p95Deltas field was always written as an empty array. Falls back to [] if
+# the deltas cannot be computed, so the comparison file is still produced.
+P95_DELTAS_JSON=$(
+  for pair in 1 2 3; do
+    jq -n \
+      --slurpfile ch "$RESULTS_DIR/champion-run-${pair}.json" \
+      --slurpfile cx "$RESULTS_DIR/candidate-run-${pair}.json" \
+      '$cx[0].summary.latency.p95 - $ch[0].summary.latency.p95'
+  done | jq -s '.'
+) || P95_DELTAS_JSON='[]'
+jq -n \
+  --arg splitHash "$SPLIT_HASH" \
+  --arg gitSha "$GIT_SHA" \
+  --arg modelSha "$MODEL_SHA" \
+  --argjson championFP "$(jq '.configFingerprint' "$RESULTS_DIR/champion-run-1.json")" \
+  --argjson candidateFP "$(jq '.configFingerprint' "$RESULTS_DIR/candidate-run-1.json")" \
+  --argjson championHits "[${CHAMPION_HITS[0]},${CHAMPION_HITS[1]},${CHAMPION_HITS[2]}]" \
+  --argjson candidateHits "[${CANDIDATE_HITS[0]},${CANDIDATE_HITS[1]},${CANDIDATE_HITS[2]}]" \
+  --argjson categoryDeltas "$CATEGORY_DELTAS" \
+  --argjson perQueryWinLoss "{\"wins\":$WINS,\"losses\":$LOSSES,\"ties\":$TIES}" \
+  --argjson p95Deltas "$P95_DELTAS_JSON" \
+  --argjson trainingManifest "$TRAINING_MANIFEST" \
+  --arg verdict "$VERDICT" \
+  --arg reason "$REASON_STR" \
+  '{
+    splitHash: $splitHash,
+    gitSha: $gitSha,
+    modelSha: $modelSha,
+    championFingerprint: $championFP,
+    candidateFingerprint: $candidateFP,
+    integerHits: {champion: $championHits, candidate: $candidateHits},
+    categoryDeltas: $categoryDeltas,
+    perQueryWinLoss: $perQueryWinLoss,
+    p95Deltas: $p95Deltas,
+    trainingManifest: $trainingManifest,
+    verdict: $verdict,
+    reason: (if $reason == "" then null else $reason end)
+  }' > "$RESULTS_DIR/comparison.json"
+echo ""
+echo "============================================================"
+echo "VERDICT: $VERDICT"
+if [ -n "$REASON_STR" ]; then
+  echo "Reason: $REASON_STR"
+fi
+echo "Comparison: $RESULTS_DIR/comparison.json"
+echo "============================================================"
+# The exit status is the merge gate: 0 only for PASS.
+if [ "$VERDICT" = "PASS" ]; then
+  exit 0
+else
+  exit 1
+fi

package/scripts/dev.sh ADDED Viewed

@@ -0,0 +1,16 @@
+#!/bin/bash
+# Launch Claude with this checkout loaded as a local plugin.
+#
+# Usage: ./scripts/dev.sh [claude args...]
+#
+# --plugin-dir does not set CLAUDE_PLUGIN_ROOT on its own, so it is exported
+# here to let the MCP server locate its files.
+set -e
+# The plugin root is the parent of the directory that holds this script.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CLAUDE_PLUGIN_ROOT="$(cd "$script_dir/.." && pwd)"
+export CLAUDE_PLUGIN_ROOT
+# Replace this shell with claude, forwarding every argument untouched.
+exec claude --plugin-dir "$CLAUDE_PLUGIN_ROOT" "$@"