bluera-knowledge 0.31.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/.claude-plugin/plugin.json +23 -0
  2. package/.mcp.json +13 -0
  3. package/CHANGELOG.md +42 -0
  4. package/NOTICE +47 -0
  5. package/README.md +2 -2
  6. package/bun.lock +1978 -0
  7. package/dist/{chunk-B335UOU7.js → chunk-3TB7TDVF.js} +24 -3
  8. package/dist/chunk-3TB7TDVF.js.map +1 -0
  9. package/dist/{chunk-KCI4U6FH.js → chunk-KDZDLJUY.js} +2 -2
  10. package/dist/{chunk-AEXFPA57.js → chunk-YDTTD53Y.js} +158 -26
  11. package/dist/chunk-YDTTD53Y.js.map +1 -0
  12. package/dist/index.js +3 -3
  13. package/dist/mcp/bootstrap.js +10 -0
  14. package/dist/mcp/bootstrap.js.map +1 -1
  15. package/dist/mcp/server.d.ts +5 -3
  16. package/dist/mcp/server.js +2 -2
  17. package/dist/workers/background-worker-cli.js +2 -2
  18. package/hooks/check-ready.sh +109 -0
  19. package/hooks/hooks.json +97 -0
  20. package/hooks/job-status-hook.sh +51 -0
  21. package/hooks/posttooluse-bk-reminder.py +126 -0
  22. package/hooks/posttooluse-web-research.py +209 -0
  23. package/hooks/posttooluse-websearch-bk.py +158 -0
  24. package/hooks/pretooluse-bk-suggest.py +296 -0
  25. package/hooks/skill-activation.py +221 -0
  26. package/hooks/skill-rules.json +131 -0
  27. package/package.json +9 -2
  28. package/scripts/CLAUDE.md +65 -0
  29. package/scripts/auto-setup.sh +65 -0
  30. package/scripts/bench-regression.sh +345 -0
  31. package/scripts/dev.sh +16 -0
  32. package/scripts/doctor.sh +103 -0
  33. package/scripts/download-models.ts +188 -0
  34. package/scripts/export-web-store.ts +142 -0
  35. package/scripts/lib/mock-server.sh +70 -0
  36. package/scripts/mcp-wrapper.sh +91 -0
  37. package/scripts/setup.sh +224 -0
  38. package/scripts/statusline-module.sh +29 -0
  39. package/scripts/test-mcp-dev.js +260 -0
  40. package/scripts/validate-local.sh +412 -0
  41. package/scripts/validate-npm-release.sh +406 -0
  42. package/skills/add-folder/SKILL.md +48 -0
  43. package/skills/add-repo/SKILL.md +50 -0
  44. package/skills/advanced-workflows/SKILL.md +273 -0
  45. package/skills/cancel/SKILL.md +63 -0
  46. package/skills/check-status/SKILL.md +130 -0
  47. package/skills/crawl/SKILL.md +61 -0
  48. package/skills/doctor/SKILL.md +27 -0
  49. package/skills/eval/SKILL.md +222 -0
  50. package/skills/health/SKILL.md +72 -0
  51. package/skills/index/SKILL.md +48 -0
  52. package/skills/knowledge-search/SKILL.md +110 -0
  53. package/skills/remove-store/SKILL.md +52 -0
  54. package/skills/search/SKILL.md +80 -0
  55. package/skills/search/search.sh +63 -0
  56. package/skills/search-optimization/SKILL.md +199 -0
  57. package/skills/search-optimization/references/mistakes.md +21 -0
  58. package/skills/search-optimization/references/strategies.md +80 -0
  59. package/skills/skill-activation/SKILL.md +131 -0
  60. package/skills/statusline/SKILL.md +19 -0
  61. package/skills/store-lifecycle/SKILL.md +470 -0
  62. package/skills/stores/SKILL.md +54 -0
  63. package/skills/suggest/SKILL.md +118 -0
  64. package/skills/sync/SKILL.md +96 -0
  65. package/skills/test-plugin/SKILL.md +547 -0
  66. package/skills/uninstall/SKILL.md +65 -0
  67. package/skills/when-to-query/SKILL.md +160 -0
  68. package/dist/chunk-AEXFPA57.js.map +0 -1
  69. package/dist/chunk-B335UOU7.js.map +0 -1
  70. /package/dist/{chunk-KCI4U6FH.js.map → chunk-KDZDLJUY.js.map} +0 -0
@@ -0,0 +1,131 @@
1
+ {
2
+ "description": "bluera-knowledge skill activation rules - technology-agnostic patterns for development scenarios",
3
+ "version": 2,
4
+ "globalExclusions": [
5
+ { "keyword": "bluera-knowledge" },
6
+ { "keyword": "bluera knowledge" },
7
+ { "keyword": "/bluera-knowledge:" },
8
+ { "regex": "mcp__.*bluera" }
9
+ ],
10
+ "threshold": 2,
11
+ "skills": [
12
+ {
13
+ "name": "knowledge-search",
14
+ "description": "How to query Bluera Knowledge for library/dependency questions",
15
+ "triggers": [
16
+ { "regex": "the\\s+\\w+(-\\w+)*\\s+(package|library|module|framework|dependency)", "weight": 3 },
17
+ { "regex": "\\w+(-\\w+)*\\s+(package|library|module)\\s+(is|does|keeps|isn't|won't|doesn't)", "weight": 3 },
18
+ { "regex": "\\w+(-\\w+)*\\s+(documentation|docs)\\b", "weight": 2 },
19
+ { "regex": "error\\s+(from|in|with)\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|module)", "weight": 3 },
20
+ { "regex": "(package|library|dependency|module)\\s+(is\\s+)?(throwing|throws|error|failing)", "weight": 3 },
21
+ { "regex": "how\\s+does\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|module|framework)\\s+(handle|work|process)", "weight": 3 },
22
+ { "regex": "what\\s+does\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library)\\s+(do|return|accept)", "weight": 3 },
23
+ { "regex": "why\\s+does\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|module)", "weight": 3 },
24
+ { "regex": "(configure|config|settings)\\s+(for\\s+)?(the\\s+)?\\w+(-\\w+)*\\s+(package|library)", "weight": 3 },
25
+ { "regex": "(upgraded?|updated?)\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library|dependency)", "weight": 2 },
26
+ { "regex": "integrate\\s+(the\\s+)?\\w+(-\\w+)*\\s+(package|library)", "weight": 2 },
27
+ { "regex": "dependency\\s+(is|keeps|isn't|won't|error|issue|problem)", "weight": 2 },
28
+ { "regex": "third[- ]party\\s+(library|package|code)", "weight": 2 },
29
+ { "regex": "look\\s+(at|in|into)\\s+(the\\s+)?\\w+(-\\w+)*\\s+(source|code|internals)", "weight": 3 },
30
+ { "regex": "\\w+(-\\w+)*\\s+source\\s+code", "weight": 2 },
31
+ { "regex": "(find|show|get)\\s+(me\\s+)?(the\\s+)?\\w+(-\\w+)*\\s+(implementation|source|code)", "weight": 3 },
32
+ { "regex": "TypeError|ImportError|ModuleNotFoundError|AttributeError", "weight": 2 },
33
+ { "regex": "\\w+(-\\w+)*\\s+is\\s+(deprecated|removed|changed)", "weight": 2 },
34
+ { "regex": "breaking\\s+change.*(\\w+(-\\w+)*\\s+)?(package|library|dependency)", "weight": 3 },
35
+ { "regex": "migrate\\s+(from|to)\\s+\\w+(-\\w+)*", "weight": 2 }
36
+ ],
37
+ "exclusions": [
38
+ { "keyword": "my code" },
39
+ { "keyword": "our code" },
40
+ { "keyword": "this function" },
41
+ { "keyword": "this file" },
42
+ { "keyword": "this component" },
43
+ { "keyword": "this class" },
44
+ { "keyword": "this project" },
45
+ { "keyword": "I wrote" },
46
+ { "keyword": "we wrote" },
47
+ { "keyword": "my implementation" },
48
+ { "keyword": "store" },
49
+ { "keyword": "index" },
50
+ { "keyword": "in my" },
51
+ { "keyword": "in our" }
52
+ ]
53
+ },
54
+ {
55
+ "name": "when-to-query",
56
+ "description": "Decision guide for Bluera Knowledge vs Grep/Read",
57
+ "triggers": [
58
+ { "keyword": "should i grep", "weight": 2 },
59
+ { "keyword": "where should i look", "weight": 2 },
60
+ { "keyword": "grep or search", "weight": 2 },
61
+ { "keyword": "search or grep", "weight": 2 },
62
+ { "regex": "where\\s+(should|do)\\s+i\\s+(find|look)\\s+.*(library|package|dependency)", "weight": 3 },
63
+ { "regex": "is\\s+there\\s+a\\s+better\\s+way\\s+to\\s+(search|find)", "weight": 2 },
64
+ { "regex": "should\\s+i\\s+(use\\s+)?grep\\s+(for|to)", "weight": 2 },
65
+ { "regex": "how\\s+(do|should)\\s+i\\s+find.*(in|from)\\s+(a\\s+)?(library|package|dependency)", "weight": 3 }
66
+ ],
67
+ "exclusions": [
68
+ { "keyword": "store" },
69
+ { "keyword": "index" }
70
+ ]
71
+ },
72
+ {
73
+ "name": "search-optimization",
74
+ "description": "Optimizing search parameters and token usage",
75
+ "triggers": [
76
+ { "keyword": "too many results", "weight": 2 },
77
+ { "keyword": "too few results", "weight": 2 },
78
+ { "keyword": "limit results", "weight": 2 },
79
+ { "keyword": "reduce tokens", "weight": 2 },
80
+ { "keyword": "token usage", "weight": 2 },
81
+ { "keyword": "optimize search", "weight": 2 },
82
+ { "keyword": "detail level", "weight": 2 },
83
+ { "regex": "\\b(minimal|contextual|full)\\s+detail", "weight": 2 },
84
+ { "regex": "\\b(vector|fts|hybrid)\\s+(search|mode)", "weight": 2 },
85
+ { "regex": "narrow\\s+(down\\s+)?(the\\s+)?results", "weight": 2 },
86
+ { "regex": "search\\s+(is\\s+)?(returning|giving)\\s+(too\\s+)?(many|few)", "weight": 2 }
87
+ ],
88
+ "exclusions": [
89
+ { "regex": "--?(limit|detail|mode|threshold)\\s*=" }
90
+ ]
91
+ },
92
+ {
93
+ "name": "advanced-workflows",
94
+ "description": "Multi-tool orchestration patterns",
95
+ "triggers": [
96
+ { "keyword": "multi-step", "weight": 2 },
97
+ { "keyword": "orchestration", "weight": 2 },
98
+ { "keyword": "job monitoring", "weight": 2 },
99
+ { "keyword": "background job", "weight": 2 },
100
+ { "keyword": "combine tools", "weight": 2 },
101
+ { "keyword": "chain operations", "weight": 2 },
102
+ { "regex": "chain.*searches", "weight": 2 },
103
+ { "regex": "multiple.*searches", "weight": 2 },
104
+ { "regex": "search.*then\\s+(summarize|extract|filter)", "weight": 2 },
105
+ { "regex": "for\\s+each\\s+(search\\s+)?(result|match)", "weight": 2 }
106
+ ],
107
+ "exclusions": []
108
+ },
109
+ {
110
+ "name": "store-lifecycle",
111
+ "description": "Managing knowledge stores",
112
+ "triggers": [
113
+ { "keyword": "add store", "weight": 2 },
114
+ { "keyword": "create store", "weight": 2 },
115
+ { "keyword": "delete store", "weight": 2 },
116
+ { "keyword": "remove store", "weight": 2 },
117
+ { "keyword": "index store", "weight": 2 },
118
+ { "keyword": "re-index", "weight": 2 },
119
+ { "keyword": "reindex", "weight": 2 },
120
+ { "keyword": "knowledge store", "weight": 2 },
121
+ { "regex": "add\\s+(a\\s+)?(repo|repository|folder|directory)\\s+(to|for)\\s+(knowledge|indexing|search)", "weight": 3 },
122
+ { "regex": "index\\s+(a|the|my)\\s+(repo|repository|folder|directory|library|package)", "weight": 2 },
123
+ { "regex": "set\\s+up.*(knowledge|search)\\s*(store|index)", "weight": 2 },
124
+ { "regex": "(backup|snapshot|archive).*(knowledge|search)\\s*(store|index)", "weight": 2 }
125
+ ],
126
+ "exclusions": [
127
+ { "regex": "/bluera-knowledge:(add-repo|add-folder|remove-store|index)" }
128
+ ]
129
+ }
130
+ ]
131
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bluera-knowledge",
3
- "version": "0.31.0",
3
+ "version": "0.33.0",
4
4
  "description": "CLI tool for managing knowledge stores with semantic search",
5
5
  "type": "module",
6
6
  "bin": {
@@ -76,9 +76,16 @@
76
76
  "files": [
77
77
  "dist/",
78
78
  "python/",
79
+ "skills/",
80
+ "hooks/",
81
+ "scripts/",
82
+ ".claude-plugin/",
83
+ ".mcp.json",
84
+ "bun.lock",
79
85
  "README.md",
80
86
  "CHANGELOG.md",
81
- "LICENSE"
87
+ "LICENSE",
88
+ "NOTICE"
82
89
  ],
83
90
  "devDependencies": {
84
91
  "@anthropic-ai/sdk": "^0.72.1",
@@ -0,0 +1,65 @@
1
+ # Scripts Directory
2
+
3
+ Shell scripts for plugin setup, diagnostics, and MCP server.
4
+
5
+ ---
6
+
7
+ ## MCP Wrapper (`mcp-wrapper.sh`)
8
+
9
+ Entry point for MCP server. Called by Claude Code when starting the plugin.
10
+
11
+ **CRITICAL:** Uses `sort -V -r` to get LATEST cached version, not first alphabetically.
12
+ - Bug: Alphabetical sort put 0.20.0 before 0.22.x
13
+ - Fix: Version sort ensures latest cached version with all fixes
14
+
15
+ **Flow:**
16
+ 1. Check `installed_plugins.json` for explicit path
17
+ 2. Fallback: Scan cache dirs, sort by version (descending), use first valid
18
+ 3. Run `bootstrap.js` which handles deps and starts MCP
19
+
20
+ ---
21
+
22
+ ## Setup (`setup.sh`)
23
+
24
+ Manual and auto setup script. Installs dependencies.
25
+
26
+ **CRITICAL:** Uses `npm install --legacy-peer-deps`
27
+ - Required because tree-sitter-go@0.25 and tree-sitter-rust@0.24 have incompatible peer deps
28
+ - Without this flag, npm fails even though tree-sitter is optional
29
+
30
+ **Installs:**
31
+ - MCP wrapper to `~/.local/bin/bluera-knowledge-mcp`
32
+ - node_modules (bun or npm)
33
+ - Playwright Chromium browser
34
+
35
+ ---
36
+
37
+ ## Auto-Setup (`auto-setup.sh`)
38
+
39
+ Runs on SessionStart (async, non-blocking). Calls setup.sh if needed.
40
+
41
+ **Fast exit:** If node_modules AND wrapper exist, exits immediately.
42
+
43
+ ---
44
+
45
+ ## Doctor (`doctor.sh`)
46
+
47
+ Diagnostic tool for MCP failures. **Use this first when MCP breaks.**
48
+
49
+ Invoked via: `/bluera-knowledge:doctor`
50
+
51
+ **Checks:**
52
+ - Build tools (make) - REQUIRED for native modules
53
+ - Node.js version - WARNS on v24+ (native module issues)
54
+ - node_modules - installation status
55
+ - MCP wrapper - installation status
56
+ - Python 3 - optional, for embeddings
57
+ - Playwright - optional, for web crawling
58
+
59
+ ---
60
+
61
+ ## Check-Ready (`check-ready.sh`)
62
+
63
+ Fast validation on SessionStart (sync, blocking with timeout).
64
+
65
+ Verifies prerequisites without full setup. Exits 2 for blocking errors.
@@ -0,0 +1,65 @@
1
+ #!/bin/bash
2
+ # Bluera Knowledge Plugin - Auto Setup
3
+ # Runs on: SessionStart (async) - automatically sets up plugin if needed
4
+ #
5
+ # This script runs in the background on every session start.
6
+ # It exits quickly (0) if already set up, or runs full setup if needed.
7
+ # Non-interactive: cannot prompt for user input (no TTY).
8
+
9
+ PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$(dirname "$0")")}"
10
+
11
+ # Colors for output
12
+ GREEN='\033[0;32m'
13
+ YELLOW='\033[1;33m'
14
+ NC='\033[0m'
15
+
16
+ # Debug logging
17
+ LOG_DIR="${PROJECT_ROOT:-.}/.bluera/bluera-knowledge/logs"
18
+ LOG_FILE="$LOG_DIR/app.log"
19
+
20
# Append one JSON-formatted debug record to $LOG_FILE.
#
# Globals:   LOG_DIR (read; created if missing), LOG_FILE (appended to)
# Arguments: $1 - message text (free-form; may contain quotes/backslashes)
# Outputs:   nothing on stdout/stderr
# Returns:   always 0 — logging is best-effort and must never break setup
log_debug() {
  local msg="${1-}"
  local timestamp

  # Escape the message so the record stays valid JSON. Backslash must be
  # handled first, otherwise the escapes added afterwards get doubled.
  msg=${msg//\\/\\\\}
  msg=${msg//\"/\\\"}
  msg=${msg//$'\n'/\\n}
  msg=${msg//$'\r'/\\r}
  msg=${msg//$'\t'/\\t}

  mkdir -p "$LOG_DIR" 2>/dev/null || true
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%S.000Z" 2>/dev/null || date -u +"%Y-%m-%dT%H:%M:%SZ")

  # The group makes 2>/dev/null also cover a failing >> redirection
  # (e.g. unwritable log directory), keeping the hook output clean.
  {
    printf '{"time":"%s","level":"debug","module":"auto-setup.sh","msg":"%s"}\n' \
      "$timestamp" "$msg" >> "$LOG_FILE"
  } 2>/dev/null || true
}
27
+
28
+ log_debug "Auto-setup starting, PLUGIN_ROOT=$PLUGIN_ROOT"
29
+
30
+ # Fast exit if already set up
31
+ WRAPPER_PATH="$HOME/.local/bin/bluera-knowledge-mcp"
32
+ if [ -d "$PLUGIN_ROOT/node_modules" ] && [ -f "$WRAPPER_PATH" ]; then
33
+ log_debug "Already set up (node_modules and wrapper exist), exiting"
34
+ exit 0
35
+ fi
36
+
37
+ log_debug "Setup needed - node_modules: $([ -d "$PLUGIN_ROOT/node_modules" ] && echo 'exists' || echo 'missing'), wrapper: $([ -f "$WRAPPER_PATH" ] && echo 'exists' || echo 'missing')"
38
+
39
+ # Check for build tools - if missing, print instructions and exit with error
40
+ # Cannot auto-install because that requires sudo (interactive)
41
+ if ! command -v make &>/dev/null; then
42
+ log_debug "Build tools (make) not found, printing instructions"
43
+ echo "[bluera-knowledge] ERROR: Build tools (make) not found - required for native modules." >&2
44
+ echo "" >&2
45
+ echo "Install build tools, then restart Claude Code:" >&2
46
+ echo " Debian/Ubuntu: sudo apt install build-essential" >&2
47
+ echo " Fedora/RHEL: sudo dnf groupinstall 'Development Tools'" >&2
48
+ echo " macOS: xcode-select --install" >&2
49
+ # Exit 2 = blocking error, stderr shown to user
50
+ exit 2
51
+ fi
52
+
53
+ # Run setup non-interactively
54
+ log_debug "Running setup.sh with NONINTERACTIVE=1"
55
+ echo -e "${YELLOW}[bluera-knowledge] Running first-time setup (this may take a moment)...${NC}"
56
+
57
+ export NONINTERACTIVE=1
58
+ if "$PLUGIN_ROOT/scripts/setup.sh"; then
59
+ log_debug "Setup completed successfully"
60
+ echo -e "${GREEN}[bluera-knowledge] Setup complete ✓${NC}"
61
+ echo -e "${GREEN}[bluera-knowledge] Restart Claude Code to enable MCP server.${NC}"
62
+ else
63
+ log_debug "Setup failed"
64
+ echo -e "${YELLOW}[bluera-knowledge] Setup failed. Run manually: $PLUGIN_ROOT/scripts/setup.sh${NC}"
65
+ fi
@@ -0,0 +1,345 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # bench-regression.sh — Hard merge gate for model changes
5
+ #
6
+ # Runs candidate vs champion on real-v1-test (3x alternating),
7
+ # writes comparison.json, exits non-zero on failure.
8
+ #
9
+ # Usage:
10
+ # BK_FINETUNED_MODEL=./models/bge-small-finetuned-onnx scripts/bench-regression.sh
11
+ #
12
+ # Requires: jq, bun, benchmarks/search/baselines/champion-mean-v1-test.json
13
+
14
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
15
+ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
16
+ cd "$PROJECT_ROOT"
17
+
18
+ CHAMPION_BASELINE="benchmarks/search/baselines/champion-mean-v1-test.json"
19
+ SPLIT_FILE="benchmarks/search/splits/test-cases.json"
20
+ TIMESTAMP="$(date -u +%Y%m%dT%H%M%S)"
21
+ RESULTS_DIR="benchmarks/search/regression-results/${TIMESTAMP}"
22
+
23
+ # Validate prerequisites
24
+ if [ ! -f "$CHAMPION_BASELINE" ]; then
25
+ echo "ERROR: Champion baseline not found: $CHAMPION_BASELINE"
26
+ echo "Run Step 8A first."
27
+ exit 1
28
+ fi
29
+
30
+ if [ ! -f "$SPLIT_FILE" ]; then
31
+ echo "ERROR: Test split not found: $SPLIT_FILE"
32
+ echo "Run split-dataset.ts first."
33
+ exit 1
34
+ fi
35
+
36
+ mkdir -p "$RESULTS_DIR"
37
+
38
+ echo "============================================================"
39
+ echo "REGRESSION GATE — bench-regression.sh"
40
+ echo "============================================================"
41
+ echo "Timestamp: $TIMESTAMP"
42
+ echo "Results: $RESULTS_DIR"
43
+ echo "Champion: $CHAMPION_BASELINE"
44
+ echo ""
45
+
46
+ # --- Run 3 alternating champion/candidate pairs ---
47
+
48
# Run one champion (baseline) benchmark pass and print its summary lines.
#
# Globals:   RESULTS_DIR (read) - directory receiving the run artifact
# Arguments: $1 - run number (1..3), used in the artifact file name
# Outputs:   progress banner, the Hit@1/SUMMARY/Artifact lines of the
#            benchmark output, and the artifact path on stdout; the full
#            benchmark output on stderr when the run fails
# Returns:   0 on success, otherwise the benchmark's exit status
run_champion() {
  local run_num=$1
  local artifact="$RESULTS_DIR/champion-run-${run_num}.json"
  local output
  local status=0

  echo "[Run ${run_num}/3] Champion (baseline)..."

  # The command substitution is a subshell, so the unset does not leak:
  # the champion must run without the candidate's model/prefix overrides.
  output=$(
    unset BK_FINETUNED_MODEL BK_QUERY_PREFIX
    bun run bench:search --dataset real-v1-test --reuse-index \
      --artifacts "$artifact" 2>&1
  ) || status=$?

  if [ "$status" -ne 0 ]; then
    # Show everything: the summary filter would hide the actual error.
    printf '%s\n' "$output" >&2
    echo "ERROR: champion benchmark run ${run_num} failed (exit ${status})" >&2
    return "$status"
  fi

  # grep exits 1 when nothing matches; that is not a benchmark failure and
  # must not abort the gate under `set -e -o pipefail`.
  grep -E "Hit@1|SUMMARY|Artifact" <<< "$output" || true
  echo " -> $artifact"
}
59
+
60
# Run one candidate (finetuned) benchmark pass and print its summary lines.
#
# Globals:   RESULTS_DIR (read) - directory receiving the run artifact
# Arguments: $1 - run number (1..3), used in the artifact file name
# Outputs:   progress banner, the Hit@1/SUMMARY/Artifact lines of the
#            benchmark output, and the artifact path on stdout; the full
#            benchmark output on stderr when the run fails
# Returns:   0 on success, otherwise the benchmark's exit status
run_candidate() {
  local run_num=$1
  local artifact="$RESULTS_DIR/candidate-run-${run_num}.json"
  local output
  local status=0

  echo "[Run ${run_num}/3] Candidate (finetuned)..."

  # Finetuned BGE models still need the BGE query prefix
  output=$(
    BK_QUERY_PREFIX='Represent this sentence for searching relevant passages: ' \
      bun run bench:search --dataset real-v1-test --setup --force \
      --artifacts "$artifact" 2>&1
  ) || status=$?

  if [ "$status" -ne 0 ]; then
    # Show everything: the summary filter would hide the actual error.
    printf '%s\n' "$output" >&2
    echo "ERROR: candidate benchmark run ${run_num} failed (exit ${status})" >&2
    return "$status"
  fi

  # grep exits 1 when nothing matches; that is not a benchmark failure and
  # must not abort the gate under `set -e -o pipefail`.
  grep -E "Hit@1|SUMMARY|Artifact" <<< "$output" || true
  echo " -> $artifact"
}
70
+
71
+ # Alternating: C1 -> X1 -> C2 -> X2 -> C3 -> X3
72
+ for i in 1 2 3; do
73
+ run_champion "$i"
74
+ run_candidate "$i"
75
+ done
76
+
77
+ echo ""
78
+ echo "All 6 runs complete. Analyzing..."
79
+ echo ""
80
+
81
+ # --- Fingerprint parity check ---
82
+
83
# Compare the run-1 config fingerprints of champion and candidate with the
# model-identifying fields removed; any other difference fails the gate.
echo "Fingerprint parity check..."
CHAMPION_FP=$(jq -S '.configFingerprint | del(.embedding.model, .embedding.modelSha256)' "$RESULTS_DIR/champion-run-1.json")
CANDIDATE_FP=$(jq -S '.configFingerprint | del(.embedding.model, .embedding.modelSha256)' "$RESULTS_DIR/candidate-run-1.json")

if [ "$CHAMPION_FP" != "$CANDIDATE_FP" ]; then
  echo "FAIL: Fingerprint mismatch (non-model fields differ)"
  # diff exits 1 when inputs differ, which is expected here
  diff <(echo "$CHAMPION_FP") <(echo "$CANDIDATE_FP") || true
  echo ""
  echo "Only embedding.model and embedding.modelSha256 may differ."
  # Write failed comparison. Hash the split with sha256sum, falling back to
  # shasum where sha256sum is unavailable (e.g. stock macOS), so splitHash
  # is never recorded empty.
  jq -n \
    --arg splitHash "$(sha256sum "$SPLIT_FILE" 2>/dev/null | cut -d' ' -f1 || shasum -a 256 "$SPLIT_FILE" | cut -d' ' -f1)" \
    --arg gitSha "$(git rev-parse --short HEAD)" \
    --arg verdict "FAIL: fingerprint mismatch" \
    '{splitHash: $splitHash, gitSha: $gitSha, verdict: $verdict}' \
    > "$RESULTS_DIR/comparison.json"
  exit 1
fi
echo " PASS: all non-model fields identical"
102
+
103
+ # --- Compute integer hits ---
104
+ # Weighted integer hits = sum of weight for cases where hitAt1 == true
105
+ # Matches aggregation in metrics.ts:216
106
+
107
# Print the weighted Hit@1 total of one benchmark artifact.
#
# Sums `.weight` (default 1) over every entry of `.results` whose
# `.metrics.hitAt1` is true; prints 0 when there are no hits.
#
# Arguments: $1 - path to a benchmark artifact JSON file
# Outputs:   the total on stdout; diagnostics on stderr
# Returns:   0 on success; 1 if the artifact is unreadable or the total is
#            not a non-negative integer; jq's status if jq itself fails
compute_hits() {
  local artifact=$1
  local hits

  if [ ! -r "$artifact" ]; then
    echo "ERROR: benchmark artifact not found or unreadable: $artifact" >&2
    return 1
  fi

  hits=$(jq '[.results[] | select(.metrics.hitAt1 == true) | .weight // 1] | add // 0' "$artifact") || return

  # Callers use the value in $(( )) arithmetic, which only accepts integers;
  # reject anything else here with a readable message.
  case "$hits" in
    '' | *[!0-9]*)
      echo "ERROR: weighted hit total is not a non-negative integer ('$hits') in $artifact" >&2
      return 1
      ;;
  esac

  printf '%s\n' "$hits"
}
111
+
112
+ echo ""
113
+ echo "Integer hit counts (weighted):"
114
+
115
+ CHAMPION_HITS=()
116
+ CANDIDATE_HITS=()
117
+ for i in 1 2 3; do
118
+ ch=$(compute_hits "$RESULTS_DIR/champion-run-${i}.json")
119
+ cx=$(compute_hits "$RESULTS_DIR/candidate-run-${i}.json")
120
+ CHAMPION_HITS+=("$ch")
121
+ CANDIDATE_HITS+=("$cx")
122
+ echo " Pair $i: champion=$ch, candidate=$cx (delta=$((cx - ch)))"
123
+ done
124
+
125
+ # --- Stability check ---
126
+ echo ""
127
+ echo "Stability check..."
128
+ if [ "${CHAMPION_HITS[0]}" != "${CHAMPION_HITS[1]}" ] || [ "${CHAMPION_HITS[0]}" != "${CHAMPION_HITS[2]}" ]; then
129
+ echo " WARNING: Champion hits vary across runs: ${CHAMPION_HITS[*]}"
130
+ fi
131
+ if [ "${CANDIDATE_HITS[0]}" != "${CANDIDATE_HITS[1]}" ] || [ "${CANDIDATE_HITS[0]}" != "${CANDIDATE_HITS[2]}" ]; then
132
+ echo " WARNING: Candidate hits vary across runs: ${CANDIDATE_HITS[*]}"
133
+ fi
134
+ STABLE="true"
135
+ if [ "${CANDIDATE_HITS[0]}" != "${CANDIDATE_HITS[1]}" ] || [ "${CANDIDATE_HITS[0]}" != "${CANDIDATE_HITS[2]}" ]; then
136
+ STABLE="false"
137
+ fi
138
+ echo " Candidate stable: $STABLE"
139
+
140
+ # --- Per-query win/loss (using run-1 as reference) ---
141
+
142
+ echo ""
143
+ echo "Per-query analysis (run 1)..."
144
+
145
+ # Extract hitAt1 per case ID for champion and candidate
146
+ CHAMPION_CASES=$(jq -r '[.results[] | {id, hit: .metrics.hitAt1, weight: (.weight // 1)}]' "$RESULTS_DIR/champion-run-1.json")
147
+ CANDIDATE_CASES=$(jq -r '[.results[] | {id, hit: .metrics.hitAt1, weight: (.weight // 1)}]' "$RESULTS_DIR/candidate-run-1.json")
148
+
149
+ WINS=0
150
+ LOSSES=0
151
+ TIES=0
152
+
153
+ # Compare per case
154
+ CASE_IDS=$(jq -r '.[].id' <<< "$CHAMPION_CASES")
155
+ while IFS= read -r case_id; do
156
+ ch_hit=$(jq -r --arg id "$case_id" '.[] | select(.id == $id) | .hit' <<< "$CHAMPION_CASES")
157
+ cx_hit=$(jq -r --arg id "$case_id" '.[] | select(.id == $id) | .hit' <<< "$CANDIDATE_CASES")
158
+ if [ "$ch_hit" = "false" ] && [ "$cx_hit" = "true" ]; then
159
+ ((WINS++)) || true
160
+ elif [ "$ch_hit" = "true" ] && [ "$cx_hit" = "false" ]; then
161
+ ((LOSSES++)) || true
162
+ else
163
+ ((TIES++)) || true
164
+ fi
165
+ done <<< "$CASE_IDS"
166
+
167
+ echo " Wins: $WINS, Losses: $LOSSES, Ties: $TIES"
168
+
169
+ # --- Category analysis ---
170
+
171
+ echo ""
172
+ echo "Category analysis (run 1)..."
173
+
174
+ # Per-category integer hit deltas
175
+ CATEGORY_DELTAS=$(jq -n \
176
+ --argjson champion "$CHAMPION_CASES" \
177
+ --argjson candidate "$CANDIDATE_CASES" \
178
+ '[($champion | group_by(.id | split("-")[0:2] | join("-")) | .[] |
179
+ {category: (.[0].id | split("-")[0:2] | join("-")),
180
+ champion_hits: [.[] | select(.hit) | .weight // 1] | add // 0}) ] as $ch_cats |
181
+ [($candidate | group_by(.id | split("-")[0:2] | join("-")) | .[] |
182
+ {category: (.[0].id | split("-")[0:2] | join("-")),
183
+ candidate_hits: [.[] | select(.hit) | .weight // 1] | add // 0}) ] as $cx_cats |
184
+ [$ch_cats[] as $c | {category: $c.category,
185
+ champion: $c.champion_hits,
186
+ candidate: ([$cx_cats[] | select(.category == $c.category) | .candidate_hits][0] // 0),
187
+ delta: (([$cx_cats[] | select(.category == $c.category) | .candidate_hits][0] // 0) - $c.champion_hits)}]')
188
+
189
+ echo "$CATEGORY_DELTAS" | jq -r '.[] | " \(.category): champion=\(.champion) candidate=\(.candidate) delta=\(.delta)"'
190
+
191
+ # Check category guardrail: no category loses more than 1 weighted hit
192
+ CATEGORY_FAIL=$(echo "$CATEGORY_DELTAS" | jq '[.[] | select(.delta < -1)] | length')
193
+
194
+ # --- P95 latency ---
195
+
196
+ echo ""
197
+ echo "P95 latency:"
198
+ P95_DELTAS=()
199
+ for i in 1 2 3; do
200
+ ch_p95=$(jq '.summary.latency.p95' "$RESULTS_DIR/champion-run-${i}.json")
201
+ cx_p95=$(jq '.summary.latency.p95' "$RESULTS_DIR/candidate-run-${i}.json")
202
+ delta=$(echo "$cx_p95 - $ch_p95" | bc -l 2>/dev/null || echo "0")
203
+ P95_DELTAS+=("$delta")
204
+ printf " Pair %d: champion=%.1fms candidate=%.1fms delta=%.1fms\n" "$i" "$ch_p95" "$cx_p95" "$delta"
205
+ done
206
+
207
+ # Median P95 check (baseline + 20ms)
208
+ CHAMPION_P95_MEDIAN=$(for i in 1 2 3; do jq '.summary.latency.p95' "$RESULTS_DIR/champion-run-${i}.json"; done | sort -n | sed -n '2p')
209
+ CANDIDATE_P95_MEDIAN=$(for i in 1 2 3; do jq '.summary.latency.p95' "$RESULTS_DIR/candidate-run-${i}.json"; done | sort -n | sed -n '2p')
210
+ LATENCY_LIMIT=$(echo "$CHAMPION_P95_MEDIAN + 20" | bc -l)
211
+ LATENCY_OK=$(echo "$CANDIDATE_P95_MEDIAN <= $LATENCY_LIMIT" | bc -l)
212
+
213
+ printf " Median P95: champion=%.1fms candidate=%.1fms limit=%.1fms\n" "$CHAMPION_P95_MEDIAN" "$CANDIDATE_P95_MEDIAN" "$LATENCY_LIMIT"
214
+
215
+ # --- Gate decisions ---
216
+
217
+ echo ""
218
+ echo "============================================================"
219
+ echo "GATE EVALUATION"
220
+ echo "============================================================"
221
+
222
+ VERDICT="PASS"
223
+ REASONS=()
224
+
225
+ # Gate 1: Integer-hit gate (raised bar)
226
+ HIT_DELTA=$((CANDIDATE_HITS[0] - CHAMPION_HITS[0]))
227
+ echo "Gate 1 — Integer-hit gate: delta=$HIT_DELTA weighted hits"
228
+
229
+ GATE1_PASS="false"
230
+ if [ "$HIT_DELTA" -ge 2 ]; then
231
+ echo " PASS (path a): gain >= 2 weighted hits"
232
+ GATE1_PASS="true"
233
+ elif [ "$HIT_DELTA" -ge 1 ] && [ "$LOSSES" -eq 0 ]; then
234
+ echo " PASS (path b): gain >= 1 AND zero losses"
235
+ GATE1_PASS="true"
236
+ else
237
+ echo " FAIL: delta=$HIT_DELTA, losses=$LOSSES (need >=2 hits, or >=1 with 0 losses)"
238
+ VERDICT="FAIL"
239
+ REASONS+=("integer-hit gate: delta=$HIT_DELTA losses=$LOSSES")
240
+ fi
241
+
242
+ # Gate 2: Category guardrail
243
+ echo "Gate 2 — Category guardrail: $CATEGORY_FAIL categories with >1 hit loss"
244
+ if [ "$CATEGORY_FAIL" -gt 0 ]; then
245
+ echo " FAIL: category regression detected"
246
+ VERDICT="FAIL"
247
+ REASONS+=("category guardrail: $CATEGORY_FAIL categories regressed >1 hit")
248
+ else
249
+ echo " PASS"
250
+ fi
251
+
252
+ # Gate 3: Per-query win/loss
253
+ echo "Gate 3 — Per-query: wins=$WINS losses=$LOSSES"
254
+ if [ "$WINS" -gt "$LOSSES" ]; then
255
+ echo " PASS"
256
+ else
257
+ echo " FAIL: wins must exceed losses"
258
+ VERDICT="FAIL"
259
+ REASONS+=("per-query: wins=$WINS <= losses=$LOSSES")
260
+ fi
261
+
262
+ # Gate 4: Latency
263
+ echo "Gate 4 — Latency: candidate median P95 within baseline + 20ms"
264
+ if [ "$LATENCY_OK" -eq 1 ]; then
265
+ echo " PASS"
266
+ else
267
+ echo " FAIL: candidate P95 exceeds limit"
268
+ VERDICT="FAIL"
269
+ REASONS+=("latency: candidate=$CANDIDATE_P95_MEDIAN > limit=$LATENCY_LIMIT")
270
+ fi
271
+
272
+ # Gate 5: Stability
273
+ echo "Gate 5 — Stability: candidate hits identical across 3 runs"
274
+ if [ "$STABLE" = "true" ]; then
275
+ echo " PASS"
276
+ else
277
+ echo " FAIL: candidate hit count varies"
278
+ VERDICT="FAIL"
279
+ REASONS+=("stability: hits vary ${CANDIDATE_HITS[*]}")
280
+ fi
281
+
282
# --- Build comparison.json ---
# Records provenance (split hash, git SHA, model SHA), both fingerprints,
# per-run hit counts, category deltas, per-query win/loss, per-pair P95
# latency deltas, the training manifest and the final verdict.

# sha256sum where available, shasum otherwise (pipefail makes the || fire
# when sha256sum is missing)
SPLIT_HASH=$(sha256sum "$SPLIT_FILE" 2>/dev/null | cut -d' ' -f1 || shasum -a 256 "$SPLIT_FILE" | cut -d' ' -f1)
GIT_SHA=$(git rev-parse --short HEAD)

# Model SHA (try ONNX model if BK_FINETUNED_MODEL is set)
MODEL_SHA="base-model"
if [ -n "${BK_FINETUNED_MODEL:-}" ] && [ -f "${BK_FINETUNED_MODEL}/onnx/model.onnx" ]; then
  MODEL_SHA=$(sha256sum "${BK_FINETUNED_MODEL}/onnx/model.onnx" 2>/dev/null | cut -d' ' -f1 || shasum -a 256 "${BK_FINETUNED_MODEL}/onnx/model.onnx" | cut -d' ' -f1)
fi

# Training manifest (if exists)
TRAINING_MANIFEST="{}"
if [ -f "training/data/training-manifest.json" ]; then
  TRAINING_MANIFEST=$(cat "training/data/training-manifest.json")
fi

REASON_STR=""
if [ ${#REASONS[@]} -gt 0 ]; then
  REASON_STR=$(printf '%s; ' "${REASONS[@]}")
fi

# Per-pair P95 deltas as a JSON array. bc can print values such as ".5" or
# "-.5", which are not valid JSON numbers, so normalise them with printf
# under the C locale (guarantees "." as the decimal separator).
P95_DELTAS_JSON="[]"
if [ "${#P95_DELTAS[@]}" -gt 0 ]; then
  P95_DELTAS_JSON=$(LC_ALL=C; printf '%.3f,' "${P95_DELTAS[@]}")
  P95_DELTAS_JSON="[${P95_DELTAS_JSON%,}]"
fi

jq -n \
  --arg splitHash "$SPLIT_HASH" \
  --arg gitSha "$GIT_SHA" \
  --arg modelSha "$MODEL_SHA" \
  --argjson championFP "$(jq '.configFingerprint' "$RESULTS_DIR/champion-run-1.json")" \
  --argjson candidateFP "$(jq '.configFingerprint' "$RESULTS_DIR/candidate-run-1.json")" \
  --argjson championHits "[${CHAMPION_HITS[0]},${CHAMPION_HITS[1]},${CHAMPION_HITS[2]}]" \
  --argjson candidateHits "[${CANDIDATE_HITS[0]},${CANDIDATE_HITS[1]},${CANDIDATE_HITS[2]}]" \
  --argjson categoryDeltas "$CATEGORY_DELTAS" \
  --argjson perQueryWinLoss "{\"wins\":$WINS,\"losses\":$LOSSES,\"ties\":$TIES}" \
  --argjson p95Deltas "$P95_DELTAS_JSON" \
  --argjson trainingManifest "$TRAINING_MANIFEST" \
  --arg verdict "$VERDICT" \
  --arg reason "$REASON_STR" \
  '{
    splitHash: $splitHash,
    gitSha: $gitSha,
    modelSha: $modelSha,
    championFingerprint: $championFP,
    candidateFingerprint: $candidateFP,
    integerHits: {champion: $championHits, candidate: $candidateHits},
    categoryDeltas: $categoryDeltas,
    perQueryWinLoss: $perQueryWinLoss,
    p95Deltas: $p95Deltas,
    trainingManifest: $trainingManifest,
    verdict: $verdict,
    reason: (if $reason == "" then null else $reason end)
  }' > "$RESULTS_DIR/comparison.json"
331
+
332
+ echo ""
333
+ echo "============================================================"
334
+ echo "VERDICT: $VERDICT"
335
+ if [ -n "$REASON_STR" ]; then
336
+ echo "Reason: $REASON_STR"
337
+ fi
338
+ echo "Comparison: $RESULTS_DIR/comparison.json"
339
+ echo "============================================================"
340
+
341
+ if [ "$VERDICT" = "PASS" ]; then
342
+ exit 0
343
+ else
344
+ exit 1
345
+ fi
package/scripts/dev.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+ # Development script - runs Claude with this plugin loaded locally
3
+ #
4
+ # Usage: ./scripts/dev.sh [claude args...]
5
+ #
6
+ # This sets CLAUDE_PLUGIN_ROOT so the MCP server can find its files
7
+ # when running with --plugin-dir (which doesn't set this var automatically).
8
+
9
+ set -e
10
+
11
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
12
+ PLUGIN_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
13
+
14
+ export CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT"
15
+
16
+ exec claude --plugin-dir "$PLUGIN_ROOT" "$@"