bluera-knowledge 0.36.0 → 0.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/CHANGELOG.md +16 -0
  3. package/README.md +1 -1
  4. package/bun.lock +27 -0
  5. package/dist/{chunk-L2SC6J4K.js → chunk-724FNI27.js} +466 -171
  6. package/dist/chunk-724FNI27.js.map +1 -0
  7. package/dist/{chunk-DNGE7FZ4.js → chunk-AO45YFHO.js} +1386 -42
  8. package/dist/chunk-AO45YFHO.js.map +1 -0
  9. package/dist/{chunk-MQQ46BST.js → chunk-F6DGSS2N.js} +2 -2
  10. package/dist/index.js +3 -3
  11. package/dist/mcp/server.d.ts +37 -3
  12. package/dist/mcp/server.js +2 -2
  13. package/dist/workers/background-worker-cli.js +2 -2
  14. package/hooks/check-ready.sh +17 -7
  15. package/hooks/lib/store_summary.py +111 -0
  16. package/hooks/posttooluse-bk-reminder.py +33 -6
  17. package/hooks/userpromptsubmit-bk-nudge.py +25 -5
  18. package/package.json +3 -1
  19. package/scripts/eval-candidates.sh +235 -0
  20. package/skills/advanced-workflows/references/combining-workflows.md +17 -0
  21. package/skills/advanced-workflows/references/error-recovery.md +44 -0
  22. package/skills/advanced-workflows/references/handling-large-results.md +48 -0
  23. package/skills/advanced-workflows/references/multi-store-search.md +42 -0
  24. package/skills/search/statusline.md +75 -0
  25. package/skills/store-lifecycle/references/failure-recovery.md +80 -0
  26. package/skills/store-lifecycle/references/indexing-strategies.md +67 -0
  27. package/skills/store-lifecycle/references/job-monitoring.md +72 -0
  28. package/skills/store-lifecycle/references/lifecycle-checklist.md +20 -0
  29. package/skills/store-lifecycle/references/storage-management.md +43 -0
  30. package/dist/chunk-DNGE7FZ4.js.map +0 -1
  31. package/dist/chunk-L2SC6J4K.js.map +0 -1
  32. package/dist/{chunk-MQQ46BST.js.map → chunk-F6DGSS2N.js.map} +0 -0
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env bash
2
+ # Model Candidate Evaluation Script
3
+ # Phase 1: Smoke test (load model, embed one query) — ~30s per model
4
+ # Phase 2: Val benchmark (full real-v1-val dataset) — ~20-80min per model
5
+ #
6
+ # Usage:
7
+ # ./scripts/eval-candidates.sh # Run all phases
8
+ # ./scripts/eval-candidates.sh --smoke-only # Phase 1 only (fast)
9
+ # ./scripts/eval-candidates.sh --bench-only # Phase 2 only (skip smoke, assume all pass)
10
+ # ./scripts/eval-candidates.sh --model gte-modernbert-base # Test single model
11
+
12
+ set -euo pipefail
13
+
14
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
15
+ PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
16
+ cd "$PROJECT_DIR"
17
+
18
+ RESULTS_DIR="$PROJECT_DIR/.bluera/bluera-knowledge/bench-data/model-eval"
19
+ mkdir -p "$RESULTS_DIR"
20
+
21
+ SMOKE_ONLY=false
22
+ BENCH_ONLY=false
23
+ SINGLE_MODEL=""
24
+
25
+ while [[ $# -gt 0 ]]; do
26
+ case $1 in
27
+ --smoke-only) SMOKE_ONLY=true; shift ;;
28
+ --bench-only) BENCH_ONLY=true; shift ;;
29
+ --model) SINGLE_MODEL="$2"; shift 2 ;;
30
+ *) echo "Unknown option: $1"; exit 1 ;;
31
+ esac
32
+ done
33
+
34
+ # Priority-ordered candidate list
35
+ CANDIDATES=(
36
+ "gte-modernbert-base"
37
+ "snowflake-arctic-embed-s"
38
+ "snowflake-arctic-embed-m-v1.5"
39
+ "jina-embeddings-v2-base-code"
40
+ "modernbert-embed-base"
41
+ "snowflake-arctic-embed-xs"
42
+ "snowflake-arctic-embed-m-v2.0"
43
+ )
44
+
45
+ if [[ -n "$SINGLE_MODEL" ]]; then
46
+ CANDIDATES=("$SINGLE_MODEL")
47
+ fi
48
+
49
+ SMOKE_PASSED=()
50
+ SMOKE_FAILED=()
51
+
52
+ # ============================================================
53
+ # Phase 1: Smoke Tests
54
+ # ============================================================
55
+ smoke_test() {
56
+ local model_key="$1"
57
+ local result_file="$RESULTS_DIR/smoke-${model_key}.json"
58
+
59
+ echo -n " [$model_key] Loading model... "
60
+
61
+ # Use bun to run inline TypeScript that loads model via our pipeline
62
+ local output
63
+ if output=$(timeout 120 bun -e "
64
+ import { MODEL_REGISTRY } from './src/models/registry.ts';
65
+ import { pipeline } from '@huggingface/transformers';
66
+
67
+ const key = '${model_key}';
68
+ const config = MODEL_REGISTRY[key];
69
+ if (!config) {
70
+ console.error('Model not in registry: ' + key);
71
+ process.exit(1);
72
+ }
73
+
74
+ console.error('Downloading ' + config.id + '...');
75
+ const start = Date.now();
76
+
77
+ try {
78
+ const extractor = await pipeline('feature-extraction', config.id, { dtype: 'fp32' });
79
+
80
+ const testQuery = config.queryPrefix + 'How do I implement dependency injection?';
81
+ const result = await extractor(testQuery, { pooling: config.pooling, normalize: config.normalize });
82
+
83
+ const dims = result.dims;
84
+ const elapsed = Date.now() - start;
85
+ const sample = Array.from(result.data).slice(0, 3).map((v: number) => v.toFixed(4));
86
+
87
+ const report = {
88
+ model: key,
89
+ hfId: config.id,
90
+ status: 'pass',
91
+ dims: dims,
92
+ expectedDims: config.dimensions,
93
+ dimsMatch: JSON.stringify(dims) === JSON.stringify([1, config.dimensions]),
94
+ pooling: config.pooling,
95
+ sampleEmbedding: sample,
96
+ loadTimeMs: elapsed,
97
+ timestamp: new Date().toISOString(),
98
+ };
99
+ console.log(JSON.stringify(report));
100
+
101
+ await extractor.dispose();
102
+ } catch (err: any) {
103
+ const report = {
104
+ model: key,
105
+ hfId: config.id,
106
+ status: 'fail',
107
+ error: err.message || String(err),
108
+ timestamp: new Date().toISOString(),
109
+ };
110
+ console.log(JSON.stringify(report));
111
+ process.exit(1);
112
+ }
113
+ " 2>"$RESULTS_DIR/smoke-${model_key}.log"); then
114
+ echo "$output" > "$result_file"
115
+ local dims load_time
116
+ dims=$(echo "$output" | bun -e "const d=JSON.parse(await Bun.stdin.text()); process.stdout.write(String(d.dims))")
117
+ load_time=$(echo "$output" | bun -e "const d=JSON.parse(await Bun.stdin.text()); process.stdout.write(String(d.loadTimeMs))")
118
+ echo "PASS (dims=${dims}, ${load_time}ms)"
119
+ return 0
120
+ else
121
+ echo "FAIL"
122
+ if [[ -f "$RESULTS_DIR/smoke-${model_key}.log" ]]; then
123
+ echo " Error: $(tail -3 "$RESULTS_DIR/smoke-${model_key}.log")"
124
+ fi
125
+ return 1
126
+ fi
127
+ }
128
+
129
+ if [[ "$BENCH_ONLY" == "false" ]]; then
130
+ echo "========================================"
131
+ echo "Phase 1: Smoke Tests"
132
+ echo "========================================"
133
+ echo ""
134
+
135
+ for model in "${CANDIDATES[@]}"; do
136
+ if smoke_test "$model"; then
137
+ SMOKE_PASSED+=("$model")
138
+ else
139
+ SMOKE_FAILED+=("$model")
140
+ fi
141
+ done
142
+
143
+ echo ""
144
+ echo "----------------------------------------"
145
+ echo "Smoke Test Summary"
146
+ echo "----------------------------------------"
147
+ echo "Passed: ${#SMOKE_PASSED[@]}/${#CANDIDATES[@]}"
148
+ for m in "${SMOKE_PASSED[@]}"; do echo " + $m"; done
149
+ if [[ ${#SMOKE_FAILED[@]} -gt 0 ]]; then
150
+ echo "Failed: ${#SMOKE_FAILED[@]}"
151
+ for m in "${SMOKE_FAILED[@]}"; do echo " - $m"; done
152
+ fi
153
+ echo ""
154
+
155
+ if [[ "$SMOKE_ONLY" == "true" ]]; then
156
+ echo "Done (--smoke-only). To benchmark passing models:"
157
+ echo " ./scripts/eval-candidates.sh --bench-only"
158
+ exit 0
159
+ fi
160
+ else
161
+ # bench-only mode: assume all candidates pass smoke
162
+ SMOKE_PASSED=("${CANDIDATES[@]}")
163
+ fi
164
+
165
+ # ============================================================
166
+ # Phase 2: Val Benchmarks
167
+ # ============================================================
168
+ if [[ ${#SMOKE_PASSED[@]} -eq 0 ]]; then
169
+ echo "No models passed smoke test. Nothing to benchmark."
170
+ exit 1
171
+ fi
172
+
173
+ echo "========================================"
174
+ echo "Phase 2: Val Benchmarks (real-v1-val)"
175
+ echo "========================================"
176
+ echo "Models to benchmark: ${#SMOKE_PASSED[@]}"
177
+ echo ""
178
+
179
+ BENCH_RESULTS=()
180
+
181
+ for model in "${SMOKE_PASSED[@]}"; do
182
+ echo "========================================"
183
+ echo "Benchmarking: $model"
184
+ echo "========================================"
185
+
186
+ local_artifact="$RESULTS_DIR/bench-${model}.json"
187
+
188
+ if BK_MODEL="$model" bun run bench:search \
189
+ --dataset real-v1-val \
190
+ --setup --force \
191
+ --artifacts "$local_artifact" 2>&1 | tee "$RESULTS_DIR/bench-${model}.log"; then
192
+ BENCH_RESULTS+=("$model")
193
+ echo ""
194
+ echo " -> Artifact: $local_artifact"
195
+ else
196
+ echo " -> BENCHMARK FAILED for $model"
197
+ fi
198
+ echo ""
199
+ done
200
+
201
+ # ============================================================
202
+ # Phase 3: Comparison Summary
203
+ # ============================================================
204
+ echo "========================================"
205
+ echo "COMPARISON SUMMARY"
206
+ echo "========================================"
207
+ echo ""
208
+
209
+ # Extract key metrics from each artifact
210
+ printf "%-35s %8s %8s %8s %8s %8s\n" "Model" "Hit@1" "MRR" "nDCG@10" "R@10" "P95ms"
211
+ printf "%-35s %8s %8s %8s %8s %8s\n" "---" "---" "---" "---" "---" "---"
212
+
213
+ for model in "${BENCH_RESULTS[@]}"; do
214
+ artifact="$RESULTS_DIR/bench-${model}.json"
215
+ if [[ -f "$artifact" ]]; then
216
+ bun -e "
217
+ const data = JSON.parse(await Bun.file('${artifact}').text());
218
+ const s = data.summary;
219
+ const name = '${model}'.padEnd(35);
220
+ const hit1 = (s.hitAt1 * 100).toFixed(1).padStart(7) + '%';
221
+ const mrr = s.mrr.toFixed(3).padStart(8);
222
+ const ndcg = (s.ndcgAt10 * 100).toFixed(1).padStart(7) + '%';
223
+ const r10 = (s.recallAt10 * 100).toFixed(1).padStart(7) + '%';
224
+ const p95 = s.latency.p95.toFixed(0).padStart(5) + 'ms';
225
+ console.log(name + ' ' + hit1 + ' ' + mrr + ' ' + ndcg + ' ' + r10 + ' ' + p95);
226
+ "
227
+ fi
228
+ done
229
+
230
+ # Also show champion baseline for reference
231
+ echo ""
232
+ echo "Champion baseline (bge-small-en-v1.5):"
233
+ echo " Hit@1=29.4% MRR=0.412 nDCG@10=46.0% R@10=45.1% P95=111ms"
234
+ echo ""
235
+ echo "Results saved to: $RESULTS_DIR/"
@@ -0,0 +1,17 @@
1
+ # Combining Workflows
2
+
3
+ Real-world usage often combines these patterns:
4
+
5
+ ```
6
+ User: "I need to understand how Express and Hono handle middleware differently"
7
+
8
+ 1. list_stores() → check if both indexed
9
+ 2. If not: create_store() for missing framework(s)
10
+ 3. check_job_status() → wait for indexing
11
+ 4. search("middleware implementation", stores=['express', 'hono'], detail='minimal')
12
+ 5. Review summaries, identify key files
13
+ 6. get_full_context() for 2-3 most relevant from each framework
14
+ 7. Compare implementations with full context
15
+ ```
16
+
17
+ This multi-step workflow is efficient, targeted, and conserves context.
@@ -0,0 +1,44 @@
1
+ # Error Recovery
2
+
3
+ When operations fail, use these recovery patterns:
4
+
5
+ ### Workflow: Handle Indexing Failures
6
+
7
+ ```
8
+ 1. create_store() fails or job_status shows 'failed'
9
+ → Check error message
10
+ → Common issues:
11
+ - Git auth required (private repo)
12
+ - Invalid URL/path
13
+ - Disk space
14
+ - Network timeout
15
+
16
+ 2. Recovery actions:
17
+ - Auth issue: Provide credentials or use HTTPS
18
+ - Invalid path: Verify URL/path exists
19
+ - Disk space: delete_store() unused stores
20
+ - Network: Retry with smaller repo or use --shallow
21
+
22
+ 3. Verify recovery:
23
+ list_stores() → Check store appeared
24
+ search(test_query, stores=[new_store]) → Verify searchable
25
+ ```
26
+
27
+ **Example:**
28
+
29
+ ```
30
+ create_store('https://github.com/private/repo', 'my-repo')
31
+ → job_id: 'job_xyz'
32
+
33
+ check_job_status('job_xyz')
34
+ → Status: failed
35
+ → Error: "Authentication required for private repository"
36
+
37
+ # Recovery: Use authenticated URL or SSH
38
+ create_store('git@github.com:private/repo.git', 'my-repo')
39
+ → job_id: 'job_xyz2'
40
+
41
+ check_job_status('job_xyz2')
42
+ → Status: completed
43
+ → Success!
44
+ ```
@@ -0,0 +1,48 @@
1
+ # Handling Large Result Sets
2
+
3
+ When initial search returns many results, use progressive detail to avoid context overload:
4
+
5
+ ### Workflow: Progressive Detail Strategy
6
+
7
+ ```
8
+ 1. search(query, detail='minimal', limit=20)
9
+ → Get summaries only (~100 tokens/result)
10
+ → Review all 20 summaries quickly
11
+
12
+ 2. Filter by relevance score:
13
+ - Score > 0.8: Excellent match
14
+ - Score 0.6-0.8: Good match
15
+ - Score < 0.6: Possibly irrelevant
16
+
17
+ 3. For top 3-5 results (score > 0.7):
18
+ get_full_context(selected_ids)
19
+ → Fetch complete code only for relevant items
20
+ → Saves ~80% context vs fetching all upfront
21
+
22
+ 4. If nothing relevant:
23
+ search(refined_query, detail='contextual', limit=10)
24
+ → Try different query with more context
25
+ → Or broaden/narrow the search
26
+ ```
27
+
28
+ **Example:**
29
+
30
+ ```
31
+ # Initial broad search
32
+ search("authentication middleware", detail='minimal', limit=20)
33
+ → 20 results, scores ranging 0.45-0.92
34
+ → Total context: ~2k tokens (minimal)
35
+
36
+ # Filter by score
37
+ Top results (>0.7):
38
+ - Result 3: auth/jwt.ts (score: 0.92)
39
+ - Result 7: middleware/authenticate.ts (score: 0.85)
40
+ - Result 12: auth/session.ts (score: 0.74)
41
+
42
+ # Get full code for top 3 only
43
+ get_full_context(['result_3', 'result_7', 'result_12'])
44
+ → Complete implementations for relevant files only
45
+ → Context: ~3k tokens (vs ~15k if we fetched all 20)
46
+
47
+ # Found what we needed! If not, would refine query and retry.
48
+ ```
@@ -0,0 +1,42 @@
1
+ # Multi-Store Search with Ranking
2
+
3
+ When searching across multiple stores, use ranking to prioritize results:
4
+
5
+ ### Workflow: Cross-Library Search
6
+
7
+ ```
8
+ 1. search(query, limit=10)
9
+ → Searches ALL stores
10
+ → Returns mixed results ranked by relevance
11
+
12
+ 2. Review store distribution:
13
+ - If dominated by one store: might narrow to specific stores
14
+ - If balanced: good cross-library perspective
15
+
16
+ 3. For specific library focus:
17
+ search(query, stores=['lib1', 'lib2'], limit=15)
18
+ → Search only relevant libraries
19
+ → Get more results from target libraries
20
+ ```
21
+
22
+ **Example:**
23
+
24
+ User: "How do different frameworks handle routing?"
25
+
26
+ ```
27
+ # Search all indexed frameworks
28
+ search("routing implementation", intent='find-implementation', limit=15)
29
+ → Result mix:
30
+ - express (score: 0.91)
31
+ - fastapi (score: 0.89)
32
+ - hono (score: 0.87)
33
+ - vue-router (score: 0.82)
34
+ - ...
35
+
36
+ # All stores represented, good comparative view!
37
+
38
+ # If user wants deeper FastAPI focus:
39
+ search("routing implementation", stores=['fastapi', 'starlette'], limit=20)
40
+ → More FastAPI/Starlette-specific results
41
+ → Deeper exploration of Python framework routing
42
+ ```
@@ -0,0 +1,75 @@
1
+ ---
2
+ description: Add bluera-knowledge status indicator to the statusline
3
+ allowed-tools: [Read, Edit, Write, Bash]
4
+ ---
5
+
6
+ # Bluera Knowledge Statusline
7
+
8
+ Add a 📘 blue book icon with MCP connectivity LED to the Claude Code statusline.
9
+
10
+ ## What it shows
11
+
12
+ - `📘●` (green) — MCP server process is running
13
+ - `📘●` (red) — MCP server not detected
14
+
15
+ ## Instructions
16
+
17
+ ### 1. Check if already installed
18
+
19
+ ```bash
20
+ grep -c "# --- bluera-knowledge ---" ~/.claude/statusline.sh 2>/dev/null
21
+ ```
22
+
23
+ **If the count is >= 1: already installed.** Tell the user it's already present and stop. Do NOT inject again.
24
+
25
+ **If 0 or file missing:** proceed to install.
26
+
27
+ ### 2. Read the current statusline
28
+
29
+ ```bash
30
+ cat ~/.claude/statusline.sh
31
+ ```
32
+
33
+ If the file doesn't exist, create a minimal statusline with just the BK module.
34
+
35
+ ### 3. Inject the module
36
+
37
+ Read the module from the plugin:
38
+
39
+ ```bash
40
+ cat "${CLAUDE_PLUGIN_ROOT:-.}/scripts/statusline-module.sh"
41
+ ```
42
+
43
+ Insert the block between `# --- bluera-knowledge ---` and `# --- end bluera-knowledge ---` into `~/.claude/statusline.sh`:
44
+
45
+ - Place the function **before** the final output `printf`/`echo` statement(s)
46
+ - Place it **after** other module functions (like `get_bluera_status`, `get_project_type`, etc.)
47
+ - The leading space in the printf output is intentional — it separates from the previous badge
48
+
49
+ ### 4. Wire into the output
50
+
51
+ Find the output `printf` lines (there are typically 3 — one per context color threshold). Add `%s` and `"$BK_STATUS"` to each, positioned **after** `"$BLUERA_STATUS"` and **before** `"$GIT_INFO"`.
52
+
53
+ For example, if the current format is:
54
+ ```bash
55
+ printf "... %s%s%s ..." "$PROJECT_TYPE" "$BLUERA_STATUS" "$GIT_INFO" ...
56
+ ```
57
+
58
+ Change to:
59
+ ```bash
60
+ printf "... %s%s%s%s ..." "$PROJECT_TYPE" "$BLUERA_STATUS" "$BK_STATUS" "$GIT_INFO" ...
61
+ ```
62
+
63
+ **Important:** Add exactly one `%s` to each format string AND one `"$BK_STATUS"` to each argument list. Count the format specifiers vs arguments to ensure they match.
64
+
65
+ ### 5. Verify
66
+
67
+ ```bash
68
+ bash -n ~/.claude/statusline.sh && echo "Syntax OK"
69
+ ```
70
+
71
+ ### 6. Edge cases
72
+
73
+ - **No statusline.sh exists**: Create a minimal one that reads stdin, runs `get_bk_status`, and echoes the result
74
+ - **Non-bluera preset**: Find the output `echo`/`printf` and append `$BK_STATUS` to it
75
+ - **No `$BLUERA_STATUS` in output**: Place `$BK_STATUS` at the end of the output, before any separators
@@ -0,0 +1,80 @@
1
+ # Handling Indexing Failures
2
+
3
+ ### Common Failure Scenarios
4
+
5
+ **1. Authentication Required (Private Repos)**
6
+ ```
7
+ Error: "Authentication required"
8
+
9
+ Fix options:
10
+ - Use SSH URL: git@github.com:org/repo.git
11
+ - Use HTTPS with token: https://token@github.com/org/repo.git
12
+ - Make repo public (if appropriate)
13
+ ```
14
+
15
+ **2. Invalid URL/Path**
16
+ ```
17
+ Error: "Repository not found" or "Path does not exist"
18
+
19
+ Fix:
20
+ - Verify URL is correct (typos common!)
21
+ - Check path exists and is accessible
22
+ - Ensure network connectivity
23
+ ```
24
+
25
+ **3. Disk Space**
26
+ ```
27
+ Error: "No space left on device"
28
+
29
+ Fix:
30
+ - Check available space: df -h
31
+ - Delete unused stores: delete_store(old_store)
32
+ - Clear .bluera/bluera-knowledge/repos/ manually if needed
33
+ ```
34
+
35
+ **4. Network Timeout**
36
+ ```
37
+ Error: "Connection timeout" or "Failed to fetch"
38
+
39
+ Fix:
40
+ - Retry after checking network
41
+ - Use --shallow for large repos
42
+ - Clone manually then add-folder
43
+ ```
44
+
45
+ **5. Unsupported File Types**
46
+ ```
47
+ Warning: "Skipped 45 binary files"
48
+
49
+ This is normal!
50
+ - Binary files (images, compiled code) are skipped
51
+ - Only text files are indexed
52
+ - Check indexed count vs total to see ratio
53
+ ```
54
+
55
+ ### Recovery Workflow
56
+
57
+ ```
58
+ 1. Attempt fails:
59
+ create_store(url, name) → job fails
60
+
61
+ 2. Check error:
62
+ job_status = check_job_status(job_id)
63
+ error_msg = job_status['error']
64
+
65
+ 3. Determine fix based on error type (see above)
66
+
67
+ 4. Retry with fix:
68
+ create_store(corrected_url, name)
69
+
70
+ 5. Verify success:
71
+ check_job_status(new_job_id)
72
+ → Status: completed
73
+
74
+ list_stores()
75
+ → Store appears in list
76
+
77
+ 6. Test search:
78
+ search(test_query, stores=[name], limit=3)
79
+ → Returns results: Ready to use!
80
+ ```
@@ -0,0 +1,67 @@
1
+ # Indexing Strategies
2
+
3
+ ### Initial Indexing
4
+
5
+ When creating a store, indexing happens automatically in the background:
6
+
7
+ ```
8
+ create_store(url, name)
9
+ → Returns: job_id
10
+ → Background: clone/download → analyze → index
11
+ → Status: pending → running → completed
12
+
13
+ # Monitor progress
14
+ check_job_status(job_id)
15
+ → Progress: 45% (processing src/core.ts)
16
+ → Estimated: ~2 minutes remaining
17
+ ```
18
+
19
+ **Indexing time estimates:**
20
+ - Small library (<1k files): 30-60 seconds
21
+ - Medium library (1k-5k files): 1-3 minutes
22
+ - Large library (>5k files): 3-10 minutes
23
+ - Documentation crawl (100 pages): 1-2 minutes
24
+
25
+ ### Re-indexing (Updates)
26
+
27
+ When library code changes or you modify indexed content:
28
+
29
+ ```
30
+ # For git repos: pull latest changes
31
+ cd .bluera/bluera-knowledge/repos/vue
32
+ git pull origin main
33
+ cd -
34
+
35
+ # Re-index
36
+ /bluera-knowledge:index vue
37
+
38
+ # Or via MCP:
39
+ index_store(store='vue')
40
+ → Re-processes all files
41
+ → Updates vector embeddings
42
+ → Rebuilds search index
43
+ ```
44
+
45
+ **When to re-index:**
46
+ - Library released new version
47
+ - You modified local folder content
48
+ - Search results seem outdated
49
+ - After significant codebase changes
50
+
51
+ **Re-indexing is incremental** - only changed files are re-processed.
52
+
53
+ ### Selective Indexing
54
+
55
+ For large repos, you might want to index specific directories:
56
+
57
+ ```
58
+ # Clone full repo manually
59
+ git clone https://github.com/microsoft/vscode
60
+ cd vscode
61
+
62
+ # Index only specific dirs
63
+ /bluera-knowledge:add-folder ./src/vs/editor --name=vscode-editor
64
+ /bluera-knowledge:add-folder ./src/vs/workbench --name=vscode-workbench
65
+
66
+ # Result: Multiple focused stores instead of one massive store
67
+ ```
@@ -0,0 +1,72 @@
1
+ # Background Job Monitoring
2
+
3
+ All expensive operations run as background jobs: cloning, indexing, crawling.
4
+
5
+ ### Job Lifecycle
6
+
7
+ ```
8
+ 1. create_store() or index_store() → Returns job_id
9
+
10
+ 2. Job states:
11
+ - pending: In queue, not started
12
+ - running: Actively processing
13
+ - completed: Finished successfully
14
+ - failed: Error occurred
15
+
16
+ 3. Monitor progress:
17
+ check_job_status(job_id)
18
+ → Current state, percentage, current file
19
+
20
+ 4. List all jobs:
21
+ list_jobs()
22
+ → See pending, running, completed jobs
23
+
24
+ 5. Cancel if needed:
25
+ cancel_job(job_id)
26
+ → Stops running job, cleans up
27
+ ```
28
+
29
+ ### Best Practices for Job Monitoring
30
+
31
+ **Do poll, but not too frequently:**
32
+ ```
33
+ # Too frequent - wastes resources
34
+ while status != 'completed':
35
+ check_job_status(job_id) # Every second!
36
+ sleep(1)
37
+
38
+ # Reasonable polling interval
39
+ while status != 'completed':
40
+ check_job_status(job_id)
41
+ sleep(15) # Every 15 seconds is fine
42
+ ```
43
+
44
+ **Do handle failures gracefully:**
45
+ ```
46
+ status = check_job_status(job_id)
47
+
48
+ if status['state'] == 'failed':
49
+ error = status['error']
50
+
51
+ if 'auth' in error.lower():
52
+ print("Authentication required - try SSH URL or provide credentials")
53
+ elif 'not found' in error.lower():
54
+ print("Repository/URL not found - check the source")
55
+ elif 'disk' in error.lower():
56
+ print("Disk space issue - delete unused stores")
57
+ else:
58
+ print(f"Unexpected error: {error}")
59
+ ```
60
+
61
+ **Do list jobs to avoid duplicates:**
62
+ ```
63
+ # Before creating new store
64
+ jobs = list_jobs()
65
+ existing = [j for j in jobs if j['store'] == 'vue' and j['state'] in ['pending', 'running']]
66
+
67
+ if existing:
68
+ print(f"Job already running for 'vue': {existing[0]['id']}")
69
+ # Wait for it instead of creating duplicate
70
+ else:
71
+ create_store(...)
72
+ ```