bluera-knowledge 0.35.0 → 0.37.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +32 -0
- package/README.md +8 -20
- package/bun.lock +27 -0
- package/dist/{chunk-L2SC6J4K.js → chunk-724FNI27.js} +466 -171
- package/dist/chunk-724FNI27.js.map +1 -0
- package/dist/{chunk-DNGE7FZ4.js → chunk-AO45YFHO.js} +1386 -42
- package/dist/chunk-AO45YFHO.js.map +1 -0
- package/dist/{chunk-MQQ46BST.js → chunk-F6DGSS2N.js} +2 -2
- package/dist/index.js +72 -5
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.d.ts +37 -3
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +2 -2
- package/hooks/check-ready.sh +17 -7
- package/hooks/hooks.json +17 -1
- package/hooks/lib/store_summary.py +111 -0
- package/hooks/posttooluse-bk-reminder.py +33 -6
- package/hooks/stop-bk-check.py +86 -0
- package/hooks/userpromptsubmit-bk-nudge.py +156 -0
- package/package.json +3 -1
- package/scripts/auto-setup.sh +11 -3
- package/scripts/eval-candidates.sh +235 -0
- package/skills/advanced-workflows/references/combining-workflows.md +17 -0
- package/skills/advanced-workflows/references/error-recovery.md +44 -0
- package/skills/advanced-workflows/references/handling-large-results.md +48 -0
- package/skills/advanced-workflows/references/multi-store-search.md +42 -0
- package/skills/knowledge-search/SKILL.md +1 -1
- package/skills/search/statusline.md +75 -0
- package/skills/store-lifecycle/references/failure-recovery.md +80 -0
- package/skills/store-lifecycle/references/indexing-strategies.md +67 -0
- package/skills/store-lifecycle/references/job-monitoring.md +72 -0
- package/skills/store-lifecycle/references/lifecycle-checklist.md +20 -0
- package/skills/store-lifecycle/references/storage-management.md +43 -0
- package/skills/suggest/SKILL.md +13 -6
- package/dist/chunk-DNGE7FZ4.js.map +0 -1
- package/dist/chunk-L2SC6J4K.js.map +0 -1
- /package/dist/{chunk-MQQ46BST.js.map → chunk-F6DGSS2N.js.map} +0 -0
|
@@ -2072,40 +2072,6 @@ import { readFile as readFile2, access } from "fs/promises";
|
|
|
2072
2072
|
import { homedir as homedir2 } from "os";
|
|
2073
2073
|
import { isAbsolute, join as join6, resolve } from "path";
|
|
2074
2074
|
|
|
2075
|
-
// src/services/reranker-env.ts
|
|
2076
|
-
var logger = createLogger("reranker-env");
|
|
2077
|
-
function parseRerankerEnvOverrides(strict) {
|
|
2078
|
-
return {
|
|
2079
|
-
enabled: parseEnabled(process.env["BK_RERANKER_ENABLED"], strict),
|
|
2080
|
-
topK: parseTopK(process.env["BK_RERANKER_TOPK"], strict)
|
|
2081
|
-
};
|
|
2082
|
-
}
|
|
2083
|
-
function parseEnabled(raw, strict) {
|
|
2084
|
-
if (raw === void 0 || raw === "") return void 0;
|
|
2085
|
-
if (raw === "1") return true;
|
|
2086
|
-
if (raw === "0") return false;
|
|
2087
|
-
const msg = `BK_RERANKER_ENABLED must be '0' or '1', got: "${raw}"`;
|
|
2088
|
-
if (strict) throw new Error(msg);
|
|
2089
|
-
logger.warn(msg);
|
|
2090
|
-
return void 0;
|
|
2091
|
-
}
|
|
2092
|
-
function parseTopK(raw, strict) {
|
|
2093
|
-
if (raw === void 0 || raw === "") return void 0;
|
|
2094
|
-
const parsed = Number.parseInt(raw, 10);
|
|
2095
|
-
if (Number.isNaN(parsed) || parsed < 1) {
|
|
2096
|
-
const msg = `BK_RERANKER_TOPK must be a positive integer, got: "${raw}"`;
|
|
2097
|
-
if (strict) throw new Error(msg);
|
|
2098
|
-
logger.warn(msg);
|
|
2099
|
-
return void 0;
|
|
2100
|
-
}
|
|
2101
|
-
return parsed;
|
|
2102
|
-
}
|
|
2103
|
-
|
|
2104
|
-
// src/db/embeddings.ts
|
|
2105
|
-
import { homedir } from "os";
|
|
2106
|
-
import { join as join5 } from "path";
|
|
2107
|
-
import { pipeline, env } from "@huggingface/transformers";
|
|
2108
|
-
|
|
2109
2075
|
// src/models/registry.ts
|
|
2110
2076
|
var MODEL_REGISTRY = {
|
|
2111
2077
|
// ============================================================
|
|
@@ -2362,7 +2328,7 @@ var MODEL_REGISTRY = {
|
|
|
2362
2328
|
normalize: true,
|
|
2363
2329
|
queryPrefix: "",
|
|
2364
2330
|
docPrefix: "",
|
|
2365
|
-
category: "
|
|
2331
|
+
category: "jina",
|
|
2366
2332
|
sizeCategory: "small",
|
|
2367
2333
|
notes: "8192 token context. Good for long documents."
|
|
2368
2334
|
},
|
|
@@ -2374,12 +2340,102 @@ var MODEL_REGISTRY = {
|
|
|
2374
2340
|
normalize: true,
|
|
2375
2341
|
queryPrefix: "",
|
|
2376
2342
|
docPrefix: "",
|
|
2377
|
-
category: "
|
|
2343
|
+
category: "jina",
|
|
2378
2344
|
sizeCategory: "base",
|
|
2379
2345
|
notes: "8192 token context. Larger Jina variant."
|
|
2346
|
+
},
|
|
2347
|
+
"jina-embeddings-v2-base-code": {
|
|
2348
|
+
id: "jinaai/jina-embeddings-v2-base-code",
|
|
2349
|
+
name: "Jina Embeddings v2 Base Code",
|
|
2350
|
+
dimensions: 768,
|
|
2351
|
+
pooling: "mean",
|
|
2352
|
+
normalize: true,
|
|
2353
|
+
queryPrefix: "",
|
|
2354
|
+
docPrefix: "",
|
|
2355
|
+
category: "jina",
|
|
2356
|
+
sizeCategory: "base",
|
|
2357
|
+
notes: "161M params. Code-specific (150M+ code QA pairs, 30+ langs). 8K context. May need trust_remote_code."
|
|
2358
|
+
},
|
|
2359
|
+
// ============================================================
|
|
2360
|
+
// Snowflake Arctic Embed - Retrieval-optimized
|
|
2361
|
+
// ============================================================
|
|
2362
|
+
"snowflake-arctic-embed-xs": {
|
|
2363
|
+
id: "Snowflake/snowflake-arctic-embed-xs",
|
|
2364
|
+
name: "Snowflake Arctic Embed XS",
|
|
2365
|
+
dimensions: 384,
|
|
2366
|
+
pooling: "cls",
|
|
2367
|
+
normalize: true,
|
|
2368
|
+
queryPrefix: "Represent this sentence for searching relevant passages: ",
|
|
2369
|
+
docPrefix: "",
|
|
2370
|
+
category: "snowflake",
|
|
2371
|
+
sizeCategory: "small",
|
|
2372
|
+
notes: "22M params. Ultra-small retrieval model. Based on all-MiniLM-L6-v2."
|
|
2373
|
+
},
|
|
2374
|
+
"snowflake-arctic-embed-s": {
|
|
2375
|
+
id: "Snowflake/snowflake-arctic-embed-s",
|
|
2376
|
+
name: "Snowflake Arctic Embed S",
|
|
2377
|
+
dimensions: 384,
|
|
2378
|
+
pooling: "cls",
|
|
2379
|
+
normalize: true,
|
|
2380
|
+
queryPrefix: "Represent this sentence for searching relevant passages: ",
|
|
2381
|
+
docPrefix: "",
|
|
2382
|
+
category: "snowflake",
|
|
2383
|
+
sizeCategory: "small",
|
|
2384
|
+
notes: "33M params. Same size as bge-small, trained for retrieval. Drop-in candidate."
|
|
2385
|
+
},
|
|
2386
|
+
"snowflake-arctic-embed-m-v1.5": {
|
|
2387
|
+
id: "Snowflake/snowflake-arctic-embed-m-v1.5",
|
|
2388
|
+
name: "Snowflake Arctic Embed M v1.5",
|
|
2389
|
+
dimensions: 768,
|
|
2390
|
+
pooling: "cls",
|
|
2391
|
+
normalize: true,
|
|
2392
|
+
queryPrefix: "Represent this sentence for searching relevant passages: ",
|
|
2393
|
+
docPrefix: "",
|
|
2394
|
+
category: "snowflake",
|
|
2395
|
+
sizeCategory: "base",
|
|
2396
|
+
notes: "109M params. BEIR 55.14. Matryoshka (truncate to 256d). 7 ONNX quant variants."
|
|
2397
|
+
},
|
|
2398
|
+
"snowflake-arctic-embed-m-v2.0": {
|
|
2399
|
+
id: "Snowflake/snowflake-arctic-embed-m-v2.0",
|
|
2400
|
+
name: "Snowflake Arctic Embed M v2.0",
|
|
2401
|
+
dimensions: 768,
|
|
2402
|
+
pooling: "cls",
|
|
2403
|
+
normalize: true,
|
|
2404
|
+
queryPrefix: "query: ",
|
|
2405
|
+
docPrefix: "",
|
|
2406
|
+
category: "snowflake",
|
|
2407
|
+
sizeCategory: "base",
|
|
2408
|
+
notes: "305M params. Multilingual, 8K context. Custom GTE arch \u2014 may need trust_remote_code."
|
|
2409
|
+
},
|
|
2410
|
+
// ============================================================
|
|
2411
|
+
// ModernBERT Embedding Models - Latest architecture (2024+)
|
|
2412
|
+
// ============================================================
|
|
2413
|
+
"gte-modernbert-base": {
|
|
2414
|
+
id: "Alibaba-NLP/gte-modernbert-base",
|
|
2415
|
+
name: "GTE ModernBERT Base",
|
|
2416
|
+
dimensions: 768,
|
|
2417
|
+
pooling: "cls",
|
|
2418
|
+
normalize: true,
|
|
2419
|
+
queryPrefix: "",
|
|
2420
|
+
docPrefix: "",
|
|
2421
|
+
category: "gte",
|
|
2422
|
+
sizeCategory: "base",
|
|
2423
|
+
notes: "149M params. CoIR code retrieval 79.31. BEIR 55.33. 8K context. No trust_remote_code. Top candidate."
|
|
2424
|
+
},
|
|
2425
|
+
"modernbert-embed-base": {
|
|
2426
|
+
id: "nomic-ai/modernbert-embed-base",
|
|
2427
|
+
name: "ModernBERT Embed Base (Nomic)",
|
|
2428
|
+
dimensions: 768,
|
|
2429
|
+
pooling: "mean",
|
|
2430
|
+
normalize: true,
|
|
2431
|
+
queryPrefix: "search_query: ",
|
|
2432
|
+
docPrefix: "search_document: ",
|
|
2433
|
+
category: "nomic",
|
|
2434
|
+
sizeCategory: "base",
|
|
2435
|
+
notes: "149M params. BEIR 52.89. Matryoshka (truncate to 256d). 8K context."
|
|
2380
2436
|
}
|
|
2381
2437
|
};
|
|
2382
|
-
var DEFAULT_MODEL_ID = "
|
|
2438
|
+
var DEFAULT_MODEL_ID = "snowflake-arctic-embed-s";
|
|
2383
2439
|
function getModelConfig(modelId) {
|
|
2384
2440
|
if (modelId in MODEL_REGISTRY) {
|
|
2385
2441
|
return MODEL_REGISTRY[modelId];
|
|
@@ -2403,8 +2459,75 @@ function getConfiguredModelId() {
|
|
|
2403
2459
|
}
|
|
2404
2460
|
return DEFAULT_MODEL_ID;
|
|
2405
2461
|
}
|
|
2462
|
+
var RERANKER_REGISTRY = {
|
|
2463
|
+
"ms-marco-MiniLM-L-6-v2": {
|
|
2464
|
+
id: "Xenova/ms-marco-MiniLM-L-6-v2",
|
|
2465
|
+
name: "MS MARCO MiniLM L6 v2",
|
|
2466
|
+
notes: "Default reranker. Fast cross-encoder for passage ranking."
|
|
2467
|
+
},
|
|
2468
|
+
"ms-marco-MiniLM-L-12-v2": {
|
|
2469
|
+
id: "Xenova/ms-marco-MiniLM-L-12-v2",
|
|
2470
|
+
name: "MS MARCO MiniLM L12 v2",
|
|
2471
|
+
notes: "Deeper reranker. Better quality but slower."
|
|
2472
|
+
},
|
|
2473
|
+
"bge-reranker-base": {
|
|
2474
|
+
id: "Xenova/bge-reranker-base",
|
|
2475
|
+
name: "BGE Reranker Base",
|
|
2476
|
+
notes: "BGE-family cross-encoder. Better fit for BGE embeddings."
|
|
2477
|
+
},
|
|
2478
|
+
"bge-reranker-large": {
|
|
2479
|
+
id: "Xenova/bge-reranker-large",
|
|
2480
|
+
name: "BGE Reranker Large",
|
|
2481
|
+
notes: "Larger BGE cross-encoder. Higher quality, slower."
|
|
2482
|
+
}
|
|
2483
|
+
};
|
|
2484
|
+
|
|
2485
|
+
// src/services/reranker-env.ts
|
|
2486
|
+
var logger = createLogger("reranker-env");
|
|
2487
|
+
function parseRerankerEnvOverrides(strict) {
|
|
2488
|
+
return {
|
|
2489
|
+
enabled: parseEnabled(process.env["BK_RERANKER_ENABLED"], strict),
|
|
2490
|
+
topK: parseTopK(process.env["BK_RERANKER_TOPK"], strict),
|
|
2491
|
+
model: parseModel(process.env["BK_RERANKER_MODEL"], strict)
|
|
2492
|
+
};
|
|
2493
|
+
}
|
|
2494
|
+
function parseEnabled(raw, strict) {
|
|
2495
|
+
if (raw === void 0 || raw === "") return void 0;
|
|
2496
|
+
if (raw === "1") return true;
|
|
2497
|
+
if (raw === "0") return false;
|
|
2498
|
+
const msg = `BK_RERANKER_ENABLED must be '0' or '1', got: "${raw}"`;
|
|
2499
|
+
if (strict) throw new Error(msg);
|
|
2500
|
+
logger.warn(msg);
|
|
2501
|
+
return void 0;
|
|
2502
|
+
}
|
|
2503
|
+
function parseTopK(raw, strict) {
|
|
2504
|
+
if (raw === void 0 || raw === "") return void 0;
|
|
2505
|
+
const parsed = Number.parseInt(raw, 10);
|
|
2506
|
+
if (Number.isNaN(parsed) || parsed < 1) {
|
|
2507
|
+
const msg = `BK_RERANKER_TOPK must be a positive integer, got: "${raw}"`;
|
|
2508
|
+
if (strict) throw new Error(msg);
|
|
2509
|
+
logger.warn(msg);
|
|
2510
|
+
return void 0;
|
|
2511
|
+
}
|
|
2512
|
+
return parsed;
|
|
2513
|
+
}
|
|
2514
|
+
function parseModel(raw, strict) {
|
|
2515
|
+
if (raw === void 0 || raw === "") return void 0;
|
|
2516
|
+
const entry = RERANKER_REGISTRY[raw];
|
|
2517
|
+
if (entry === void 0) {
|
|
2518
|
+
const valid = Object.keys(RERANKER_REGISTRY).join(", ");
|
|
2519
|
+
const msg = `BK_RERANKER_MODEL must be one of [${valid}], got: "${raw}"`;
|
|
2520
|
+
if (strict) throw new Error(msg);
|
|
2521
|
+
logger.warn(msg);
|
|
2522
|
+
return void 0;
|
|
2523
|
+
}
|
|
2524
|
+
return entry.id;
|
|
2525
|
+
}
|
|
2406
2526
|
|
|
2407
2527
|
// src/db/embeddings.ts
|
|
2528
|
+
import { homedir } from "os";
|
|
2529
|
+
import { join as join5 } from "path";
|
|
2530
|
+
import { pipeline, env } from "@huggingface/transformers";
|
|
2408
2531
|
env.cacheDir = join5(homedir(), ".cache", "huggingface-transformers");
|
|
2409
2532
|
function getFinetunedModelPath() {
|
|
2410
2533
|
const path4 = process.env["BK_FINETUNED_MODEL"];
|
|
@@ -2419,14 +2542,16 @@ function buildEmbeddingConfig(modelId, overrides) {
|
|
|
2419
2542
|
const envQueryPrefix = process.env["BK_QUERY_PREFIX"];
|
|
2420
2543
|
const modelConfig = getModelConfig(modelId);
|
|
2421
2544
|
if (modelConfig === void 0) {
|
|
2545
|
+
const baseModelId = process.env["BK_BASE_MODEL"];
|
|
2546
|
+
const baseConfig = baseModelId !== void 0 && baseModelId !== "" ? getModelConfig(baseModelId) : void 0;
|
|
2422
2547
|
return {
|
|
2423
2548
|
model: modelId,
|
|
2424
2549
|
batchSize: overrides?.batchSize ?? 32,
|
|
2425
2550
|
dtype: overrides?.dtype ?? "fp32",
|
|
2426
|
-
pooling: overrides?.pooling ?? envPooling ?? "mean",
|
|
2427
|
-
normalize: overrides?.normalize ?? true,
|
|
2428
|
-
queryPrefix: overrides?.queryPrefix ?? envQueryPrefix ?? "",
|
|
2429
|
-
docPrefix: overrides?.docPrefix ?? "",
|
|
2551
|
+
pooling: overrides?.pooling ?? envPooling ?? baseConfig?.pooling ?? "mean",
|
|
2552
|
+
normalize: overrides?.normalize ?? baseConfig?.normalize ?? true,
|
|
2553
|
+
queryPrefix: overrides?.queryPrefix ?? envQueryPrefix ?? baseConfig?.queryPrefix ?? "",
|
|
2554
|
+
docPrefix: overrides?.docPrefix ?? baseConfig?.docPrefix ?? "",
|
|
2430
2555
|
maxInFlightBatches: overrides?.maxInFlightBatches ?? 1
|
|
2431
2556
|
};
|
|
2432
2557
|
}
|
|
@@ -2670,10 +2795,10 @@ var DEFAULT_CONFIG = {
|
|
|
2670
2795
|
version: 1,
|
|
2671
2796
|
dataDir: ".bluera/bluera-knowledge/data",
|
|
2672
2797
|
embedding: {
|
|
2673
|
-
model: "
|
|
2798
|
+
model: "Snowflake/snowflake-arctic-embed-s",
|
|
2674
2799
|
batchSize: 32,
|
|
2675
2800
|
dtype: "fp32",
|
|
2676
|
-
pooling: "
|
|
2801
|
+
pooling: "cls",
|
|
2677
2802
|
normalize: true,
|
|
2678
2803
|
queryPrefix: "Represent this sentence for searching relevant passages: ",
|
|
2679
2804
|
docPrefix: "",
|
|
@@ -2824,13 +2949,14 @@ var ConfigService = class {
|
|
|
2824
2949
|
};
|
|
2825
2950
|
}
|
|
2826
2951
|
const rerankerOverrides = parseRerankerEnvOverrides(false);
|
|
2827
|
-
if (rerankerOverrides.enabled !== void 0 || rerankerOverrides.topK !== void 0) {
|
|
2952
|
+
if (rerankerOverrides.enabled !== void 0 || rerankerOverrides.topK !== void 0 || rerankerOverrides.model !== void 0) {
|
|
2828
2953
|
this.config = {
|
|
2829
2954
|
...this.config,
|
|
2830
2955
|
reranker: {
|
|
2831
2956
|
...this.config.reranker,
|
|
2832
2957
|
...rerankerOverrides.enabled !== void 0 ? { enabled: rerankerOverrides.enabled } : {},
|
|
2833
|
-
...rerankerOverrides.topK !== void 0 ? { topK: rerankerOverrides.topK } : {}
|
|
2958
|
+
...rerankerOverrides.topK !== void 0 ? { topK: rerankerOverrides.topK } : {},
|
|
2959
|
+
...rerankerOverrides.model !== void 0 ? { model: rerankerOverrides.model } : {}
|
|
2834
2960
|
}
|
|
2835
2961
|
};
|
|
2836
2962
|
}
|
|
@@ -3361,11 +3487,15 @@ var DriftService = class {
|
|
|
3361
3487
|
}
|
|
3362
3488
|
};
|
|
3363
3489
|
|
|
3364
|
-
// src/
|
|
3490
|
+
// src/services/index.service.ts
|
|
3491
|
+
var minimatch = minimatchFn;
|
|
3492
|
+
var execFileAsync = promisify(execFile);
|
|
3493
|
+
var logger2 = createLogger("index-service");
|
|
3365
3494
|
var TEXT_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
3366
3495
|
// Text/docs
|
|
3367
3496
|
".txt",
|
|
3368
3497
|
".md",
|
|
3498
|
+
".mdx",
|
|
3369
3499
|
".rst",
|
|
3370
3500
|
".adoc",
|
|
3371
3501
|
// JavaScript/TypeScript
|
|
@@ -3471,11 +3601,33 @@ var TEXT_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
3471
3601
|
".makefile",
|
|
3472
3602
|
".cmake"
|
|
3473
3603
|
]);
|
|
3474
|
-
|
|
3475
|
-
//
|
|
3476
|
-
|
|
3477
|
-
|
|
3478
|
-
|
|
3604
|
+
var TEXT_BASENAMES = /* @__PURE__ */ new Set([
|
|
3605
|
+
// Build systems
|
|
3606
|
+
"Dockerfile",
|
|
3607
|
+
"Makefile",
|
|
3608
|
+
"Rakefile",
|
|
3609
|
+
"Gemfile",
|
|
3610
|
+
"Procfile",
|
|
3611
|
+
"Vagrantfile",
|
|
3612
|
+
"Justfile",
|
|
3613
|
+
"Brewfile",
|
|
3614
|
+
"Earthfile",
|
|
3615
|
+
"Tiltfile",
|
|
3616
|
+
"CMakeLists.txt",
|
|
3617
|
+
"BUILD",
|
|
3618
|
+
"BUILD.bazel",
|
|
3619
|
+
"WORKSPACE",
|
|
3620
|
+
// Dotfiles (config)
|
|
3621
|
+
".gitignore",
|
|
3622
|
+
".gitattributes",
|
|
3623
|
+
".editorconfig",
|
|
3624
|
+
".dockerignore",
|
|
3625
|
+
".eslintignore",
|
|
3626
|
+
".prettierignore",
|
|
3627
|
+
".npmignore",
|
|
3628
|
+
".env.example",
|
|
3629
|
+
".env.sample"
|
|
3630
|
+
]);
|
|
3479
3631
|
function isMinifiedFile(filename) {
|
|
3480
3632
|
const minPatterns = [
|
|
3481
3633
|
/\.min\.(js|css|mjs|cjs)$/i,
|
|
@@ -3542,6 +3694,7 @@ var EXT_TO_LANGUAGE = {
|
|
|
3542
3694
|
".psm1": "powershell",
|
|
3543
3695
|
".sql": "sql",
|
|
3544
3696
|
".md": "markdown",
|
|
3697
|
+
".mdx": "mdx",
|
|
3545
3698
|
".rst": "restructuredtext",
|
|
3546
3699
|
".lua": "lua",
|
|
3547
3700
|
".r": "r",
|
|
@@ -3581,6 +3734,19 @@ var EXT_TO_LANGUAGE = {
|
|
|
3581
3734
|
".toml": "toml",
|
|
3582
3735
|
".xml": "xml"
|
|
3583
3736
|
};
|
|
3737
|
+
var BASENAME_TO_LANGUAGE = {
|
|
3738
|
+
Dockerfile: "dockerfile",
|
|
3739
|
+
Makefile: "makefile",
|
|
3740
|
+
Rakefile: "ruby",
|
|
3741
|
+
Gemfile: "ruby",
|
|
3742
|
+
Brewfile: "ruby",
|
|
3743
|
+
"CMakeLists.txt": "cmake",
|
|
3744
|
+
BUILD: "starlark",
|
|
3745
|
+
"BUILD.bazel": "starlark",
|
|
3746
|
+
WORKSPACE: "starlark",
|
|
3747
|
+
Tiltfile: "starlark",
|
|
3748
|
+
Justfile: "just"
|
|
3749
|
+
};
|
|
3584
3750
|
var ENTRY_POINT_NAMES = /* @__PURE__ */ new Set([
|
|
3585
3751
|
"index.ts",
|
|
3586
3752
|
"index.js",
|
|
@@ -3969,7 +4135,7 @@ ${rawContent}` : rawContent;
|
|
|
3969
4135
|
const ext = extname(filePath).toLowerCase();
|
|
3970
4136
|
const fileName = basename(filePath).toLowerCase();
|
|
3971
4137
|
const fileType = this.classifyFileType(ext, fileName, filePath);
|
|
3972
|
-
const language = EXT_TO_LANGUAGE[ext];
|
|
4138
|
+
const language = EXT_TO_LANGUAGE[ext] ?? BASENAME_TO_LANGUAGE[basename(filePath)];
|
|
3973
4139
|
const normalizedRelPath = relativePath.replaceAll("\\", "/");
|
|
3974
4140
|
const depth = normalizedRelPath.split("/").length - 1;
|
|
3975
4141
|
const isEntryPoint = ENTRY_POINT_NAMES.has(basename(filePath));
|
|
@@ -4058,7 +4224,8 @@ ${rawContent}` : rawContent;
|
|
|
4058
4224
|
candidates = await this.scanDirectory(storePath);
|
|
4059
4225
|
logger2.debug({ storePath, count: candidates.length }, "Using filesystem walk for discovery");
|
|
4060
4226
|
}
|
|
4061
|
-
|
|
4227
|
+
const { files } = await this.filterFiles(candidates, storePath, ingestConfig);
|
|
4228
|
+
return files;
|
|
4062
4229
|
}
|
|
4063
4230
|
/**
|
|
4064
4231
|
* Apply extension filter, ignore patterns, size limit, and per-store ingest filters.
|
|
@@ -4070,23 +4237,35 @@ ${rawContent}` : rawContent;
|
|
|
4070
4237
|
const skippedMinified = [];
|
|
4071
4238
|
const skippedBinary = [];
|
|
4072
4239
|
const skippedExcluded = [];
|
|
4240
|
+
let skippedExtension = 0;
|
|
4241
|
+
let skippedDirSegment = 0;
|
|
4242
|
+
let skippedIgnorePattern = 0;
|
|
4243
|
+
let skippedStatError = 0;
|
|
4073
4244
|
const maxFileSize = ingestConfig?.maxFileSizeBytes ?? this.maxFileSizeBytes;
|
|
4074
|
-
const
|
|
4245
|
+
const skipMinifiedOpt = ingestConfig?.skipMinified ?? true;
|
|
4075
4246
|
const skipBinaries = ingestConfig?.skipBinaries ?? true;
|
|
4076
4247
|
const excludeGlobs = ingestConfig?.excludeGlobs ?? [];
|
|
4248
|
+
const maxFiles = ingestConfig?.maxFiles;
|
|
4077
4249
|
for (const filePath of files) {
|
|
4078
4250
|
const ext = extname(filePath).toLowerCase();
|
|
4079
4251
|
const filename = basename(filePath);
|
|
4080
|
-
if (!TEXT_EXTENSIONS.has(ext))
|
|
4252
|
+
if (!TEXT_EXTENSIONS.has(ext) && !TEXT_BASENAMES.has(filename)) {
|
|
4253
|
+
skippedExtension++;
|
|
4254
|
+
continue;
|
|
4255
|
+
}
|
|
4081
4256
|
const relativePath = relative(storePath, filePath).replaceAll("\\", "/");
|
|
4082
4257
|
const pathSegments = relativePath.split("/");
|
|
4083
4258
|
const dirSegments = pathSegments.slice(0, -1);
|
|
4084
4259
|
if (dirSegments.some((segment) => this.ignoreDirs.has(segment))) {
|
|
4260
|
+
skippedDirSegment++;
|
|
4085
4261
|
continue;
|
|
4086
4262
|
}
|
|
4087
4263
|
const shouldIgnore = this.ignoreFilePatterns.some((matcher) => matcher(filename));
|
|
4088
|
-
if (shouldIgnore)
|
|
4089
|
-
|
|
4264
|
+
if (shouldIgnore) {
|
|
4265
|
+
skippedIgnorePattern++;
|
|
4266
|
+
continue;
|
|
4267
|
+
}
|
|
4268
|
+
if (skipMinifiedOpt && isMinifiedFile(filename)) {
|
|
4090
4269
|
skippedMinified.push(filename);
|
|
4091
4270
|
continue;
|
|
4092
4271
|
}
|
|
@@ -4100,13 +4279,14 @@ ${rawContent}` : rawContent;
|
|
|
4100
4279
|
}
|
|
4101
4280
|
let fileSize;
|
|
4102
4281
|
try {
|
|
4103
|
-
const
|
|
4104
|
-
fileSize =
|
|
4282
|
+
const stats2 = await stat2(filePath);
|
|
4283
|
+
fileSize = stats2.size;
|
|
4105
4284
|
if (fileSize > maxFileSize) {
|
|
4106
4285
|
skippedLarge.push({ path: filePath, size: fileSize });
|
|
4107
4286
|
continue;
|
|
4108
4287
|
}
|
|
4109
4288
|
} catch {
|
|
4289
|
+
skippedStatError++;
|
|
4110
4290
|
continue;
|
|
4111
4291
|
}
|
|
4112
4292
|
if (skipBinaries) {
|
|
@@ -4116,17 +4296,23 @@ ${rawContent}` : rawContent;
|
|
|
4116
4296
|
continue;
|
|
4117
4297
|
}
|
|
4118
4298
|
} catch {
|
|
4299
|
+
skippedStatError++;
|
|
4119
4300
|
continue;
|
|
4120
4301
|
}
|
|
4121
4302
|
}
|
|
4122
4303
|
result.push(filePath);
|
|
4123
4304
|
}
|
|
4305
|
+
if (maxFiles !== void 0 && result.length > maxFiles) {
|
|
4306
|
+
logger2.info({ storePath, total: result.length, maxFiles }, "Applying maxFiles cap");
|
|
4307
|
+
result.length = maxFiles;
|
|
4308
|
+
}
|
|
4309
|
+
const skippedLargeBytes = skippedLarge.reduce((sum, f) => sum + f.size, 0);
|
|
4124
4310
|
if (skippedLarge.length > 0) {
|
|
4125
4311
|
logger2.info(
|
|
4126
4312
|
{
|
|
4127
4313
|
storePath,
|
|
4128
4314
|
count: skippedLarge.length,
|
|
4129
|
-
totalBytes:
|
|
4315
|
+
totalBytes: skippedLargeBytes,
|
|
4130
4316
|
examples: skippedLarge.slice(0, 5).map((f) => relative(storePath, f.path))
|
|
4131
4317
|
},
|
|
4132
4318
|
"Skipped large files"
|
|
@@ -4150,7 +4336,21 @@ ${rawContent}` : rawContent;
|
|
|
4150
4336
|
"Skipped excluded files"
|
|
4151
4337
|
);
|
|
4152
4338
|
}
|
|
4153
|
-
|
|
4339
|
+
const stats = {
|
|
4340
|
+
candidates: files.length,
|
|
4341
|
+
accepted: result.length,
|
|
4342
|
+
skippedExtension,
|
|
4343
|
+
skippedDirSegment,
|
|
4344
|
+
skippedIgnorePattern,
|
|
4345
|
+
skippedMinified: skippedMinified.length,
|
|
4346
|
+
skippedExcluded: skippedExcluded.length,
|
|
4347
|
+
skippedLargeFiles: skippedLarge.length,
|
|
4348
|
+
skippedLargeBytes,
|
|
4349
|
+
skippedBinary: skippedBinary.length,
|
|
4350
|
+
skippedStatError
|
|
4351
|
+
};
|
|
4352
|
+
logger2.info({ storePath, ...stats }, "File filtering complete");
|
|
4353
|
+
return { files: result, stats };
|
|
4154
4354
|
}
|
|
4155
4355
|
async scanDirectory(dir) {
|
|
4156
4356
|
const files = [];
|
|
@@ -4167,7 +4367,7 @@ ${rawContent}` : rawContent;
|
|
|
4167
4367
|
continue;
|
|
4168
4368
|
}
|
|
4169
4369
|
const ext = extname(entry.name).toLowerCase();
|
|
4170
|
-
if (TEXT_EXTENSIONS.has(ext)) {
|
|
4370
|
+
if (TEXT_EXTENSIONS.has(ext) || TEXT_BASENAMES.has(entry.name)) {
|
|
4171
4371
|
files.push(fullPath);
|
|
4172
4372
|
}
|
|
4173
4373
|
}
|
|
@@ -4761,7 +4961,8 @@ function parseSearchEnvOverrides(strict) {
|
|
|
4761
4961
|
return {
|
|
4762
4962
|
rrfK: parseRrfK(process.env["BK_RRF_K"], strict),
|
|
4763
4963
|
vectorWeight: parseVectorWeight(process.env["BK_RRF_VECTOR_WEIGHT"], strict),
|
|
4764
|
-
candidateMultiplier: parseCandidateMultiplier(process.env["BK_CANDIDATE_MULTIPLIER"], strict)
|
|
4964
|
+
candidateMultiplier: parseCandidateMultiplier(process.env["BK_CANDIDATE_MULTIPLIER"], strict),
|
|
4965
|
+
queryExpansion: parseBoolFlag(process.env["BK_QUERY_EXPANSION"], "BK_QUERY_EXPANSION", strict)
|
|
4765
4966
|
};
|
|
4766
4967
|
}
|
|
4767
4968
|
function parseRrfK(raw, strict) {
|
|
@@ -4797,6 +4998,69 @@ function parseCandidateMultiplier(raw, strict) {
|
|
|
4797
4998
|
}
|
|
4798
4999
|
return parsed;
|
|
4799
5000
|
}
|
|
5001
|
+
function parseBoolFlag(raw, name, strict) {
|
|
5002
|
+
if (raw === void 0 || raw === "") return void 0;
|
|
5003
|
+
if (raw === "1") return true;
|
|
5004
|
+
if (raw === "0") return false;
|
|
5005
|
+
const msg = `${name} must be '0' or '1', got: "${raw}"`;
|
|
5006
|
+
if (strict) throw new Error(msg);
|
|
5007
|
+
logger3.warn(msg);
|
|
5008
|
+
return void 0;
|
|
5009
|
+
}
|
|
5010
|
+
|
|
5011
|
+
// src/utils/code-tokenizer.ts
|
|
5012
|
+
function splitIdentifier(identifier) {
|
|
5013
|
+
const segments = identifier.split(/[_.]/).filter((s) => s.length > 0);
|
|
5014
|
+
const words = [];
|
|
5015
|
+
for (const segment of segments) {
|
|
5016
|
+
const parts = segment.replace(/([a-z])([A-Z])/g, "$1\0$2").replace(/([A-Z]{2,})([A-Z][a-z])/g, "$1\0$2").replace(/([a-zA-Z])(\d)/g, "$1\0$2").replace(/(\d)([a-zA-Z])/g, "$1\0$2").split("\0");
|
|
5017
|
+
for (const part of parts) {
|
|
5018
|
+
if (part.length > 0) {
|
|
5019
|
+
words.push(part);
|
|
5020
|
+
}
|
|
5021
|
+
}
|
|
5022
|
+
}
|
|
5023
|
+
return words;
|
|
5024
|
+
}
|
|
5025
|
+
var IDENTIFIER_PATTERN = /(?:[a-zA-Z_$][\w$]*(?:\.[\w$]+)*)/g;
|
|
5026
|
+
function isSplittable(identifier) {
|
|
5027
|
+
if (/[a-z][A-Z]/.test(identifier)) return true;
|
|
5028
|
+
if (identifier.includes("_")) return true;
|
|
5029
|
+
if (identifier.includes(".")) return true;
|
|
5030
|
+
if (/[A-Z]{2,}[a-z]/.test(identifier)) return true;
|
|
5031
|
+
if (/[a-zA-Z]\d|\d[a-zA-Z]/.test(identifier)) return true;
|
|
5032
|
+
return false;
|
|
5033
|
+
}
|
|
5034
|
+
function extractSplitVariants(text) {
|
|
5035
|
+
const seen = /* @__PURE__ */ new Set();
|
|
5036
|
+
const variants = [];
|
|
5037
|
+
for (const match of text.matchAll(IDENTIFIER_PATTERN)) {
|
|
5038
|
+
const identifier = match[0];
|
|
5039
|
+
if (identifier.length < 3 || seen.has(identifier)) continue;
|
|
5040
|
+
seen.add(identifier);
|
|
5041
|
+
if (!isSplittable(identifier)) continue;
|
|
5042
|
+
const parts = splitIdentifier(identifier);
|
|
5043
|
+
if (parts.length > 1) {
|
|
5044
|
+
variants.push(parts.join(" "));
|
|
5045
|
+
const lower = parts.map((p) => p.toLowerCase()).join(" ");
|
|
5046
|
+
if (lower !== parts.join(" ").toLowerCase()) {
|
|
5047
|
+
variants.push(lower);
|
|
5048
|
+
}
|
|
5049
|
+
}
|
|
5050
|
+
}
|
|
5051
|
+
return variants.join(" ");
|
|
5052
|
+
}
|
|
5053
|
+
function buildFtsContent(originalContent) {
|
|
5054
|
+
const variants = extractSplitVariants(originalContent);
|
|
5055
|
+
if (variants.length === 0) return originalContent;
|
|
5056
|
+
return `${originalContent}
|
|
5057
|
+
${variants}`;
|
|
5058
|
+
}
|
|
5059
|
+
function normalizeFtsQuery(query) {
|
|
5060
|
+
const variants = extractSplitVariants(query);
|
|
5061
|
+
if (variants.length === 0) return query;
|
|
5062
|
+
return `${query} ${variants}`;
|
|
5063
|
+
}
|
|
4800
5064
|
|
|
4801
5065
|
// src/services/search.service.ts
|
|
4802
5066
|
var logger4 = createLogger("search-service");
|
|
@@ -4868,6 +5132,18 @@ var INTENT_FILE_BOOSTS = {
|
|
|
4868
5132
|
changelog: 1.1,
|
|
4869
5133
|
// Often contains bug fixes and known issues
|
|
4870
5134
|
other: 1
|
|
5135
|
+
},
|
|
5136
|
+
testing: {
|
|
5137
|
+
"documentation-primary": 0.8,
|
|
5138
|
+
documentation: 0.85,
|
|
5139
|
+
example: 1,
|
|
5140
|
+
source: 0.9,
|
|
5141
|
+
"source-internal": 0.85,
|
|
5142
|
+
test: 1.5,
|
|
5143
|
+
// Tests are exactly what the user wants
|
|
5144
|
+
config: 0.7,
|
|
5145
|
+
changelog: 0.6,
|
|
5146
|
+
other: 0.9
|
|
4871
5147
|
}
|
|
4872
5148
|
};
|
|
4873
5149
|
var FRAMEWORK_PATTERNS = [
|
|
@@ -4923,12 +5199,23 @@ var CONCEPTUAL_PATTERNS = [
|
|
|
4923
5199
|
/\bhow does .* work\b/i,
|
|
4924
5200
|
/\bwhat('s| is) the (purpose|point|idea)\b/i
|
|
4925
5201
|
];
|
|
5202
|
+
var TESTING_PATTERNS = [
|
|
5203
|
+
/\b(test|tests|testing)\b/i,
|
|
5204
|
+
/\b(mock|mocking|stub|stubs)\b/i,
|
|
5205
|
+
/\b(assert|assertion|expect)\b/i,
|
|
5206
|
+
/\btest\s*(client|runner|helper|fixture|suite)\b/i,
|
|
5207
|
+
/\b(unit|integration|e2e|end-to-end)\s*test/i,
|
|
5208
|
+
/\b(pytest|jest|vitest|mocha|gotest)\b/i
|
|
5209
|
+
];
|
|
4926
5210
|
function classifyQueryIntents(query) {
|
|
4927
5211
|
const q = query.toLowerCase();
|
|
4928
5212
|
const intents = [];
|
|
4929
5213
|
if (IMPLEMENTATION_PATTERNS.some((p) => p.test(q))) {
|
|
4930
5214
|
intents.push({ intent: "implementation", confidence: 0.9 });
|
|
4931
5215
|
}
|
|
5216
|
+
if (TESTING_PATTERNS.some((p) => p.test(q))) {
|
|
5217
|
+
intents.push({ intent: "testing", confidence: 0.85 });
|
|
5218
|
+
}
|
|
4932
5219
|
if (DEBUGGING_PATTERNS.some((p) => p.test(q))) {
|
|
4933
5220
|
intents.push({ intent: "debugging", confidence: 0.85 });
|
|
4934
5221
|
}
|
|
@@ -4954,41 +5241,16 @@ function mapSearchIntentToQueryIntent(intent) {
|
|
|
4954
5241
|
case "find-pattern":
|
|
4955
5242
|
case "find-implementation":
|
|
4956
5243
|
case "find-definition":
|
|
4957
|
-
case "find-files":
|
|
4958
5244
|
return "implementation";
|
|
4959
5245
|
case "find-usage":
|
|
4960
5246
|
case "find-documentation":
|
|
4961
5247
|
return "how-to";
|
|
5248
|
+
case "find-files":
|
|
5249
|
+
return "implementation";
|
|
4962
5250
|
}
|
|
4963
5251
|
}
|
|
4964
|
-
var INTENT_EXPANSION_TERMS = {
|
|
4965
|
-
"find-implementation": "source code implementation function class",
|
|
4966
|
-
"find-documentation": "documentation guide tutorial example",
|
|
4967
|
-
"find-usage": "usage example how to use",
|
|
4968
|
-
"find-pattern": "pattern matching code structure",
|
|
4969
|
-
"find-definition": "definition type interface declaration",
|
|
4970
|
-
"find-files": "file module path"
|
|
4971
|
-
};
|
|
4972
|
-
function expandQueryWithIntent(query, intent) {
|
|
4973
|
-
if (intent === void 0) return query;
|
|
4974
|
-
const expansion = INTENT_EXPANSION_TERMS[intent];
|
|
4975
|
-
return `${query} ${expansion}`;
|
|
4976
|
-
}
|
|
4977
|
-
function isStrongFtsSignal(query, ftsResults) {
|
|
4978
|
-
if (ftsResults.length < 2) return false;
|
|
4979
|
-
const top = ftsResults[0];
|
|
4980
|
-
const second = ftsResults[1];
|
|
4981
|
-
if (top === void 0 || second === void 0) return false;
|
|
4982
|
-
if (second.score > 0 && top.score / second.score <= 2) return false;
|
|
4983
|
-
const queryLower = query.toLowerCase();
|
|
4984
|
-
const rawFile = top.metadata["file"] ?? top.metadata["path"];
|
|
4985
|
-
const rawName = top.metadata["name"];
|
|
4986
|
-
const filePath = typeof rawFile === "string" ? rawFile : "";
|
|
4987
|
-
const name = typeof rawName === "string" ? rawName : "";
|
|
4988
|
-
return filePath.toLowerCase().includes(queryLower) || name.toLowerCase().includes(queryLower);
|
|
4989
|
-
}
|
|
4990
5252
|
var RRF_PRESETS = {
|
|
4991
|
-
code: { k: 25, vectorWeight: 0.
|
|
5253
|
+
code: { k: 25, vectorWeight: 0.35, ftsWeight: 0.65 },
|
|
4992
5254
|
web: { k: 30, vectorWeight: 0.7, ftsWeight: 0.3 }
|
|
4993
5255
|
};
|
|
4994
5256
|
var DEFAULT_CANDIDATE_MULTIPLIER = 2;
|
|
@@ -5075,8 +5337,7 @@ var SearchService = class {
|
|
|
5075
5337
|
let rerankTimeMs;
|
|
5076
5338
|
const fetchLimit = limit * 3;
|
|
5077
5339
|
if (mode === "vector") {
|
|
5078
|
-
const
|
|
5079
|
-
const rawResults = await this.vectorSearchRaw(expandedQuery, stores, fetchLimit);
|
|
5340
|
+
const rawResults = await this.vectorSearchRaw(query.query, stores, fetchLimit);
|
|
5080
5341
|
maxRawScore = rawResults.length > 0 ? rawResults[0]?.score ?? 0 : 0;
|
|
5081
5342
|
allResults = this.normalizeAndFilterScores(rawResults, query.threshold).slice(0, fetchLimit);
|
|
5082
5343
|
} else if (mode === "fts") {
|
|
@@ -5086,8 +5347,7 @@ var SearchService = class {
|
|
|
5086
5347
|
query.query,
|
|
5087
5348
|
stores,
|
|
5088
5349
|
fetchLimit,
|
|
5089
|
-
query.threshold
|
|
5090
|
-
query.intent
|
|
5350
|
+
query.threshold
|
|
5091
5351
|
);
|
|
5092
5352
|
allResults = hybridResult.results;
|
|
5093
5353
|
maxRawScore = hybridResult.maxRawScore;
|
|
@@ -5226,6 +5486,41 @@ var SearchService = class {
|
|
|
5226
5486
|
}
|
|
5227
5487
|
return normalized;
|
|
5228
5488
|
}
|
|
5489
|
+
/**
|
|
5490
|
+
* Generate query variants for multi-query expansion.
|
|
5491
|
+
* Strips intent prefixes to create a keyword-focused variant.
|
|
5492
|
+
* Returns original + variants (deduplicated).
|
|
5493
|
+
*/
|
|
5494
|
+
expandQuery(query) {
|
|
5495
|
+
const queries = [query];
|
|
5496
|
+
const stripped = query.replace(
|
|
5497
|
+
/^(how to |how do I |how does |implement |usage of |find the |what is |show me |where is |where are )/i,
|
|
5498
|
+
""
|
|
5499
|
+
).trim();
|
|
5500
|
+
if (stripped !== query && stripped.length >= 5) {
|
|
5501
|
+
queries.push(stripped);
|
|
5502
|
+
}
|
|
5503
|
+
return queries;
|
|
5504
|
+
}
|
|
5505
|
+
/**
|
|
5506
|
+
* Run vector search across multiple query variants and merge results.
|
|
5507
|
+
* Deduplicates by document ID, keeping the highest score.
|
|
5508
|
+
*/
|
|
5509
|
+
async multiQueryVectorSearch(queries, stores, limit) {
|
|
5510
|
+
const allResults = await Promise.all(
|
|
5511
|
+
queries.map((q) => this.vectorSearchRaw(q, stores, limit))
|
|
5512
|
+
);
|
|
5513
|
+
const merged = /* @__PURE__ */ new Map();
|
|
5514
|
+
for (const results of allResults) {
|
|
5515
|
+
for (const r of results) {
|
|
5516
|
+
const existing = merged.get(r.id);
|
|
5517
|
+
if (existing === void 0 || r.score > existing.score) {
|
|
5518
|
+
merged.set(r.id, r);
|
|
5519
|
+
}
|
|
5520
|
+
}
|
|
5521
|
+
}
|
|
5522
|
+
return [...merged.values()].sort((a, b) => b.score - a.score).slice(0, limit);
|
|
5523
|
+
}
|
|
5229
5524
|
/**
|
|
5230
5525
|
* Fetch raw vector search results without normalization.
|
|
5231
5526
|
* Returns results with raw cosine similarity scores [0-1].
|
|
@@ -5249,9 +5544,10 @@ var SearchService = class {
|
|
|
5249
5544
|
}
|
|
5250
5545
|
async ftsSearch(query, stores, limit) {
|
|
5251
5546
|
const results = [];
|
|
5547
|
+
const normalizedQuery = normalizeFtsQuery(query);
|
|
5252
5548
|
for (const storeId of stores) {
|
|
5253
5549
|
try {
|
|
5254
|
-
const hits = await this.lanceStore.fullTextSearch(storeId,
|
|
5550
|
+
const hits = await this.lanceStore.fullTextSearch(storeId, normalizedQuery, limit);
|
|
5255
5551
|
results.push(
|
|
5256
5552
|
...hits.map((r) => ({
|
|
5257
5553
|
id: r.id,
|
|
@@ -5268,37 +5564,20 @@ var SearchService = class {
|
|
|
5268
5564
|
/**
|
|
5269
5565
|
* Internal hybrid search result with additional metadata for confidence calculation.
|
|
5270
5566
|
*/
|
|
5271
|
-
async hybridSearchWithMetadata(query, stores, limit, threshold
|
|
5567
|
+
async hybridSearchWithMetadata(query, stores, limit, threshold) {
|
|
5272
5568
|
const intents = classifyQueryIntents(query);
|
|
5273
5569
|
const envOverrides = parseSearchEnvOverrides(false);
|
|
5274
5570
|
const candidateMultiplier = envOverrides.candidateMultiplier ?? DEFAULT_CANDIDATE_MULTIPLIER;
|
|
5275
|
-
const
|
|
5276
|
-
|
|
5277
|
-
|
|
5278
|
-
{ query, topScore: ftsResults[0]?.score },
|
|
5279
|
-
"Strong FTS signal \u2014 skipping vector search"
|
|
5280
|
-
);
|
|
5281
|
-
const sorted2 = ftsResults.slice(0, limit).map((r, i) => ({
|
|
5282
|
-
...r,
|
|
5283
|
-
score: Math.round((1 - i / Math.max(ftsResults.length, 1)) * 1e6) / 1e6
|
|
5284
|
-
}));
|
|
5285
|
-
if (threshold !== void 0) {
|
|
5286
|
-
return { results: sorted2.filter((r) => r.score >= threshold), maxRawScore: 0 };
|
|
5287
|
-
}
|
|
5288
|
-
return { results: sorted2, maxRawScore: 0 };
|
|
5289
|
-
}
|
|
5290
|
-
const expandedQuery = expandQueryWithIntent(query, searchIntent);
|
|
5291
|
-
const rawVectorResults = await this.vectorSearchRaw(
|
|
5292
|
-
expandedQuery,
|
|
5293
|
-
stores,
|
|
5294
|
-
limit * candidateMultiplier
|
|
5295
|
-
);
|
|
5571
|
+
const fetchLimit = limit * candidateMultiplier;
|
|
5572
|
+
const useExpansion = envOverrides.queryExpansion === true;
|
|
5573
|
+
const rawVectorResults = useExpansion ? await this.multiQueryVectorSearch(this.expandQuery(query), stores, fetchLimit) : await this.vectorSearchRaw(query, stores, fetchLimit);
|
|
5296
5574
|
const rawVectorScores = /* @__PURE__ */ new Map();
|
|
5297
5575
|
rawVectorResults.forEach((r) => {
|
|
5298
5576
|
rawVectorScores.set(r.id, r.score);
|
|
5299
5577
|
});
|
|
5300
5578
|
const maxRawScore = rawVectorResults.length > 0 ? rawVectorResults[0]?.score ?? 0 : 0;
|
|
5301
5579
|
const vectorResults = this.normalizeAndFilterScores(rawVectorResults);
|
|
5580
|
+
const ftsResults = await this.ftsSearch(query, stores, limit * candidateMultiplier);
|
|
5302
5581
|
const vectorRanks = /* @__PURE__ */ new Map();
|
|
5303
5582
|
const ftsRanks = /* @__PURE__ */ new Map();
|
|
5304
5583
|
const allDocs = /* @__PURE__ */ new Map();
|
|
@@ -5334,6 +5613,8 @@ var SearchService = class {
|
|
|
5334
5613
|
const pathKeywordBoost = this.getPathKeywordBoost(query, result2);
|
|
5335
5614
|
const depthBoost = this.getDepthBoost(result2, getPrimaryIntent(intents));
|
|
5336
5615
|
const entryPointBoost = this.getEntryPointBoost(result2, getPrimaryIntent(intents));
|
|
5616
|
+
const sectionHeaderBoost = 1;
|
|
5617
|
+
const symbolNameBoost = 1;
|
|
5337
5618
|
const metadata = {
|
|
5338
5619
|
vectorRRF,
|
|
5339
5620
|
ftsRRF,
|
|
@@ -5342,7 +5623,9 @@ var SearchService = class {
|
|
|
5342
5623
|
urlKeywordBoost,
|
|
5343
5624
|
pathKeywordBoost,
|
|
5344
5625
|
depthBoost,
|
|
5345
|
-
entryPointBoost
|
|
5626
|
+
entryPointBoost,
|
|
5627
|
+
sectionHeaderBoost,
|
|
5628
|
+
symbolNameBoost
|
|
5346
5629
|
};
|
|
5347
5630
|
if (vectorRank !== Infinity) {
|
|
5348
5631
|
metadata.vectorRank = vectorRank;
|
|
@@ -5355,7 +5638,7 @@ var SearchService = class {
|
|
|
5355
5638
|
}
|
|
5356
5639
|
rrfScores.push({
|
|
5357
5640
|
id,
|
|
5358
|
-
score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost * urlKeywordBoost * pathKeywordBoost * depthBoost * entryPointBoost,
|
|
5641
|
+
score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost * urlKeywordBoost * pathKeywordBoost * depthBoost * entryPointBoost * sectionHeaderBoost * symbolNameBoost,
|
|
5359
5642
|
result: result2,
|
|
5360
5643
|
rawVectorScore,
|
|
5361
5644
|
metadata
|
|
@@ -5376,20 +5659,10 @@ var SearchService = class {
|
|
|
5376
5659
|
reranked.results.forEach((r) => {
|
|
5377
5660
|
rerankedScores.set(r.id, r.rerankerScore);
|
|
5378
5661
|
});
|
|
5379
|
-
|
|
5380
|
-
|
|
5381
|
-
|
|
5382
|
-
|
|
5383
|
-
return { ...r, blendedScore: -Infinity };
|
|
5384
|
-
}
|
|
5385
|
-
const normalizedRrf = maxRrfScore > 0 ? r.score / maxRrfScore : 0;
|
|
5386
|
-
const rrfWeight = rrfRank < 3 ? 0.7 : rrfRank < 10 ? 0.5 : 0.3;
|
|
5387
|
-
const rerankerWeight = 1 - rrfWeight;
|
|
5388
|
-
return {
|
|
5389
|
-
...r,
|
|
5390
|
-
blendedScore: normalizedRrf * rrfWeight + rerankerScore * rerankerWeight
|
|
5391
|
-
};
|
|
5392
|
-
}).sort((a, b) => b.blendedScore - a.blendedScore).slice(0, limit);
|
|
5662
|
+
sorted = sortedAll.map((r) => ({
|
|
5663
|
+
...r,
|
|
5664
|
+
rerankerScore: rerankedScores.get(r.id)
|
|
5665
|
+
})).sort((a, b) => (b.rerankerScore ?? -Infinity) - (a.rerankerScore ?? -Infinity)).slice(0, limit);
|
|
5393
5666
|
} else {
|
|
5394
5667
|
sorted = sortedAll.slice(0, limit);
|
|
5395
5668
|
}
|
|
@@ -5489,7 +5762,9 @@ var SearchService = class {
|
|
|
5489
5762
|
const blendedMultiplier = totalConfidence > 0 ? weightedMultiplier / totalConfidence : 1;
|
|
5490
5763
|
const finalBoost = baseBoost * blendedMultiplier;
|
|
5491
5764
|
if (fileType === "test") {
|
|
5492
|
-
|
|
5765
|
+
const primaryIntent = intents[0]?.intent;
|
|
5766
|
+
const cap = primaryIntent === "testing" ? 1.5 : 0.6;
|
|
5767
|
+
return Math.min(finalBoost, cap);
|
|
5493
5768
|
}
|
|
5494
5769
|
return finalBoost;
|
|
5495
5770
|
}
|
|
@@ -5646,12 +5921,6 @@ var SearchService = class {
|
|
|
5646
5921
|
location: `${path4}${codeUnit ? `:${String(codeUnit.startLine)}` : ""}`,
|
|
5647
5922
|
relevanceReason: this.generateRelevanceReason(result, query)
|
|
5648
5923
|
};
|
|
5649
|
-
if (graph) {
|
|
5650
|
-
const relatedFiles = this.getRelatedFilePaths(graph, path4, symbolName);
|
|
5651
|
-
if (relatedFiles.length > 0) {
|
|
5652
|
-
enhanced.summary = { ...enhanced.summary, relatedFiles };
|
|
5653
|
-
}
|
|
5654
|
-
}
|
|
5655
5924
|
if (detail === "contextual" || detail === "full") {
|
|
5656
5925
|
const usage = this.getUsageFromGraph(graph, path4, symbolName);
|
|
5657
5926
|
enhanced.context = {
|
|
@@ -5867,23 +6136,9 @@ var SearchService = class {
|
|
|
5867
6136
|
};
|
|
5868
6137
|
}
|
|
5869
6138
|
/**
|
|
5870
|
-
* Get related
|
|
5871
|
-
* Returns
|
|
6139
|
+
* Get related code from graph.
|
|
6140
|
+
* Returns callers and callees for the symbol.
|
|
5872
6141
|
*/
|
|
5873
|
-
getRelatedFilePaths(graph, filePath, symbolName) {
|
|
5874
|
-
if (symbolName === "" || symbolName === "(anonymous)") return [];
|
|
5875
|
-
const nodeId = `${filePath}:${symbolName}`;
|
|
5876
|
-
const files = /* @__PURE__ */ new Set();
|
|
5877
|
-
for (const edge of graph.getIncomingEdges(nodeId)) {
|
|
5878
|
-
const [file] = this.parseNodeId(edge.from);
|
|
5879
|
-
if (file && file !== filePath) files.add(file);
|
|
5880
|
-
}
|
|
5881
|
-
for (const edge of graph.getEdges(nodeId)) {
|
|
5882
|
-
const [file] = this.parseNodeId(edge.to);
|
|
5883
|
-
if (file && file !== filePath) files.add(file);
|
|
5884
|
-
}
|
|
5885
|
-
return Array.from(files).slice(0, 5);
|
|
5886
|
-
}
|
|
5887
6142
|
getRelatedCodeFromGraph(graph, filePath, symbolName) {
|
|
5888
6143
|
if (!graph || symbolName === "" || symbolName === "(anonymous)") {
|
|
5889
6144
|
return [];
|
|
@@ -5947,7 +6202,9 @@ var IngestConfigSchema = z3.object({
|
|
|
5947
6202
|
/** Skip binary files detected by content heuristic - default true */
|
|
5948
6203
|
skipBinaries: z3.boolean().optional(),
|
|
5949
6204
|
/** Override max file size for this store (bytes) */
|
|
5950
|
-
maxFileSizeBytes: z3.number().int().positive().optional()
|
|
6205
|
+
maxFileSizeBytes: z3.number().int().positive().optional(),
|
|
6206
|
+
/** Maximum number of files to index (cap applied after all other filters) */
|
|
6207
|
+
maxFiles: z3.number().int().positive().optional()
|
|
5951
6208
|
});
|
|
5952
6209
|
var FileStoreDefinitionSchema = BaseStoreDefinitionSchema.extend({
|
|
5953
6210
|
type: z3.literal("file"),
|
|
@@ -6190,7 +6447,7 @@ async function cloneRepository(options) {
|
|
|
6190
6447
|
if (partialClone) {
|
|
6191
6448
|
args.push("--filter=blob:none");
|
|
6192
6449
|
}
|
|
6193
|
-
args.push("--depth", String(depth), "--single-branch");
|
|
6450
|
+
args.push("--depth", String(depth), "--single-branch", "--no-tags");
|
|
6194
6451
|
if (branch !== void 0) {
|
|
6195
6452
|
args.push("--branch", branch);
|
|
6196
6453
|
}
|
|
@@ -6202,12 +6459,17 @@ async function cloneRepository(options) {
|
|
|
6202
6459
|
branch,
|
|
6203
6460
|
depth,
|
|
6204
6461
|
singleBranch: true,
|
|
6462
|
+
noTags: true,
|
|
6463
|
+
lfsSkipSmudge: true,
|
|
6205
6464
|
partialClone
|
|
6206
6465
|
},
|
|
6207
6466
|
"Starting git clone"
|
|
6208
6467
|
);
|
|
6209
6468
|
return new Promise((resolve4) => {
|
|
6210
|
-
const git = spawn("git", args, {
|
|
6469
|
+
const git = spawn("git", args, {
|
|
6470
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
6471
|
+
env: { ...process.env, GIT_LFS_SKIP_SMUDGE: "1" }
|
|
6472
|
+
});
|
|
6211
6473
|
let timedOut = false;
|
|
6212
6474
|
let forceKillTimeout = null;
|
|
6213
6475
|
const timeout = setTimeout(() => {
|
|
@@ -7212,6 +7474,8 @@ var LanceStore = class {
|
|
|
7212
7474
|
const schema = LanceSchema({
|
|
7213
7475
|
id: new Utf8(),
|
|
7214
7476
|
content: this.embeddingFunction.sourceField(),
|
|
7477
|
+
fts_content: new Utf8(),
|
|
7478
|
+
// content + identifier-split variants for FTS
|
|
7215
7479
|
vector: this.embeddingFunction.vectorField(),
|
|
7216
7480
|
metadata: new Utf8()
|
|
7217
7481
|
});
|
|
@@ -7222,6 +7486,7 @@ var LanceStore = class {
|
|
|
7222
7486
|
{
|
|
7223
7487
|
id: "__init__",
|
|
7224
7488
|
content: "",
|
|
7489
|
+
fts_content: "",
|
|
7225
7490
|
vector: new Array(this._dimensions).fill(0),
|
|
7226
7491
|
metadata: "{}"
|
|
7227
7492
|
}
|
|
@@ -7236,13 +7501,25 @@ var LanceStore = class {
|
|
|
7236
7501
|
}
|
|
7237
7502
|
async addDocuments(storeId, documents) {
|
|
7238
7503
|
const table = await this.getTable(storeId);
|
|
7239
|
-
const
|
|
7240
|
-
|
|
7241
|
-
|
|
7242
|
-
|
|
7243
|
-
|
|
7244
|
-
|
|
7245
|
-
|
|
7504
|
+
const hasFts = await this.hasFtsContentColumn(storeId);
|
|
7505
|
+
if (hasFts) {
|
|
7506
|
+
const lanceDocuments = documents.map((doc) => ({
|
|
7507
|
+
id: doc.id,
|
|
7508
|
+
content: doc.content,
|
|
7509
|
+
fts_content: buildFtsContent(doc.content),
|
|
7510
|
+
vector: [...doc.vector],
|
|
7511
|
+
metadata: JSON.stringify(doc.metadata)
|
|
7512
|
+
}));
|
|
7513
|
+
await table.add(lanceDocuments);
|
|
7514
|
+
} else {
|
|
7515
|
+
const lanceDocuments = documents.map((doc) => ({
|
|
7516
|
+
id: doc.id,
|
|
7517
|
+
content: doc.content,
|
|
7518
|
+
vector: [...doc.vector],
|
|
7519
|
+
metadata: JSON.stringify(doc.metadata)
|
|
7520
|
+
}));
|
|
7521
|
+
await table.add(lanceDocuments);
|
|
7522
|
+
}
|
|
7246
7523
|
}
|
|
7247
7524
|
async deleteDocuments(storeId, documentIds) {
|
|
7248
7525
|
if (documentIds.length === 0) {
|
|
@@ -7297,10 +7574,29 @@ var LanceStore = class {
|
|
|
7297
7574
|
}
|
|
7298
7575
|
async createFtsIndex(storeId) {
|
|
7299
7576
|
const table = await this.getTable(storeId);
|
|
7300
|
-
await
|
|
7301
|
-
|
|
7577
|
+
const ftsColumn = await this.hasFtsContentColumn(storeId) ? "fts_content" : "content";
|
|
7578
|
+
await table.createIndex(ftsColumn, {
|
|
7579
|
+
config: lancedb.Index.fts({
|
|
7580
|
+
stem: true,
|
|
7581
|
+
removeStopWords: false,
|
|
7582
|
+
lowercase: true,
|
|
7583
|
+
language: "English"
|
|
7584
|
+
})
|
|
7302
7585
|
});
|
|
7303
7586
|
}
|
|
7587
|
+
/**
|
|
7588
|
+
* Check if a table has the fts_content column (v3 schema).
|
|
7589
|
+
* Tables created before the FTS improvement only have content.
|
|
7590
|
+
*/
|
|
7591
|
+
async hasFtsContentColumn(storeId) {
|
|
7592
|
+
try {
|
|
7593
|
+
const table = await this.getTable(storeId);
|
|
7594
|
+
const schema = await table.schema();
|
|
7595
|
+
return schema.fields.some((f) => f.name === "fts_content");
|
|
7596
|
+
} catch {
|
|
7597
|
+
return false;
|
|
7598
|
+
}
|
|
7599
|
+
}
|
|
7304
7600
|
async fullTextSearch(storeId, query, limit) {
|
|
7305
7601
|
const table = await this.getTable(storeId);
|
|
7306
7602
|
const results = await table.search(query, "fts").limit(limit).toArray();
|
|
@@ -7637,7 +7933,6 @@ export {
|
|
|
7637
7933
|
ASTParser,
|
|
7638
7934
|
ok,
|
|
7639
7935
|
err,
|
|
7640
|
-
TEXT_EXTENSIONS,
|
|
7641
7936
|
classifyWebContentType,
|
|
7642
7937
|
isFileStoreDefinition,
|
|
7643
7938
|
isRepoStoreDefinition,
|
|
@@ -7650,4 +7945,4 @@ export {
|
|
|
7650
7945
|
createServices,
|
|
7651
7946
|
destroyServices
|
|
7652
7947
|
};
|
|
7653
|
-
//# sourceMappingURL=chunk-
|
|
7948
|
+
//# sourceMappingURL=chunk-724FNI27.js.map
|