bluera-knowledge 0.35.0 → 0.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/CHANGELOG.md +32 -0
  3. package/README.md +8 -20
  4. package/bun.lock +27 -0
  5. package/dist/{chunk-L2SC6J4K.js → chunk-724FNI27.js} +466 -171
  6. package/dist/chunk-724FNI27.js.map +1 -0
  7. package/dist/{chunk-DNGE7FZ4.js → chunk-AO45YFHO.js} +1386 -42
  8. package/dist/chunk-AO45YFHO.js.map +1 -0
  9. package/dist/{chunk-MQQ46BST.js → chunk-F6DGSS2N.js} +2 -2
  10. package/dist/index.js +72 -5
  11. package/dist/index.js.map +1 -1
  12. package/dist/mcp/server.d.ts +37 -3
  13. package/dist/mcp/server.js +2 -2
  14. package/dist/workers/background-worker-cli.js +2 -2
  15. package/hooks/check-ready.sh +17 -7
  16. package/hooks/hooks.json +17 -1
  17. package/hooks/lib/store_summary.py +111 -0
  18. package/hooks/posttooluse-bk-reminder.py +33 -6
  19. package/hooks/stop-bk-check.py +86 -0
  20. package/hooks/userpromptsubmit-bk-nudge.py +156 -0
  21. package/package.json +3 -1
  22. package/scripts/auto-setup.sh +11 -3
  23. package/scripts/eval-candidates.sh +235 -0
  24. package/skills/advanced-workflows/references/combining-workflows.md +17 -0
  25. package/skills/advanced-workflows/references/error-recovery.md +44 -0
  26. package/skills/advanced-workflows/references/handling-large-results.md +48 -0
  27. package/skills/advanced-workflows/references/multi-store-search.md +42 -0
  28. package/skills/knowledge-search/SKILL.md +1 -1
  29. package/skills/search/statusline.md +75 -0
  30. package/skills/store-lifecycle/references/failure-recovery.md +80 -0
  31. package/skills/store-lifecycle/references/indexing-strategies.md +67 -0
  32. package/skills/store-lifecycle/references/job-monitoring.md +72 -0
  33. package/skills/store-lifecycle/references/lifecycle-checklist.md +20 -0
  34. package/skills/store-lifecycle/references/storage-management.md +43 -0
  35. package/skills/suggest/SKILL.md +13 -6
  36. package/dist/chunk-DNGE7FZ4.js.map +0 -1
  37. package/dist/chunk-L2SC6J4K.js.map +0 -1
  38. package/dist/{chunk-MQQ46BST.js.map → chunk-F6DGSS2N.js.map} +0 -0
@@ -2072,40 +2072,6 @@ import { readFile as readFile2, access } from "fs/promises";
  import { homedir as homedir2 } from "os";
  import { isAbsolute, join as join6, resolve } from "path";

- // src/services/reranker-env.ts
- var logger = createLogger("reranker-env");
- function parseRerankerEnvOverrides(strict) {
- return {
- enabled: parseEnabled(process.env["BK_RERANKER_ENABLED"], strict),
- topK: parseTopK(process.env["BK_RERANKER_TOPK"], strict)
- };
- }
- function parseEnabled(raw, strict) {
- if (raw === void 0 || raw === "") return void 0;
- if (raw === "1") return true;
- if (raw === "0") return false;
- const msg = `BK_RERANKER_ENABLED must be '0' or '1', got: "${raw}"`;
- if (strict) throw new Error(msg);
- logger.warn(msg);
- return void 0;
- }
- function parseTopK(raw, strict) {
- if (raw === void 0 || raw === "") return void 0;
- const parsed = Number.parseInt(raw, 10);
- if (Number.isNaN(parsed) || parsed < 1) {
- const msg = `BK_RERANKER_TOPK must be a positive integer, got: "${raw}"`;
- if (strict) throw new Error(msg);
- logger.warn(msg);
- return void 0;
- }
- return parsed;
- }
-
- // src/db/embeddings.ts
- import { homedir } from "os";
- import { join as join5 } from "path";
- import { pipeline, env } from "@huggingface/transformers";
-
  // src/models/registry.ts
  var MODEL_REGISTRY = {
  // ============================================================
@@ -2362,7 +2328,7 @@ var MODEL_REGISTRY = {
  normalize: true,
  queryPrefix: "",
  docPrefix: "",
- category: "other",
+ category: "jina",
  sizeCategory: "small",
  notes: "8192 token context. Good for long documents."
  },
@@ -2374,12 +2340,102 @@ var MODEL_REGISTRY = {
  normalize: true,
  queryPrefix: "",
  docPrefix: "",
- category: "other",
+ category: "jina",
  sizeCategory: "base",
  notes: "8192 token context. Larger Jina variant."
+ },
+ "jina-embeddings-v2-base-code": {
+ id: "jinaai/jina-embeddings-v2-base-code",
+ name: "Jina Embeddings v2 Base Code",
+ dimensions: 768,
+ pooling: "mean",
+ normalize: true,
+ queryPrefix: "",
+ docPrefix: "",
+ category: "jina",
+ sizeCategory: "base",
+ notes: "161M params. Code-specific (150M+ code QA pairs, 30+ langs). 8K context. May need trust_remote_code."
+ },
+ // ============================================================
+ // Snowflake Arctic Embed - Retrieval-optimized
+ // ============================================================
+ "snowflake-arctic-embed-xs": {
+ id: "Snowflake/snowflake-arctic-embed-xs",
+ name: "Snowflake Arctic Embed XS",
+ dimensions: 384,
+ pooling: "cls",
+ normalize: true,
+ queryPrefix: "Represent this sentence for searching relevant passages: ",
+ docPrefix: "",
+ category: "snowflake",
+ sizeCategory: "small",
+ notes: "22M params. Ultra-small retrieval model. Based on all-MiniLM-L6-v2."
+ },
+ "snowflake-arctic-embed-s": {
+ id: "Snowflake/snowflake-arctic-embed-s",
+ name: "Snowflake Arctic Embed S",
+ dimensions: 384,
+ pooling: "cls",
+ normalize: true,
+ queryPrefix: "Represent this sentence for searching relevant passages: ",
+ docPrefix: "",
+ category: "snowflake",
+ sizeCategory: "small",
+ notes: "33M params. Same size as bge-small, trained for retrieval. Drop-in candidate."
+ },
+ "snowflake-arctic-embed-m-v1.5": {
+ id: "Snowflake/snowflake-arctic-embed-m-v1.5",
+ name: "Snowflake Arctic Embed M v1.5",
+ dimensions: 768,
+ pooling: "cls",
+ normalize: true,
+ queryPrefix: "Represent this sentence for searching relevant passages: ",
+ docPrefix: "",
+ category: "snowflake",
+ sizeCategory: "base",
+ notes: "109M params. BEIR 55.14. Matryoshka (truncate to 256d). 7 ONNX quant variants."
+ },
+ "snowflake-arctic-embed-m-v2.0": {
+ id: "Snowflake/snowflake-arctic-embed-m-v2.0",
+ name: "Snowflake Arctic Embed M v2.0",
+ dimensions: 768,
+ pooling: "cls",
+ normalize: true,
+ queryPrefix: "query: ",
+ docPrefix: "",
+ category: "snowflake",
+ sizeCategory: "base",
+ notes: "305M params. Multilingual, 8K context. Custom GTE arch \u2014 may need trust_remote_code."
+ },
+ // ============================================================
+ // ModernBERT Embedding Models - Latest architecture (2024+)
+ // ============================================================
+ "gte-modernbert-base": {
+ id: "Alibaba-NLP/gte-modernbert-base",
+ name: "GTE ModernBERT Base",
+ dimensions: 768,
+ pooling: "cls",
+ normalize: true,
+ queryPrefix: "",
+ docPrefix: "",
+ category: "gte",
+ sizeCategory: "base",
+ notes: "149M params. CoIR code retrieval 79.31. BEIR 55.33. 8K context. No trust_remote_code. Top candidate."
+ },
+ "modernbert-embed-base": {
+ id: "nomic-ai/modernbert-embed-base",
+ name: "ModernBERT Embed Base (Nomic)",
+ dimensions: 768,
+ pooling: "mean",
+ normalize: true,
+ queryPrefix: "search_query: ",
+ docPrefix: "search_document: ",
+ category: "nomic",
+ sizeCategory: "base",
+ notes: "149M params. BEIR 52.89. Matryoshka (truncate to 256d). 8K context."
  }
  };
- var DEFAULT_MODEL_ID = "bge-small-en-v1.5";
+ var DEFAULT_MODEL_ID = "snowflake-arctic-embed-s";
  function getModelConfig(modelId) {
  if (modelId in MODEL_REGISTRY) {
  return MODEL_REGISTRY[modelId];
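For reference, a minimal sketch of what the new default resolves to, using only fields shown in the registry entry above (it assumes getModelConfig is reachable from the caller):

    // Illustrative lookup of the new default model.
    const cfg = getModelConfig("snowflake-arctic-embed-s");
    // cfg.id === "Snowflake/snowflake-arctic-embed-s"
    // cfg.dimensions === 384, cfg.pooling === "cls"
    // cfg.queryPrefix === "Represent this sentence for searching relevant passages: "
    // cfg.docPrefix === ""  (documents are embedded without a prefix)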
@@ -2403,8 +2459,75 @@ function getConfiguredModelId() {
  }
  return DEFAULT_MODEL_ID;
  }
+ var RERANKER_REGISTRY = {
+ "ms-marco-MiniLM-L-6-v2": {
+ id: "Xenova/ms-marco-MiniLM-L-6-v2",
+ name: "MS MARCO MiniLM L6 v2",
+ notes: "Default reranker. Fast cross-encoder for passage ranking."
+ },
+ "ms-marco-MiniLM-L-12-v2": {
+ id: "Xenova/ms-marco-MiniLM-L-12-v2",
+ name: "MS MARCO MiniLM L12 v2",
+ notes: "Deeper reranker. Better quality but slower."
+ },
+ "bge-reranker-base": {
+ id: "Xenova/bge-reranker-base",
+ name: "BGE Reranker Base",
+ notes: "BGE-family cross-encoder. Better fit for BGE embeddings."
+ },
+ "bge-reranker-large": {
+ id: "Xenova/bge-reranker-large",
+ name: "BGE Reranker Large",
+ notes: "Larger BGE cross-encoder. Higher quality, slower."
+ }
+ };
+
+ // src/services/reranker-env.ts
+ var logger = createLogger("reranker-env");
+ function parseRerankerEnvOverrides(strict) {
+ return {
+ enabled: parseEnabled(process.env["BK_RERANKER_ENABLED"], strict),
+ topK: parseTopK(process.env["BK_RERANKER_TOPK"], strict),
+ model: parseModel(process.env["BK_RERANKER_MODEL"], strict)
+ };
+ }
+ function parseEnabled(raw, strict) {
+ if (raw === void 0 || raw === "") return void 0;
+ if (raw === "1") return true;
+ if (raw === "0") return false;
+ const msg = `BK_RERANKER_ENABLED must be '0' or '1', got: "${raw}"`;
+ if (strict) throw new Error(msg);
+ logger.warn(msg);
+ return void 0;
+ }
+ function parseTopK(raw, strict) {
+ if (raw === void 0 || raw === "") return void 0;
+ const parsed = Number.parseInt(raw, 10);
+ if (Number.isNaN(parsed) || parsed < 1) {
+ const msg = `BK_RERANKER_TOPK must be a positive integer, got: "${raw}"`;
+ if (strict) throw new Error(msg);
+ logger.warn(msg);
+ return void 0;
+ }
+ return parsed;
+ }
+ function parseModel(raw, strict) {
+ if (raw === void 0 || raw === "") return void 0;
+ const entry = RERANKER_REGISTRY[raw];
+ if (entry === void 0) {
+ const valid = Object.keys(RERANKER_REGISTRY).join(", ");
+ const msg = `BK_RERANKER_MODEL must be one of [${valid}], got: "${raw}"`;
+ if (strict) throw new Error(msg);
+ logger.warn(msg);
+ return void 0;
+ }
+ return entry.id;
+ }

  // src/db/embeddings.ts
+ import { homedir } from "os";
+ import { join as join5 } from "path";
+ import { pipeline, env } from "@huggingface/transformers";
  env.cacheDir = join5(homedir(), ".cache", "huggingface-transformers");
  function getFinetunedModelPath() {
  const path4 = process.env["BK_FINETUNED_MODEL"];
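For context, a hedged sketch of how the new BK_RERANKER_MODEL override is resolved by the parsing code above (registry keys map to Xenova model IDs; unknown values warn in lenient mode and throw in strict mode):

    // Illustrative values; behaviour follows parseModel above.
    process.env["BK_RERANKER_ENABLED"] = "1";
    process.env["BK_RERANKER_TOPK"] = "20";
    process.env["BK_RERANKER_MODEL"] = "bge-reranker-base";
    parseRerankerEnvOverrides(true);
    // -> { enabled: true, topK: 20, model: "Xenova/bge-reranker-base" }
    process.env["BK_RERANKER_MODEL"] = "not-a-registered-reranker";
    parseRerankerEnvOverrides(false); // warns and leaves model undefined
    parseRerankerEnvOverrides(true);  // throws: BK_RERANKER_MODEL must be one of [...]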
@@ -2419,14 +2542,16 @@ function buildEmbeddingConfig(modelId, overrides) {
  const envQueryPrefix = process.env["BK_QUERY_PREFIX"];
  const modelConfig = getModelConfig(modelId);
  if (modelConfig === void 0) {
+ const baseModelId = process.env["BK_BASE_MODEL"];
+ const baseConfig = baseModelId !== void 0 && baseModelId !== "" ? getModelConfig(baseModelId) : void 0;
  return {
  model: modelId,
  batchSize: overrides?.batchSize ?? 32,
  dtype: overrides?.dtype ?? "fp32",
- pooling: overrides?.pooling ?? envPooling ?? "mean",
- normalize: overrides?.normalize ?? true,
- queryPrefix: overrides?.queryPrefix ?? envQueryPrefix ?? "",
- docPrefix: overrides?.docPrefix ?? "",
+ pooling: overrides?.pooling ?? envPooling ?? baseConfig?.pooling ?? "mean",
+ normalize: overrides?.normalize ?? baseConfig?.normalize ?? true,
+ queryPrefix: overrides?.queryPrefix ?? envQueryPrefix ?? baseConfig?.queryPrefix ?? "",
+ docPrefix: overrides?.docPrefix ?? baseConfig?.docPrefix ?? "",
  maxInFlightBatches: overrides?.maxInFlightBatches ?? 1
  };
  }
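A sketch of the new fallback chain: when modelId is not in MODEL_REGISTRY (for example a fine-tuned checkpoint), BK_BASE_MODEL can name a registry entry whose pooling and prefixes are inherited. The checkpoint path below is hypothetical, and no overrides or pooling env vars are assumed to be set:

    process.env["BK_BASE_MODEL"] = "snowflake-arctic-embed-s";
    const cfg = buildEmbeddingConfig("/models/my-finetuned-arctic");
    // cfg.pooling === "cls" and cfg.queryPrefix come from the
    // snowflake-arctic-embed-s registry entry rather than the old
    // hard-coded "mean" / "" defaults.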
@@ -2670,10 +2795,10 @@ var DEFAULT_CONFIG = {
  version: 1,
  dataDir: ".bluera/bluera-knowledge/data",
  embedding: {
- model: "Xenova/bge-small-en-v1.5",
+ model: "Snowflake/snowflake-arctic-embed-s",
  batchSize: 32,
  dtype: "fp32",
- pooling: "mean",
+ pooling: "cls",
  normalize: true,
  queryPrefix: "Represent this sentence for searching relevant passages: ",
  docPrefix: "",
@@ -2824,13 +2949,14 @@ var ConfigService = class {
  };
  }
  const rerankerOverrides = parseRerankerEnvOverrides(false);
- if (rerankerOverrides.enabled !== void 0 || rerankerOverrides.topK !== void 0) {
+ if (rerankerOverrides.enabled !== void 0 || rerankerOverrides.topK !== void 0 || rerankerOverrides.model !== void 0) {
  this.config = {
  ...this.config,
  reranker: {
  ...this.config.reranker,
  ...rerankerOverrides.enabled !== void 0 ? { enabled: rerankerOverrides.enabled } : {},
- ...rerankerOverrides.topK !== void 0 ? { topK: rerankerOverrides.topK } : {}
+ ...rerankerOverrides.topK !== void 0 ? { topK: rerankerOverrides.topK } : {},
+ ...rerankerOverrides.model !== void 0 ? { model: rerankerOverrides.model } : {}
  }
  };
  }
@@ -3361,11 +3487,15 @@ var DriftService = class {
  }
  };

- // src/utils/text-extensions.ts
+ // src/services/index.service.ts
+ var minimatch = minimatchFn;
+ var execFileAsync = promisify(execFile);
+ var logger2 = createLogger("index-service");
  var TEXT_EXTENSIONS = /* @__PURE__ */ new Set([
  // Text/docs
  ".txt",
  ".md",
+ ".mdx",
  ".rst",
  ".adoc",
  // JavaScript/TypeScript
@@ -3471,11 +3601,33 @@ var TEXT_EXTENSIONS = /* @__PURE__ */ new Set([
  ".makefile",
  ".cmake"
  ]);
-
- // src/services/index.service.ts
- var minimatch = minimatchFn;
- var execFileAsync = promisify(execFile);
- var logger2 = createLogger("index-service");
+ var TEXT_BASENAMES = /* @__PURE__ */ new Set([
+ // Build systems
+ "Dockerfile",
+ "Makefile",
+ "Rakefile",
+ "Gemfile",
+ "Procfile",
+ "Vagrantfile",
+ "Justfile",
+ "Brewfile",
+ "Earthfile",
+ "Tiltfile",
+ "CMakeLists.txt",
+ "BUILD",
+ "BUILD.bazel",
+ "WORKSPACE",
+ // Dotfiles (config)
+ ".gitignore",
+ ".gitattributes",
+ ".editorconfig",
+ ".dockerignore",
+ ".eslintignore",
+ ".prettierignore",
+ ".npmignore",
+ ".env.example",
+ ".env.sample"
+ ]);
  function isMinifiedFile(filename) {
  const minPatterns = [
  /\.min\.(js|css|mjs|cjs)$/i,
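Discovery now accepts a file when either its extension is in TEXT_EXTENSIONS or its exact basename is in TEXT_BASENAMES (see the filterFiles and scanDirectory hunks below). A minimal sketch of that combined check; the helper name is illustrative:

    import { basename, extname } from "path";

    // Sketch of the combined extension-or-basename acceptance check.
    function isIndexableName(filePath) {
      const ext = extname(filePath).toLowerCase();
      return TEXT_EXTENSIONS.has(ext) || TEXT_BASENAMES.has(basename(filePath));
    }

    // isIndexableName("services/api/Dockerfile")  -> true  (basename match)
    // isIndexableName("docs/getting-started.mdx") -> true  (new .mdx extension)
    // isIndexableName("assets/logo.png")          -> false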
@@ -3542,6 +3694,7 @@ var EXT_TO_LANGUAGE = {
  ".psm1": "powershell",
  ".sql": "sql",
  ".md": "markdown",
+ ".mdx": "mdx",
  ".rst": "restructuredtext",
  ".lua": "lua",
  ".r": "r",
@@ -3581,6 +3734,19 @@ var EXT_TO_LANGUAGE = {
  ".toml": "toml",
  ".xml": "xml"
  };
+ var BASENAME_TO_LANGUAGE = {
+ Dockerfile: "dockerfile",
+ Makefile: "makefile",
+ Rakefile: "ruby",
+ Gemfile: "ruby",
+ Brewfile: "ruby",
+ "CMakeLists.txt": "cmake",
+ BUILD: "starlark",
+ "BUILD.bazel": "starlark",
+ WORKSPACE: "starlark",
+ Tiltfile: "starlark",
+ Justfile: "just"
+ };
  var ENTRY_POINT_NAMES = /* @__PURE__ */ new Set([
  "index.ts",
  "index.js",
@@ -3969,7 +4135,7 @@ ${rawContent}` : rawContent;
  const ext = extname(filePath).toLowerCase();
  const fileName = basename(filePath).toLowerCase();
  const fileType = this.classifyFileType(ext, fileName, filePath);
- const language = EXT_TO_LANGUAGE[ext];
+ const language = EXT_TO_LANGUAGE[ext] ?? BASENAME_TO_LANGUAGE[basename(filePath)];
  const normalizedRelPath = relativePath.replaceAll("\\", "/");
  const depth = normalizedRelPath.split("/").length - 1;
  const isEntryPoint = ENTRY_POINT_NAMES.has(basename(filePath));
@@ -4058,7 +4224,8 @@ ${rawContent}` : rawContent;
  candidates = await this.scanDirectory(storePath);
  logger2.debug({ storePath, count: candidates.length }, "Using filesystem walk for discovery");
  }
- return this.filterFiles(candidates, storePath, ingestConfig);
+ const { files } = await this.filterFiles(candidates, storePath, ingestConfig);
+ return files;
  }
  /**
  * Apply extension filter, ignore patterns, size limit, and per-store ingest filters.
@@ -4070,23 +4237,35 @@ ${rawContent}` : rawContent;
  const skippedMinified = [];
  const skippedBinary = [];
  const skippedExcluded = [];
+ let skippedExtension = 0;
+ let skippedDirSegment = 0;
+ let skippedIgnorePattern = 0;
+ let skippedStatError = 0;
  const maxFileSize = ingestConfig?.maxFileSizeBytes ?? this.maxFileSizeBytes;
- const skipMinified = ingestConfig?.skipMinified ?? true;
+ const skipMinifiedOpt = ingestConfig?.skipMinified ?? true;
  const skipBinaries = ingestConfig?.skipBinaries ?? true;
  const excludeGlobs = ingestConfig?.excludeGlobs ?? [];
+ const maxFiles = ingestConfig?.maxFiles;
  for (const filePath of files) {
  const ext = extname(filePath).toLowerCase();
  const filename = basename(filePath);
- if (!TEXT_EXTENSIONS.has(ext)) continue;
+ if (!TEXT_EXTENSIONS.has(ext) && !TEXT_BASENAMES.has(filename)) {
+ skippedExtension++;
+ continue;
+ }
  const relativePath = relative(storePath, filePath).replaceAll("\\", "/");
  const pathSegments = relativePath.split("/");
  const dirSegments = pathSegments.slice(0, -1);
  if (dirSegments.some((segment) => this.ignoreDirs.has(segment))) {
+ skippedDirSegment++;
  continue;
  }
  const shouldIgnore = this.ignoreFilePatterns.some((matcher) => matcher(filename));
- if (shouldIgnore) continue;
- if (skipMinified && isMinifiedFile(filename)) {
+ if (shouldIgnore) {
+ skippedIgnorePattern++;
+ continue;
+ }
+ if (skipMinifiedOpt && isMinifiedFile(filename)) {
  skippedMinified.push(filename);
  continue;
  }
@@ -4100,13 +4279,14 @@ ${rawContent}` : rawContent;
  }
  let fileSize;
  try {
- const stats = await stat2(filePath);
- fileSize = stats.size;
+ const stats2 = await stat2(filePath);
+ fileSize = stats2.size;
  if (fileSize > maxFileSize) {
  skippedLarge.push({ path: filePath, size: fileSize });
  continue;
  }
  } catch {
+ skippedStatError++;
  continue;
  }
  if (skipBinaries) {
@@ -4116,17 +4296,23 @@ ${rawContent}` : rawContent;
  continue;
  }
  } catch {
+ skippedStatError++;
  continue;
  }
  }
  result.push(filePath);
  }
+ if (maxFiles !== void 0 && result.length > maxFiles) {
+ logger2.info({ storePath, total: result.length, maxFiles }, "Applying maxFiles cap");
+ result.length = maxFiles;
+ }
+ const skippedLargeBytes = skippedLarge.reduce((sum, f) => sum + f.size, 0);
  if (skippedLarge.length > 0) {
  logger2.info(
  {
  storePath,
  count: skippedLarge.length,
- totalBytes: skippedLarge.reduce((sum, f) => sum + f.size, 0),
+ totalBytes: skippedLargeBytes,
  examples: skippedLarge.slice(0, 5).map((f) => relative(storePath, f.path))
  },
  "Skipped large files"
@@ -4150,7 +4336,21 @@ ${rawContent}` : rawContent;
  "Skipped excluded files"
  );
  }
- return result;
+ const stats = {
+ candidates: files.length,
+ accepted: result.length,
+ skippedExtension,
+ skippedDirSegment,
+ skippedIgnorePattern,
+ skippedMinified: skippedMinified.length,
+ skippedExcluded: skippedExcluded.length,
+ skippedLargeFiles: skippedLarge.length,
+ skippedLargeBytes,
+ skippedBinary: skippedBinary.length,
+ skippedStatError
+ };
+ logger2.info({ storePath, ...stats }, "File filtering complete");
+ return { files: result, stats };
  }
  async scanDirectory(dir) {
  const files = [];
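filterFiles now returns both the accepted paths and a per-reason breakdown, which discoverFiles destructures above. A hedged usage sketch (access to the index service instance is assumed):

    const { files, stats } = await indexService.filterFiles(candidates, storePath, ingestConfig);
    // stats.candidates, stats.accepted, stats.skippedExtension, stats.skippedBinary,
    // stats.skippedLargeFiles, stats.skippedLargeBytes, ... mirror the object built above.
    console.log(`accepted ${stats.accepted} of ${stats.candidates} files`);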
@@ -4167,7 +4367,7 @@ ${rawContent}` : rawContent;
  continue;
  }
  const ext = extname(entry.name).toLowerCase();
- if (TEXT_EXTENSIONS.has(ext)) {
+ if (TEXT_EXTENSIONS.has(ext) || TEXT_BASENAMES.has(entry.name)) {
  files.push(fullPath);
  }
  }
@@ -4761,7 +4961,8 @@ function parseSearchEnvOverrides(strict) {
  return {
  rrfK: parseRrfK(process.env["BK_RRF_K"], strict),
  vectorWeight: parseVectorWeight(process.env["BK_RRF_VECTOR_WEIGHT"], strict),
- candidateMultiplier: parseCandidateMultiplier(process.env["BK_CANDIDATE_MULTIPLIER"], strict)
+ candidateMultiplier: parseCandidateMultiplier(process.env["BK_CANDIDATE_MULTIPLIER"], strict),
+ queryExpansion: parseBoolFlag(process.env["BK_QUERY_EXPANSION"], "BK_QUERY_EXPANSION", strict)
  };
  }
  function parseRrfK(raw, strict) {
@@ -4797,6 +4998,69 @@ function parseCandidateMultiplier(raw, strict) {
  }
  return parsed;
  }
+ function parseBoolFlag(raw, name, strict) {
+ if (raw === void 0 || raw === "") return void 0;
+ if (raw === "1") return true;
+ if (raw === "0") return false;
+ const msg = `${name} must be '0' or '1', got: "${raw}"`;
+ if (strict) throw new Error(msg);
+ logger3.warn(msg);
+ return void 0;
+ }
+
+ // src/utils/code-tokenizer.ts
+ function splitIdentifier(identifier) {
+ const segments = identifier.split(/[_.]/).filter((s) => s.length > 0);
+ const words = [];
+ for (const segment of segments) {
+ const parts = segment.replace(/([a-z])([A-Z])/g, "$1\0$2").replace(/([A-Z]{2,})([A-Z][a-z])/g, "$1\0$2").replace(/([a-zA-Z])(\d)/g, "$1\0$2").replace(/(\d)([a-zA-Z])/g, "$1\0$2").split("\0");
+ for (const part of parts) {
+ if (part.length > 0) {
+ words.push(part);
+ }
+ }
+ }
+ return words;
+ }
+ var IDENTIFIER_PATTERN = /(?:[a-zA-Z_$][\w$]*(?:\.[\w$]+)*)/g;
+ function isSplittable(identifier) {
+ if (/[a-z][A-Z]/.test(identifier)) return true;
+ if (identifier.includes("_")) return true;
+ if (identifier.includes(".")) return true;
+ if (/[A-Z]{2,}[a-z]/.test(identifier)) return true;
+ if (/[a-zA-Z]\d|\d[a-zA-Z]/.test(identifier)) return true;
+ return false;
+ }
+ function extractSplitVariants(text) {
+ const seen = /* @__PURE__ */ new Set();
+ const variants = [];
+ for (const match of text.matchAll(IDENTIFIER_PATTERN)) {
+ const identifier = match[0];
+ if (identifier.length < 3 || seen.has(identifier)) continue;
+ seen.add(identifier);
+ if (!isSplittable(identifier)) continue;
+ const parts = splitIdentifier(identifier);
+ if (parts.length > 1) {
+ variants.push(parts.join(" "));
+ const lower = parts.map((p) => p.toLowerCase()).join(" ");
+ if (lower !== parts.join(" ").toLowerCase()) {
+ variants.push(lower);
+ }
+ }
+ }
+ return variants.join(" ");
+ }
+ function buildFtsContent(originalContent) {
+ const variants = extractSplitVariants(originalContent);
+ if (variants.length === 0) return originalContent;
+ return `${originalContent}
+ ${variants}`;
+ }
+ function normalizeFtsQuery(query) {
+ const variants = extractSplitVariants(query);
+ if (variants.length === 0) return query;
+ return `${query} ${variants}`;
+ }

  // src/services/search.service.ts
  var logger4 = createLogger("search-service");
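Worked examples for the new code tokenizer, traced from the functions above (camelCase, snake_case, dotted names, and letter/digit boundaries are split, and the variants are appended so FTS can match individual words):

    splitIdentifier("getUserById");    // ["get", "User", "By", "Id"]
    splitIdentifier("fts_content");    // ["fts", "content"]
    normalizeFtsQuery("getUserById");  // "getUserById get User By Id"
    buildFtsContent("call getUserById(id)");
    // -> the original text plus a trailing line containing "get User By Id"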
@@ -4868,6 +5132,18 @@ var INTENT_FILE_BOOSTS = {
  changelog: 1.1,
  // Often contains bug fixes and known issues
  other: 1
+ },
+ testing: {
+ "documentation-primary": 0.8,
+ documentation: 0.85,
+ example: 1,
+ source: 0.9,
+ "source-internal": 0.85,
+ test: 1.5,
+ // Tests are exactly what the user wants
+ config: 0.7,
+ changelog: 0.6,
+ other: 0.9
  }
  };
  var FRAMEWORK_PATTERNS = [
@@ -4923,12 +5199,23 @@ var CONCEPTUAL_PATTERNS = [
  /\bhow does .* work\b/i,
  /\bwhat('s| is) the (purpose|point|idea)\b/i
  ];
+ var TESTING_PATTERNS = [
+ /\b(test|tests|testing)\b/i,
+ /\b(mock|mocking|stub|stubs)\b/i,
+ /\b(assert|assertion|expect)\b/i,
+ /\btest\s*(client|runner|helper|fixture|suite)\b/i,
+ /\b(unit|integration|e2e|end-to-end)\s*test/i,
+ /\b(pytest|jest|vitest|mocha|gotest)\b/i
+ ];
  function classifyQueryIntents(query) {
  const q = query.toLowerCase();
  const intents = [];
  if (IMPLEMENTATION_PATTERNS.some((p) => p.test(q))) {
  intents.push({ intent: "implementation", confidence: 0.9 });
  }
+ if (TESTING_PATTERNS.some((p) => p.test(q))) {
+ intents.push({ intent: "testing", confidence: 0.85 });
+ }
  if (DEBUGGING_PATTERNS.some((p) => p.test(q))) {
  intents.push({ intent: "debugging", confidence: 0.85 });
  }
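Worked example for the new testing intent (other pattern lists live outside this hunk, so a query like this may also collect additional intents):

    classifyQueryIntents("how to mock the test client");
    // -> includes { intent: "testing", confidence: 0.85 }
    // With a testing intent, test files pick up the 1.5 multiplier from
    // INTENT_FILE_BOOSTS.testing instead of being capped at 0.6
    // (see the test-file cap change further down).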
@@ -4954,41 +5241,16 @@ function mapSearchIntentToQueryIntent(intent) {
  case "find-pattern":
  case "find-implementation":
  case "find-definition":
- case "find-files":
  return "implementation";
  case "find-usage":
  case "find-documentation":
  return "how-to";
+ case "find-files":
+ return "implementation";
  }
  }
- var INTENT_EXPANSION_TERMS = {
- "find-implementation": "source code implementation function class",
- "find-documentation": "documentation guide tutorial example",
- "find-usage": "usage example how to use",
- "find-pattern": "pattern matching code structure",
- "find-definition": "definition type interface declaration",
- "find-files": "file module path"
- };
- function expandQueryWithIntent(query, intent) {
- if (intent === void 0) return query;
- const expansion = INTENT_EXPANSION_TERMS[intent];
- return `${query} ${expansion}`;
- }
- function isStrongFtsSignal(query, ftsResults) {
- if (ftsResults.length < 2) return false;
- const top = ftsResults[0];
- const second = ftsResults[1];
- if (top === void 0 || second === void 0) return false;
- if (second.score > 0 && top.score / second.score <= 2) return false;
- const queryLower = query.toLowerCase();
- const rawFile = top.metadata["file"] ?? top.metadata["path"];
- const rawName = top.metadata["name"];
- const filePath = typeof rawFile === "string" ? rawFile : "";
- const name = typeof rawName === "string" ? rawName : "";
- return filePath.toLowerCase().includes(queryLower) || name.toLowerCase().includes(queryLower);
- }
  var RRF_PRESETS = {
- code: { k: 25, vectorWeight: 0.75, ftsWeight: 0.25 },
+ code: { k: 25, vectorWeight: 0.35, ftsWeight: 0.65 },
  web: { k: 30, vectorWeight: 0.7, ftsWeight: 0.3 }
  };
  var DEFAULT_CANDIDATE_MULTIPLIER = 2;
@@ -5075,8 +5337,7 @@ var SearchService = class {
  let rerankTimeMs;
  const fetchLimit = limit * 3;
  if (mode === "vector") {
- const expandedQuery = expandQueryWithIntent(query.query, query.intent);
- const rawResults = await this.vectorSearchRaw(expandedQuery, stores, fetchLimit);
+ const rawResults = await this.vectorSearchRaw(query.query, stores, fetchLimit);
  maxRawScore = rawResults.length > 0 ? rawResults[0]?.score ?? 0 : 0;
  allResults = this.normalizeAndFilterScores(rawResults, query.threshold).slice(0, fetchLimit);
  } else if (mode === "fts") {
@@ -5086,8 +5347,7 @@ var SearchService = class {
  query.query,
  stores,
  fetchLimit,
- query.threshold,
- query.intent
+ query.threshold
  );
  allResults = hybridResult.results;
  maxRawScore = hybridResult.maxRawScore;
@@ -5226,6 +5486,41 @@ var SearchService = class {
  }
  return normalized;
  }
+ /**
+ * Generate query variants for multi-query expansion.
+ * Strips intent prefixes to create a keyword-focused variant.
+ * Returns original + variants (deduplicated).
+ */
+ expandQuery(query) {
+ const queries = [query];
+ const stripped = query.replace(
+ /^(how to |how do I |how does |implement |usage of |find the |what is |show me |where is |where are )/i,
+ ""
+ ).trim();
+ if (stripped !== query && stripped.length >= 5) {
+ queries.push(stripped);
+ }
+ return queries;
+ }
+ /**
+ * Run vector search across multiple query variants and merge results.
+ * Deduplicates by document ID, keeping the highest score.
+ */
+ async multiQueryVectorSearch(queries, stores, limit) {
+ const allResults = await Promise.all(
+ queries.map((q) => this.vectorSearchRaw(q, stores, limit))
+ );
+ const merged = /* @__PURE__ */ new Map();
+ for (const results of allResults) {
+ for (const r of results) {
+ const existing = merged.get(r.id);
+ if (existing === void 0 || r.score > existing.score) {
+ merged.set(r.id, r);
+ }
+ }
+ }
+ return [...merged.values()].sort((a, b) => b.score - a.score).slice(0, limit);
+ }
  /**
  * Fetch raw vector search results without normalization.
  * Returns results with raw cosine similarity scores [0-1].
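Worked example of the opt-in multi-query expansion (enabled via BK_QUERY_EXPANSION=1, parsed by parseSearchEnvOverrides above); searchService stands in for an instance of this class:

    searchService.expandQuery("how to create a store");
    // -> ["how to create a store", "create a store"]
    searchService.expandQuery("create a store");
    // -> ["create a store"]   (no intent prefix to strip)
    searchService.expandQuery("what is it");
    // -> ["what is it"]       (stripped variant "it" is shorter than 5 chars)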
@@ -5249,9 +5544,10 @@ var SearchService = class {
  }
  async ftsSearch(query, stores, limit) {
  const results = [];
+ const normalizedQuery = normalizeFtsQuery(query);
  for (const storeId of stores) {
  try {
- const hits = await this.lanceStore.fullTextSearch(storeId, query, limit);
+ const hits = await this.lanceStore.fullTextSearch(storeId, normalizedQuery, limit);
  results.push(
  ...hits.map((r) => ({
  id: r.id,
@@ -5268,37 +5564,20 @@ var SearchService = class {
  /**
  * Internal hybrid search result with additional metadata for confidence calculation.
  */
- async hybridSearchWithMetadata(query, stores, limit, threshold, searchIntent) {
+ async hybridSearchWithMetadata(query, stores, limit, threshold) {
  const intents = classifyQueryIntents(query);
  const envOverrides = parseSearchEnvOverrides(false);
  const candidateMultiplier = envOverrides.candidateMultiplier ?? DEFAULT_CANDIDATE_MULTIPLIER;
- const ftsResults = await this.ftsSearch(query, stores, limit * candidateMultiplier);
- if (isStrongFtsSignal(query, ftsResults)) {
- logger4.debug(
- { query, topScore: ftsResults[0]?.score },
- "Strong FTS signal \u2014 skipping vector search"
- );
- const sorted2 = ftsResults.slice(0, limit).map((r, i) => ({
- ...r,
- score: Math.round((1 - i / Math.max(ftsResults.length, 1)) * 1e6) / 1e6
- }));
- if (threshold !== void 0) {
- return { results: sorted2.filter((r) => r.score >= threshold), maxRawScore: 0 };
- }
- return { results: sorted2, maxRawScore: 0 };
- }
- const expandedQuery = expandQueryWithIntent(query, searchIntent);
- const rawVectorResults = await this.vectorSearchRaw(
- expandedQuery,
- stores,
- limit * candidateMultiplier
- );
+ const fetchLimit = limit * candidateMultiplier;
+ const useExpansion = envOverrides.queryExpansion === true;
+ const rawVectorResults = useExpansion ? await this.multiQueryVectorSearch(this.expandQuery(query), stores, fetchLimit) : await this.vectorSearchRaw(query, stores, fetchLimit);
  const rawVectorScores = /* @__PURE__ */ new Map();
  rawVectorResults.forEach((r) => {
  rawVectorScores.set(r.id, r.score);
  });
  const maxRawScore = rawVectorResults.length > 0 ? rawVectorResults[0]?.score ?? 0 : 0;
  const vectorResults = this.normalizeAndFilterScores(rawVectorResults);
+ const ftsResults = await this.ftsSearch(query, stores, limit * candidateMultiplier);
  const vectorRanks = /* @__PURE__ */ new Map();
  const ftsRanks = /* @__PURE__ */ new Map();
  const allDocs = /* @__PURE__ */ new Map();
@@ -5334,6 +5613,8 @@ var SearchService = class {
  const pathKeywordBoost = this.getPathKeywordBoost(query, result2);
  const depthBoost = this.getDepthBoost(result2, getPrimaryIntent(intents));
  const entryPointBoost = this.getEntryPointBoost(result2, getPrimaryIntent(intents));
+ const sectionHeaderBoost = 1;
+ const symbolNameBoost = 1;
  const metadata = {
  vectorRRF,
  ftsRRF,
@@ -5342,7 +5623,9 @@ var SearchService = class {
  urlKeywordBoost,
  pathKeywordBoost,
  depthBoost,
- entryPointBoost
+ entryPointBoost,
+ sectionHeaderBoost,
+ symbolNameBoost
  };
  if (vectorRank !== Infinity) {
  metadata.vectorRank = vectorRank;
@@ -5355,7 +5638,7 @@ var SearchService = class {
  }
  rrfScores.push({
  id,
- score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost * urlKeywordBoost * pathKeywordBoost * depthBoost * entryPointBoost,
+ score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost * urlKeywordBoost * pathKeywordBoost * depthBoost * entryPointBoost * sectionHeaderBoost * symbolNameBoost,
  result: result2,
  rawVectorScore,
  metadata
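For orientation, a sketch of how the new preset weights enter the combined score. The exact vectorRRF/ftsRRF computation is outside this diff; the standard reciprocal-rank-fusion form weight / (k + rank) is assumed, with k = 25, vectorWeight = 0.35, ftsWeight = 0.65 from the updated code preset:

    const k = 25;
    const rrf = (weight, rank) => weight / (k + rank); // assumed standard RRF form
    const vectorRRF = rrf(0.35, 1); // document ranked 1st by vector search
    const ftsRRF = rrf(0.65, 3);    // same document ranked 3rd by FTS
    const base = vectorRRF + ftsRRF; // ~0.0135 + ~0.0232, about 0.0367
    // The boost multipliers (fileTypeBoost, frameworkBoost, ...) then scale this
    // base, as in the rrfScores.push(...) call above.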
@@ -5376,20 +5659,10 @@ var SearchService = class {
  reranked.results.forEach((r) => {
  rerankedScores.set(r.id, r.rerankerScore);
  });
- const maxRrfScore = sortedAll[0]?.score ?? 1;
- sorted = sortedAll.map((r, rrfRank) => {
- const rerankerScore = rerankedScores.get(r.id);
- if (rerankerScore === void 0) {
- return { ...r, blendedScore: -Infinity };
- }
- const normalizedRrf = maxRrfScore > 0 ? r.score / maxRrfScore : 0;
- const rrfWeight = rrfRank < 3 ? 0.7 : rrfRank < 10 ? 0.5 : 0.3;
- const rerankerWeight = 1 - rrfWeight;
- return {
- ...r,
- blendedScore: normalizedRrf * rrfWeight + rerankerScore * rerankerWeight
- };
- }).sort((a, b) => b.blendedScore - a.blendedScore).slice(0, limit);
+ sorted = sortedAll.map((r) => ({
+ ...r,
+ rerankerScore: rerankedScores.get(r.id)
+ })).sort((a, b) => (b.rerankerScore ?? -Infinity) - (a.rerankerScore ?? -Infinity)).slice(0, limit);
  } else {
  sorted = sortedAll.slice(0, limit);
  }
@@ -5489,7 +5762,9 @@ var SearchService = class {
  const blendedMultiplier = totalConfidence > 0 ? weightedMultiplier / totalConfidence : 1;
  const finalBoost = baseBoost * blendedMultiplier;
  if (fileType === "test") {
- return Math.min(finalBoost, 0.6);
+ const primaryIntent = intents[0]?.intent;
+ const cap = primaryIntent === "testing" ? 1.5 : 0.6;
+ return Math.min(finalBoost, cap);
  }
  return finalBoost;
  }
@@ -5646,12 +5921,6 @@ var SearchService = class {
  location: `${path4}${codeUnit ? `:${String(codeUnit.startLine)}` : ""}`,
  relevanceReason: this.generateRelevanceReason(result, query)
  };
- if (graph) {
- const relatedFiles = this.getRelatedFilePaths(graph, path4, symbolName);
- if (relatedFiles.length > 0) {
- enhanced.summary = { ...enhanced.summary, relatedFiles };
- }
- }
  if (detail === "contextual" || detail === "full") {
  const usage = this.getUsageFromGraph(graph, path4, symbolName);
  enhanced.context = {
@@ -5867,23 +6136,9 @@ var SearchService = class {
  };
  }
  /**
- * Get related file paths from code graph edges for follow-up reads.
- * Returns unique file paths (max 5) from callers/callees, excluding the result's own file.
+ * Get related code from graph.
+ * Returns callers and callees for the symbol.
  */
- getRelatedFilePaths(graph, filePath, symbolName) {
- if (symbolName === "" || symbolName === "(anonymous)") return [];
- const nodeId = `${filePath}:${symbolName}`;
- const files = /* @__PURE__ */ new Set();
- for (const edge of graph.getIncomingEdges(nodeId)) {
- const [file] = this.parseNodeId(edge.from);
- if (file && file !== filePath) files.add(file);
- }
- for (const edge of graph.getEdges(nodeId)) {
- const [file] = this.parseNodeId(edge.to);
- if (file && file !== filePath) files.add(file);
- }
- return Array.from(files).slice(0, 5);
- }
  getRelatedCodeFromGraph(graph, filePath, symbolName) {
  if (!graph || symbolName === "" || symbolName === "(anonymous)") {
  return [];
@@ -5947,7 +6202,9 @@ var IngestConfigSchema = z3.object({
  /** Skip binary files detected by content heuristic - default true */
  skipBinaries: z3.boolean().optional(),
  /** Override max file size for this store (bytes) */
- maxFileSizeBytes: z3.number().int().positive().optional()
+ maxFileSizeBytes: z3.number().int().positive().optional(),
+ /** Maximum number of files to index (cap applied after all other filters) */
+ maxFiles: z3.number().int().positive().optional()
  });
  var FileStoreDefinitionSchema = BaseStoreDefinitionSchema.extend({
  type: z3.literal("file"),
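A hedged example of a per-store ingest block using the new cap; the field names come from IngestConfigSchema and the filterFiles hunk, the values are illustrative, and the surrounding store definition is omitted:

    const ingest = {
      skipMinified: true,
      skipBinaries: true,
      excludeGlobs: ["**/*.snap"],
      maxFileSizeBytes: 1048576, // 1 MiB per file
      maxFiles: 5000             // hard cap applied after all other filters
    };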
@@ -6190,7 +6447,7 @@ async function cloneRepository(options) {
  if (partialClone) {
  args.push("--filter=blob:none");
  }
- args.push("--depth", String(depth), "--single-branch");
+ args.push("--depth", String(depth), "--single-branch", "--no-tags");
  if (branch !== void 0) {
  args.push("--branch", branch);
  }
@@ -6202,12 +6459,17 @@ async function cloneRepository(options) {
  branch,
  depth,
  singleBranch: true,
+ noTags: true,
+ lfsSkipSmudge: true,
  partialClone
  },
  "Starting git clone"
  );
  return new Promise((resolve4) => {
- const git = spawn("git", args, { stdio: ["ignore", "pipe", "pipe"] });
+ const git = spawn("git", args, {
+ stdio: ["ignore", "pipe", "pipe"],
+ env: { ...process.env, GIT_LFS_SKIP_SMUDGE: "1" }
+ });
  let timedOut = false;
  let forceKillTimeout = null;
  const timeout = setTimeout(() => {
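Taken together, the clone changes amount to the spawn shape below; the depth value and the trailing URL/destination arguments are assumptions, only the flags and the env entry come from the diff:

    // args ends up roughly as:
    // ["clone", "--filter=blob:none", "--depth", "1", "--single-branch", "--no-tags", "--branch", "main", repoUrl, targetDir]
    spawn("git", args, {
      stdio: ["ignore", "pipe", "pipe"],
      env: { ...process.env, GIT_LFS_SKIP_SMUDGE: "1" } // skip downloading LFS content
    });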
@@ -7212,6 +7474,8 @@ var LanceStore = class {
  const schema = LanceSchema({
  id: new Utf8(),
  content: this.embeddingFunction.sourceField(),
+ fts_content: new Utf8(),
+ // content + identifier-split variants for FTS
  vector: this.embeddingFunction.vectorField(),
  metadata: new Utf8()
  });
@@ -7222,6 +7486,7 @@ var LanceStore = class {
  {
  id: "__init__",
  content: "",
+ fts_content: "",
  vector: new Array(this._dimensions).fill(0),
  metadata: "{}"
  }
@@ -7236,13 +7501,25 @@ var LanceStore = class {
  }
  async addDocuments(storeId, documents) {
  const table = await this.getTable(storeId);
- const lanceDocuments = documents.map((doc) => ({
- id: doc.id,
- content: doc.content,
- vector: [...doc.vector],
- metadata: JSON.stringify(doc.metadata)
- }));
- await table.add(lanceDocuments);
+ const hasFts = await this.hasFtsContentColumn(storeId);
+ if (hasFts) {
+ const lanceDocuments = documents.map((doc) => ({
+ id: doc.id,
+ content: doc.content,
+ fts_content: buildFtsContent(doc.content),
+ vector: [...doc.vector],
+ metadata: JSON.stringify(doc.metadata)
+ }));
+ await table.add(lanceDocuments);
+ } else {
+ const lanceDocuments = documents.map((doc) => ({
+ id: doc.id,
+ content: doc.content,
+ vector: [...doc.vector],
+ metadata: JSON.stringify(doc.metadata)
+ }));
+ await table.add(lanceDocuments);
+ }
  }
  async deleteDocuments(storeId, documentIds) {
  if (documentIds.length === 0) {
@@ -7297,10 +7574,29 @@ var LanceStore = class {
  }
  async createFtsIndex(storeId) {
  const table = await this.getTable(storeId);
- await table.createIndex("content", {
- config: lancedb.Index.fts()
+ const ftsColumn = await this.hasFtsContentColumn(storeId) ? "fts_content" : "content";
+ await table.createIndex(ftsColumn, {
+ config: lancedb.Index.fts({
+ stem: true,
+ removeStopWords: false,
+ lowercase: true,
+ language: "English"
+ })
  });
  }
+ /**
+ * Check if a table has the fts_content column (v3 schema).
+ * Tables created before the FTS improvement only have content.
+ */
+ async hasFtsContentColumn(storeId) {
+ try {
+ const table = await this.getTable(storeId);
+ const schema = await table.schema();
+ return schema.fields.some((f) => f.name === "fts_content");
+ } catch {
+ return false;
+ }
+ }
  async fullTextSearch(storeId, query, limit) {
  const table = await this.getTable(storeId);
  const results = await table.search(query, "fts").limit(limit).toArray();
@@ -7637,7 +7933,6 @@ export {
  ASTParser,
  ok,
  err,
- TEXT_EXTENSIONS,
  classifyWebContentType,
  isFileStoreDefinition,
  isRepoStoreDefinition,
@@ -7650,4 +7945,4 @@ export {
  createServices,
  destroyServices
  };
- //# sourceMappingURL=chunk-L2SC6J4K.js.map
+ //# sourceMappingURL=chunk-724FNI27.js.map