sweet-search 2.4.2 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/core/cli.js +43 -5
  2. package/core/embedding/embedding-cache.js +266 -18
  3. package/core/embedding/embedding-service.js +45 -9
  4. package/core/graph/graph-expansion.js +52 -12
  5. package/core/graph/graph-extractor.js +30 -1
  6. package/core/indexing/ast-chunker.js +331 -16
  7. package/core/indexing/chunking/chunk-builder.js +34 -1
  8. package/core/indexing/index-codebase-v21.js +31 -2
  9. package/core/indexing/index.js +6 -3
  10. package/core/indexing/indexer-ann.js +45 -6
  11. package/core/indexing/indexer-build.js +9 -1
  12. package/core/indexing/indexer-phases.js +6 -4
  13. package/core/indexing/indexing-file-policy.js +140 -0
  14. package/core/indexing/li-skip-policy.js +11 -220
  15. package/core/infrastructure/codebase-repository.js +21 -0
  16. package/core/infrastructure/config/embedding.js +20 -1
  17. package/core/infrastructure/config/graph.js +2 -2
  18. package/core/infrastructure/config/ranking.js +10 -0
  19. package/core/infrastructure/config/vector-store.js +1 -1
  20. package/core/infrastructure/coreml-cascade.js +236 -30
  21. package/core/infrastructure/coreml-cascade.json +25 -0
  22. package/core/infrastructure/index.js +17 -0
  23. package/core/infrastructure/init-config.js +216 -0
  24. package/core/infrastructure/language-patterns/registry-core.js +18 -0
  25. package/core/infrastructure/model-registry.js +12 -0
  26. package/core/infrastructure/native-inference.js +143 -51
  27. package/core/infrastructure/tree-sitter-provider.js +92 -2
  28. package/core/ranking/cascaded-scorer.js +6 -2
  29. package/core/ranking/file-kind-ranking.js +264 -0
  30. package/core/ranking/late-interaction-index.js +10 -4
  31. package/core/ranking/late-interaction-policy.js +304 -0
  32. package/core/search/context-expander.js +267 -28
  33. package/core/search/index.js +4 -0
  34. package/core/search/search-cli.js +3 -1
  35. package/core/search/search-pattern.js +4 -3
  36. package/core/search/search-postprocess.js +189 -8
  37. package/core/search/search-read-semantic.js +734 -0
  38. package/core/search/search-read.js +481 -0
  39. package/core/search/search-server.js +153 -5
  40. package/core/search/sweet-search.js +133 -16
  41. package/core/start-server.js +13 -2
  42. package/mcp/server.js +41 -0
  43. package/mcp/tool-handlers.js +117 -6
  44. package/package.json +9 -7
  45. package/scripts/init.js +386 -5
  46. package/scripts/uninstall.js +152 -6
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Shared indexing file policy.
3
+ *
4
+ * Embedding discovery, BM25/sparse artifacts, and late-interaction indexing
5
+ * must agree on the same project include/exclude policy. This module provides
6
+ * the chunk-level checks that happen after discovery has already applied the
7
+ * shared include/exclude globs, max file size, and gitignore alignment.
8
+ */
9
+
10
+ import { existsSync, readFileSync } from 'fs';
11
+ import { minimatch } from 'minimatch';
12
+ import { loadProjectConfig } from '../infrastructure/config/index.js';
13
+
/**
 * Options handed to every `minimatch` call in this module.
 * `dot: true` lets globs match dot-prefixed path segments; `nocase: false`
 * keeps matching case-sensitive.
 */
const MM_OPTS = { dot: true, nocase: false };

/**
 * Rewrite backslash path separators as forward slashes so the path can be
 * matched against glob patterns, which are written with `/`.
 *
 * @param {string} p - File path, possibly using `\` separators.
 * @returns {string} The same path with every `\` replaced by `/`.
 */
function normalizePath(p) {
  // Literal replacement: no regex needed.
  return p.replaceAll('\\', '/');
}
19
+
/**
 * Memo of resolved exclude globs, one entry per project root, so the project
 * config is not reloaded for every path check.
 */
const _excludesByRoot = new Map();

/**
 * Resolve the `exclude` glob list for a project root, loading the project
 * config on first use and serving the memoized list afterwards.
 *
 * A falsy `projectRoot` shares the single `'__cwd__'` cache slot and loads the
 * config from `process.cwd()` at the time of the first call.
 *
 * @param {string} [projectRoot] - Project root directory.
 * @returns {Array} The config's `exclude` array, or `[]` when it is not an array.
 */
function getExcludes(projectRoot) {
  const cacheKey = projectRoot || '__cwd__';

  const memoized = _excludesByRoot.get(cacheKey);
  if (memoized) {
    return memoized;
  }

  const config = loadProjectConfig(projectRoot || process.cwd());
  const globs = Array.isArray(config.exclude) ? config.exclude : [];
  _excludesByRoot.set(cacheKey, globs);
  return globs;
}
31
+
/**
 * Forget both module-level memos (extra skip patterns and per-root exclude
 * lists) so the next lookup reloads them. Reachable from outside the module
 * through `_internals.resetCache`.
 */
function resetCache() {
  _cachedExtraPatterns = null;
  _excludesByRoot.clear();
}
36
+
/**
 * Substrings treated as evidence that a file is machine-generated.
 * `chunkLooksGenerated` searches for them in the leading characters of a
 * chunk's text.
 */
const GENERATED_MARKERS = [
  '@generated',
  'Code generated by',
  'DO NOT EDIT',
  'AUTO-GENERATED FILE',
  'AUTOGENERATED FILE',
  'This file is automatically generated',
];

/** Memoized extra skip globs; `null` means "not loaded yet". */
let _cachedExtraPatterns = null;

/**
 * Load extra skip globs from the file named by the
 * `SWEET_SEARCH_LI_SKIP_FILE` environment variable.
 *
 * The file holds one glob per line. Lines are trimmed; blank lines and lines
 * starting with `#` are dropped. The result is memoized for the lifetime of
 * the module (until `resetCache()` clears it), so the file is read at most once.
 *
 * @returns {string[]} The extra glob patterns, or `[]` when the variable is
 *   unset or the file cannot be read.
 */
function loadExtraPatternsFromFile() {
  if (_cachedExtraPatterns !== null) return _cachedExtraPatterns;

  const path = process.env.SWEET_SEARCH_LI_SKIP_FILE;
  if (!path) {
    _cachedExtraPatterns = [];
    return _cachedExtraPatterns;
  }

  try {
    // Read directly rather than probing with existsSync first: the probe is
    // redundant (a missing file lands in the catch below) and leaves a window
    // in which the file can disappear between the check and the read.
    _cachedExtraPatterns = readFileSync(path, 'utf8')
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line !== '' && !line.startsWith('#'));
  } catch {
    // Deliberate best effort: a missing or unreadable pattern file means
    // "no extra patterns", not an indexing failure.
    _cachedExtraPatterns = [];
  }
  return _cachedExtraPatterns;
}
64
+
/**
 * Decide whether a file path is excluded from indexing.
 *
 * The path is normalized to forward slashes and tested first against the
 * project's configured `exclude` globs (non-string entries are ignored), then
 * against the extra globs loaded from `SWEET_SEARCH_LI_SKIP_FILE`.
 *
 * @param {string} filePath - Path to test; a falsy value is never excluded.
 * @param {string} [projectRoot] - Project root used to look up the config.
 * @returns {boolean} `true` when any glob matches the path.
 */
export function isExcludedByConfig(filePath, projectRoot) {
  if (!filePath) return false;

  const candidate = normalizePath(filePath);
  const matchesGlob = (glob) => minimatch(candidate, glob, MM_OPTS);

  const excludedByProject = getExcludes(projectRoot).some(
    (glob) => typeof glob === 'string' && matchesGlob(glob),
  );
  if (excludedByProject) return true;

  // The extra pattern file is only consulted when the project globs did not match.
  return loadExtraPatternsFromFile().some((glob) => matchesGlob(glob));
}
78
+
/**
 * Report whether a chunk's text carries a generated-file marker.
 * Only the first 500 characters are inspected.
 *
 * @param {string} text - Chunk text; a falsy value is never "generated".
 * @returns {boolean} `true` when any `GENERATED_MARKERS` entry occurs in the
 *   inspected prefix.
 */
export function chunkLooksGenerated(text) {
  if (!text) return false;
  const prefix = text.slice(0, 500);
  return GENERATED_MARKERS.some((marker) => prefix.includes(marker));
}
87
+
/**
 * Count the distinct, truthy `file` values in a chunk list.
 * Null/undefined entries are tolerated and contribute nothing.
 */
function countDistinctFiles(list) {
  return new Set(list.map((c) => c?.file).filter(Boolean)).size;
}

/**
 * Split a chunk list into chunks to keep and chunks to skip, deciding per file.
 *
 * Each file is classified once, from the first chunk seen for it:
 * `'excluded'` when its path matches the exclude globs, otherwise
 * `'generated'` when that chunk's text carries a generated-file marker.
 * Every chunk of a classified file is skipped; chunks without a `file` are
 * always kept.
 *
 * The policy is bypassed (everything kept, nothing skipped) when
 * `SWEET_SEARCH_LI_SKIP_DISABLE` is `'1'` or `chunks` is not an array.
 *
 * @param {Array<{file: string, text?: string, content?: string}>} chunks
 * @param {{ projectRoot?: string }} [options]
 * @returns {{ kept: Array, skipped: Array, stats: object }} `stats` counts
 *   skipped chunks per reason plus distinct kept/skipped files.
 */
export function applyIndexingChunkPolicy(chunks, options = {}) {
  if (process.env.SWEET_SEARCH_LI_SKIP_DISABLE === '1' || !Array.isArray(chunks)) {
    const bypassStats = emptyStats();
    // Keep the stats consistent with `kept`: when every chunk passes through,
    // the kept-file count must reflect that instead of staying at zero.
    if (Array.isArray(chunks)) bypassStats.keptFiles = countDistinctFiles(chunks);
    return { kept: chunks || [], skipped: [], stats: bypassStats };
  }
  const { projectRoot } = options;

  // Pass 1: one verdict per file, taken from the first chunk seen for it.
  const reasonByFile = new Map();
  for (const chunk of chunks) {
    const file = chunk?.file;
    if (!file || reasonByFile.has(file)) continue;

    let reason = null;
    if (isExcludedByConfig(file, projectRoot)) {
      reason = 'excluded';
    } else if (chunkLooksGenerated(chunk.text || chunk.content || '')) {
      reason = 'generated';
    }
    reasonByFile.set(file, reason);
  }

  // Pass 2: route every chunk by its file's verdict.
  const kept = [];
  const skipped = [];
  const stats = emptyStats();
  for (const chunk of chunks) {
    const reason = chunk?.file ? reasonByFile.get(chunk.file) : null;
    if (reason) {
      skipped.push(chunk);
      stats[reason] += 1;
      stats.totalSkipped += 1;
    } else {
      kept.push(chunk);
    }
  }

  // Null-safe counting: a null chunk is accepted by the loops above and lands
  // in `kept`, so the file tally must not dereference it blindly.
  stats.skippedFiles = countDistinctFiles(skipped);
  stats.keptFiles = countDistinctFiles(kept);
  return { kept, skipped, stats };
}

/**
 * Build a zeroed stats record: per-reason skipped-chunk counters plus
 * distinct-file totals.
 */
function emptyStats() {
  return {
    excluded: 0,
    generated: 0,
    totalSkipped: 0,
    skippedFiles: 0,
    keptFiles: 0,
  };
}
136
+
/**
 * Internal handles exposed alongside the public API: the generated-file
 * marker list and the hook that clears this module's caches.
 */
export const _internals = { GENERATED_MARKERS, resetCache };
@@ -1,225 +1,16 @@
1
1
  /**
2
- * Late Interaction skip policy.
2
+ * Late Interaction skip policy compatibility exports.
3
3
  *
4
- * The embedding indexer and the LI reranker share a SINGLE unified skip list —
5
- * the `exclude` globs loaded by `loadProjectConfig()` from
6
- * `core/infrastructure/config/search.js`. That list is the authoritative
7
- * source of truth for vendored / build-output / lock / minified / secret /
8
- * binary noise, AND it includes any user extensions from
9
- * `.sweet-search.config.json`. The embed indexer at
10
- * `core/indexing/indexer-utils.js` passes the same list to fast-glob; we
11
- * match it here with `minimatch`.
12
- *
13
- * This module adds TWO things on top of the unified glob check that globs
14
- * alone cannot express:
15
- *
16
- * 1. `chunkLooksGenerated` — content-based detection of auto-generated files
17
- * (starting with `// @generated` / `Code generated by` / `DO NOT EDIT`).
18
- * 2. `applyLiSkipPolicy` per-file token budget — a resource guard for the
19
- * slow LI encoder so one huge JSON/grammar file can't blow the budget.
20
- * This is a latency decision, not a semantic one.
21
- *
22
- * Configurable via:
23
- * - SWEET_SEARCH_LI_SKIP_DISABLE=1 → disable all skip policy (LI everything)
24
- * - SWEET_SEARCH_LI_SKIP_FILE=<path> → extra GLOB patterns, one per line
25
- * (# comments allowed; blank lines ignored)
26
- * - SWEET_SEARCH_LI_MAX_FILE_TOKENS=N → per-file token cap (default: 50_000)
27
- */
28
-
29
- import { existsSync, readFileSync } from 'fs';
30
- import { minimatch } from 'minimatch';
31
- import { loadProjectConfig } from '../infrastructure/config/index.js';
32
-
33
- const MM_OPTS = { dot: true, nocase: false };
34
-
35
- function normalizePath(p) {
36
- return p.replace(/\\/g, '/');
37
- }
38
-
39
- /**
40
- * Cache the resolved exclude list keyed by projectRoot so we don't reload
41
- * `.sweet-search.config.json` on every chunk. Keyed by projectRoot string;
42
- * `null`/`undefined` collapse to a default `cwd` key.
43
- */
44
- const _excludesByRoot = new Map();
45
-
46
- function getExcludes(projectRoot) {
47
- const key = projectRoot || '__cwd__';
48
- let cached = _excludesByRoot.get(key);
49
- if (cached) return cached;
50
- const config = loadProjectConfig(projectRoot || process.cwd());
51
- cached = Array.isArray(config.exclude) ? config.exclude : [];
52
- _excludesByRoot.set(key, cached);
53
- return cached;
54
- }
55
-
56
- /**
57
- * Test-only: clear the excludes cache. Not exported through the public
58
- * surface; reachable via `_internals.resetCache()`.
59
- */
60
- function resetCache() {
61
- _excludesByRoot.clear();
62
- _cachedExtraPatterns = null;
63
- }
64
-
65
- /**
66
- * Markers that indicate a file is auto-generated. We only check the FIRST
67
- * ~500 characters of a chunk's text since real generated headers always
68
- * appear at the top.
69
- */
70
- const GENERATED_MARKERS = [
71
- '@generated',
72
- 'Code generated by',
73
- 'DO NOT EDIT',
74
- 'AUTO-GENERATED FILE',
75
- 'AUTOGENERATED FILE',
76
- 'This file is automatically generated',
77
- ];
78
-
79
- const DEFAULT_MAX_FILE_TOKENS = 50_000;
80
-
81
- let _cachedExtraPatterns = null;
82
- function loadExtraPatternsFromFile() {
83
- if (_cachedExtraPatterns !== null) return _cachedExtraPatterns;
84
- const path = process.env.SWEET_SEARCH_LI_SKIP_FILE;
85
- if (!path || !existsSync(path)) {
86
- _cachedExtraPatterns = [];
87
- return _cachedExtraPatterns;
88
- }
89
- try {
90
- const lines = readFileSync(path, 'utf8').split(/\r?\n/);
91
- _cachedExtraPatterns = lines
92
- .map((l) => l.trim())
93
- .filter((l) => l && !l.startsWith('#'));
94
- } catch {
95
- _cachedExtraPatterns = [];
96
- }
97
- return _cachedExtraPatterns;
98
- }
99
-
100
- /**
101
- * Check if a file path matches any exclude glob from the unified
102
- * `loadProjectConfig(projectRoot).exclude` list, plus any extra globs loaded
103
- * from `SWEET_SEARCH_LI_SKIP_FILE`.
104
- *
105
- * @param {string} filePath
106
- * @param {string} [projectRoot]
107
- */
108
- export function isExcludedByConfig(filePath, projectRoot) {
109
- if (!filePath) return false;
110
- const p = normalizePath(filePath);
111
- const excludes = getExcludes(projectRoot);
112
- for (const g of excludes) {
113
- if (typeof g === 'string' && minimatch(p, g, MM_OPTS)) return true;
114
- }
115
- const extras = loadExtraPatternsFromFile();
116
- for (const g of extras) {
117
- if (minimatch(p, g, MM_OPTS)) return true;
118
- }
119
- return false;
120
- }
121
-
122
- /**
123
- * Check if a chunk's text starts with a generated-file marker.
124
- * Only inspects the first ~500 chars (these markers always appear at the top).
125
- */
126
- export function chunkLooksGenerated(text) {
127
- if (!text) return false;
128
- const head = text.slice(0, 500);
129
- for (const marker of GENERATED_MARKERS) {
130
- if (head.includes(marker)) return true;
131
- }
132
- return false;
133
- }
134
-
135
- /**
136
- * Filter a chunk list down to those eligible for late-interaction encoding.
137
- *
138
- * In the normal pipeline the embed indexer has already rejected most glob
139
- * matches at file-discovery time, so the glob check here is defense-in-depth:
140
- * it catches chunks produced by alternate code paths or config drift. The
141
- * content-based `chunkLooksGenerated` and the per-file token cap are the
142
- * checks that actually do new work.
143
- *
144
- * @param {Array<{file: string, text?: string, content?: string}>} chunks
145
- * @param {{ projectRoot?: string }} [options]
146
- * @returns {{ kept: Array, skipped: Array, stats: object }}
4
+ * The shared implementation lives in `indexing-file-policy.js` so embedding,
5
+ * sparse/BM25 artifacts, and LI all depend on one file-policy source.
147
6
  */
148
- export function applyLiSkipPolicy(chunks, options = {}) {
149
- if (process.env.SWEET_SEARCH_LI_SKIP_DISABLE === '1' || !Array.isArray(chunks)) {
150
- return { kept: chunks || [], skipped: [], stats: emptyStats() };
151
- }
152
- const { projectRoot } = options;
153
-
154
- const maxFileTokens = parseInt(process.env.SWEET_SEARCH_LI_MAX_FILE_TOKENS || '', 10) > 0
155
- ? parseInt(process.env.SWEET_SEARCH_LI_MAX_FILE_TOKENS, 10)
156
- : DEFAULT_MAX_FILE_TOKENS;
157
-
158
- // First pass: per-file token totals + per-chunk reasons.
159
- const fileTokenTotal = new Map();
160
- const fileFirstReason = new Map(); // file → reason ('excluded', 'generated', 'huge', null)
161
-
162
- for (const chunk of chunks) {
163
- if (!chunk?.file) continue;
164
- if (fileFirstReason.has(chunk.file)) continue; // already classified
165
-
166
- let reason = null;
167
- if (isExcludedByConfig(chunk.file, projectRoot)) {
168
- reason = 'excluded';
169
- } else {
170
- const text = chunk.text || chunk.content || '';
171
- if (chunkLooksGenerated(text)) reason = 'generated';
172
- }
173
- fileFirstReason.set(chunk.file, reason);
174
- }
175
-
176
- // Per-file token totals (only for files not already classified for skip)
177
- for (const chunk of chunks) {
178
- if (!chunk?.file) continue;
179
- if (fileFirstReason.get(chunk.file)) continue; // already skipped
180
- const text = chunk.text || chunk.content || '';
181
- // Cheap char/4 estimate; exact tokenization isn't needed here.
182
- const est = Math.ceil(text.length / 4);
183
- fileTokenTotal.set(chunk.file, (fileTokenTotal.get(chunk.file) || 0) + est);
184
- }
185
- for (const [file, total] of fileTokenTotal) {
186
- if (total > maxFileTokens) fileFirstReason.set(file, 'huge');
187
- }
188
-
189
- // Second pass: split chunks into kept/skipped buckets using the file decisions.
190
- const kept = [];
191
- const skipped = [];
192
- const stats = emptyStats();
193
- for (const chunk of chunks) {
194
- const reason = chunk?.file ? fileFirstReason.get(chunk.file) : null;
195
- if (reason) {
196
- skipped.push(chunk);
197
- stats[reason]++;
198
- stats.totalSkipped++;
199
- } else {
200
- kept.push(chunk);
201
- }
202
- }
203
- // Distinct file counts (helpful for the summary line)
204
- stats.skippedFiles = new Set(skipped.map((c) => c.file).filter(Boolean)).size;
205
- stats.keptFiles = new Set(kept.map((c) => c.file).filter(Boolean)).size;
206
- return { kept, skipped, stats };
207
- }
208
7
 
209
- function emptyStats() {
210
- return {
211
- excluded: 0,
212
- generated: 0,
213
- huge: 0,
214
- totalSkipped: 0,
215
- skippedFiles: 0,
216
- keptFiles: 0,
217
- };
218
- }
8
+ export {
9
+ isExcludedByConfig,
10
+ chunkLooksGenerated,
11
+ _internals,
12
+ } from './indexing-file-policy.js';
219
13
 
220
- // Exported for tests
221
- export const _internals = {
222
- GENERATED_MARKERS,
223
- DEFAULT_MAX_FILE_TOKENS,
224
- resetCache,
225
- };
14
+ export {
15
+ applyIndexingChunkPolicy as applyLiSkipPolicy,
16
+ } from './indexing-file-policy.js';
@@ -78,6 +78,27 @@ export class CodebaseRepository {
78
78
  }
79
79
  }
80
80
 
81
+ /**
82
+ * Return all chunk metadata rows for a single file_path.
83
+ * Used by sweet-search read / read-semantic for symbol-aware metadata
84
+ * and for in-file candidate enumeration. Returns empty array if the file
85
+ * is not indexed, the DB is missing, or the table doesn't exist yet.
86
+ *
87
+ * @param {string} filePath - Project-relative file path as stored in vectors.file_path
88
+ * @returns {Array<{id, file_path, text, metadata}>}
89
+ */
90
+ getChunksByFilePath(filePath) {
91
+ if (!filePath) return [];
92
+ try {
93
+ const db = this._open();
94
+ return db.prepare(
95
+ 'SELECT id, file_path, text, metadata FROM vectors WHERE file_path = ? ORDER BY id'
96
+ ).all(filePath);
97
+ } catch {
98
+ return [];
99
+ }
100
+ }
101
+
81
102
  /**
82
103
  * Full vector scan in an ephemeral connection (no persistent state).
83
104
  * Used by the O(N) fallback path — opens, scans, closes immediately.
@@ -245,8 +245,27 @@ export const EMBEDDING_CONFIG = {
245
245
  enabled: true,
246
246
  maxSize: 1000,
247
247
  vocabularyPath: DB_PATHS.vocabulary,
248
- autoExpand: process.env.SWEET_SEARCH_VOCAB_AUTO_EXPAND !== '0',
248
+ // Whether `getEmbedding` consults the persistent query-vocabulary
249
+ // cache before calling the live model. Disable to force fresh
250
+ // model output on every query — required for reproducible
251
+ // benchmarks against a populated vocab file. Reads only; writes
252
+ // are gated separately by `autoExpand` below.
253
+ useVocabulary: process.env.SWEET_SEARCH_VOCAB_USE !== '0'
254
+ && process.env.SWEET_SEARCH_VOCAB_USE !== 'false',
255
+ // Whether queries that fire ≥ `expansionThreshold` times within a
256
+ // process are auto-promoted into the persistent vocabulary file.
257
+ autoExpand: process.env.SWEET_SEARCH_VOCAB_AUTO_EXPAND !== '0'
258
+ && process.env.SWEET_SEARCH_VOCAB_AUTO_EXPAND !== 'false',
249
259
  expansionThreshold: 3,
260
+ // Hard cap on auto-expanded vocabulary size. Once reached, new
261
+ // auto-promotions are skipped; explicit `addToVocabulary` /
262
+ // `expandVocabulary` calls still write through. Override with
263
+ // `SWEET_SEARCH_VOCAB_MAX_TERMS` (range 1..1e6).
264
+ maxTerms: (() => {
265
+ const v = parseInt(process.env.SWEET_SEARCH_VOCAB_MAX_TERMS || '', 10);
266
+ if (Number.isFinite(v) && v > 0 && v <= 1_000_000) return v;
267
+ return 10_000;
268
+ })(),
250
269
  },
251
270
 
252
271
  // All available providers for fallback
@@ -10,7 +10,7 @@
10
10
 
11
11
  export const HCGS_CONFIG = {
12
12
  // Summary generation
13
- enabled: true,
13
+ enabled: false,
14
14
 
15
15
  // Hierarchy levels (bottom-up order)
16
16
  levels: ['function', 'method', 'field', 'class', 'interface', 'enum', 'package', 'file'],
@@ -54,7 +54,7 @@ export const HCGS_CONFIG = {
54
54
  cacheEnabled: true,
55
55
 
56
56
  // Token savings: return summary first, full code on "expand"
57
- returnSummaryFirst: true,
57
+ returnSummaryFirst: false,
58
58
  summaryTokenBudget: 150, // tokens per result in summary mode
59
59
  fullCodeTokenBudget: 1000, // tokens per result in expanded mode
60
60
  };
@@ -179,6 +179,14 @@ export const LATE_INTERACTION_CONFIG = {
179
179
  backboneDim: 768, // raw ModernBERT hidden size
180
180
  tokenDimension: 128, // final output after projection
181
181
  projectionPaths: ['1_Dense/model.safetensors'], // 768→128 single stage
182
+ // Per-stage `out_features`. Length must equal projectionPaths.length.
183
+ // Consumed by the native Rust LI loader to validate safetensors shapes.
184
+ projectionDims: [128],
185
+ // Registry key for the FP32-safetensors variant of this model used by
186
+ // the native (candle / Metal / CUDA) inference path. The ORT-side INT8
187
+ // path uses the parent key (`lateon-code`) directly. See
188
+ // `core/infrastructure/native-inference.js::resolveNativeLiVariant`.
189
+ nativeRegistryKey: 'lateon-code-fp32',
182
190
  maxQueryLength: 256,
183
191
  get maxDocLength() {
184
192
  const env = parseInt(process.env.SWEET_SEARCH_LI_MAX_DOC_LENGTH || '', 10);
@@ -193,6 +201,8 @@ export const LATE_INTERACTION_CONFIG = {
193
201
  backboneDim: 256, // raw ModernBERT hidden size
194
202
  tokenDimension: 48, // final output after 2-stage projection
195
203
  projectionPaths: ['1_Dense/model.safetensors', '2_Dense/model.safetensors'], // 256→512→48
204
+ projectionDims: [512, 48],
205
+ nativeRegistryKey: 'lateon-code-edge-fp32',
196
206
  maxQueryLength: 256,
197
207
  get maxDocLength() {
198
208
  const env = parseInt(process.env.SWEET_SEARCH_LI_MAX_DOC_LENGTH || '', 10);
@@ -64,7 +64,7 @@ export const BINARY_HNSW_CONFIG = {
64
64
  stage1Candidates: 1000, // Binary HNSW retrieves top 1000
65
65
  stage2Candidates: 200, // Int8 rescores top 200 (legacy fixed, used as maxStage2 fallback)
66
66
  stage2_5Candidates: 200, // Float rescore pool size (legacy fixed, used as maxStage2_5 fallback)
67
- stage3Candidates: 20, // Reranker sees top 20
67
+ stage3Candidates: 30, // Reranker sees top 30 (validated 2026-05-03 on GenCodeSearchNet n=6000: +1.52pp R@10, +0.34pp MRR vs 20, no per-language regression)
68
68
 
69
69
  // Phase 1 flag: batched normalized-dot Stage 2 scoring.
70
70
  // When false, falls back to per-candidate int8CosineSimilarity.