sweet-search 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -1,32 +1,15 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* sweet-search read — filesystem-grounded file reader.
|
|
3
|
-
*
|
|
4
|
-
*
|
|
5
|
-
* metadata for indexed files, but the returned `text` always comes from
|
|
6
|
-
* `node:fs`, never from the (truncated) DB column.
|
|
7
|
-
*
|
|
8
|
-
* Design notes:
|
|
9
|
-
* - Filesystem is ground truth. Never return DB-stored text as content.
|
|
10
|
-
* - Batch up to 20 files; per-file errors do not fail the batch.
|
|
11
|
-
* - Warm-process cache keyed by `path|size|mtimeMs` avoids re-reading hot
|
|
12
|
-
* files; line-offset table lets line-range reads avoid materialising the
|
|
13
|
-
* whole content for large files.
|
|
14
|
-
*
|
|
15
|
-
* DDD: this module lives in the search/ application layer (allowed to import
|
|
16
|
-
* infrastructure for filesystem grounding and chunk metadata).
|
|
2
|
+
* sweet-search read — filesystem-grounded file reader. Returns exact bytes from
|
|
3
|
+
* disk; the vectors index may attach symbol/chunk metadata, but the returned
|
|
4
|
+
* `text` always comes from node:fs, never from the (truncated) DB column.
|
|
17
5
|
*/
|
|
18
6
|
|
|
19
|
-
import { promises as fs, statSync } from 'node:fs';
|
|
7
|
+
import { promises as fs, realpathSync, statSync } from 'node:fs';
|
|
20
8
|
import path from 'node:path';
|
|
21
9
|
import { CodebaseRepository } from '../infrastructure/codebase-repository.js';
|
|
22
|
-
import { DB_PATHS } from '../infrastructure/config/index.js';
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
// Cache — keyed by absolutePath|size|mtimeMs (any change invalidates).
|
|
26
|
-
// Bounded LRU. Entries hold either the full text + line-offset table, or just
|
|
27
|
-
// the line-offset table for very large files where we deliberately avoid
|
|
28
|
-
// caching the whole content.
|
|
29
|
-
// ---------------------------------------------------------------------------
|
|
10
|
+
import { DB_PATHS, PROJECT_ROOT } from '../infrastructure/config/index.js';
|
|
11
|
+
import { withPinnedRead } from './search-reader-pin.js';
|
|
12
|
+
import { emitToolIdentityAuto } from './cli-decoration.js';
|
|
30
13
|
|
|
31
14
|
const CACHE_MAX_ENTRIES = 64;
|
|
32
15
|
const CACHE_LARGE_FILE_BYTES = 4 * 1024 * 1024; // 4MB — switch to range-read mode
|
|
@@ -45,22 +28,22 @@ function _cacheTouch(key, value) {
|
|
|
45
28
|
}
|
|
46
29
|
}
|
|
47
30
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
if (_repo === null) {
|
|
55
|
-
try { _repo = new CodebaseRepository(DB_PATHS.codebase); }
|
|
56
|
-
catch { _repo = false; }
|
|
31
|
+
// Repository handles keyed by codebase DB path. A `false` entry records that
// opening the repository threw, so the constructor is not retried on each read.
const _repos = new Map();

/**
 * Return the repository for a project, opening it on first use.
 *
 * @param {string} [projectRoot] - project root used to locate the codebase DB
 * @returns {CodebaseRepository|null} the cached repository, or null when
 *   constructing it threw
 */
function _getRepo(projectRoot) {
  const dbPath = _codebasePathForProject(projectRoot);
  if (_repos.has(dbPath)) return _repos.get(dbPath) || null;

  let repo;
  try {
    repo = new CodebaseRepository(dbPath);
  } catch {
    repo = false;
  }
  _repos.set(dbPath, repo);
  return repo || null;
}
|
|
60
40
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
41
|
+
/**
 * Resolve the codebase DB path for a project root.
 *
 * The configured PROJECT_ROOT maps straight to `DB_PATHS.codebase`. Any other
 * root maps to `<root>/<state dir>/codebase.db`, where the state directory
 * name is the parent directory name of the configured DB path.
 *
 * @param {string} [projectRoot] - falls back to the current working directory
 * @returns {string} path of the codebase database for that project
 */
function _codebasePathForProject(projectRoot) {
  const requestedRoot = path.resolve(projectRoot || process.cwd());
  const configuredRoot = path.resolve(PROJECT_ROOT || process.cwd());
  if (requestedRoot === configuredRoot) {
    return DB_PATHS.codebase;
  }
  const configuredDbPath = DB_PATHS.codebase || '.sweet-search/codebase.db';
  const stateDirName = path.basename(path.dirname(configuredDbPath));
  return path.join(requestedRoot, stateDirName, 'codebase.db');
}
|
|
64
47
|
|
|
65
48
|
function _resolvePath(p, projectRoot) {
|
|
66
49
|
if (!p) throw new Error('path is required');
|
|
@@ -70,10 +53,24 @@ function _resolvePath(p, projectRoot) {
|
|
|
70
53
|
|
|
71
54
|
/**
 * Express an absolute path relative to the project root.
 *
 * The lexical relative path is tried first. If it is rejected by
 * _normalizeRelativePath, the computation is repeated on the real
 * (symlink-resolved) paths. When neither attempt yields an accepted path, or
 * resolving the real paths throws, the absolute path is returned unchanged.
 *
 * @param {string} absPath - absolute path of the file
 * @param {string} [projectRoot] - falls back to the current working directory
 * @returns {string} project-relative path, or `absPath` as a fallback
 */
function _projectRelative(absPath, projectRoot) {
  const root = projectRoot || process.cwd();
  const lexical = _normalizeRelativePath(path.relative(root, absPath));
  if (lexical) return lexical;

  let viaRealPaths = null;
  try {
    const realRoot = realpathSync.native(root);
    const realTarget = realpathSync.native(absPath);
    viaRealPaths = _normalizeRelativePath(path.relative(realRoot, realTarget));
  } catch {
    return absPath;
  }
  return viaRealPaths || absPath;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Normalise a relative path to forward slashes and reject anything that does
 * not point inside the base directory.
 *
 * Backslashes become `/` and a leading `./` is dropped. The result is rejected
 * (null) when it is empty, when it is the parent directory itself (`..`),
 * when it starts with `../`, or when it is absolute.
 *
 * @param {string} rel - relative path, e.g. the output of path.relative()
 * @returns {string|null} the normalised path, or null when it is rejected
 */
function _normalizeRelativePath(rel) {
  const normalized = rel.replace(/\\/g, '/').replace(/^\.\//, '');
  if (!normalized) return null;
  // A bare `..` escapes the base just like `../x` does; the prefix check
  // alone would let it through.
  if (normalized === '..' || normalized.startsWith('../')) return null;
  if (path.isAbsolute(normalized)) return null;
  return normalized;
}
|
|
78
75
|
|
|
79
76
|
// ---------------------------------------------------------------------------
|
|
@@ -123,7 +120,6 @@ async function _readFromDisk(absPath) {
|
|
|
123
120
|
const isLarge = stat.size > CACHE_LARGE_FILE_BYTES;
|
|
124
121
|
const entry = {
|
|
125
122
|
text: isLarge ? null : buf.toString('utf8'),
|
|
126
|
-
bufferRef: isLarge ? null : null, // not held — text is the canonical form
|
|
127
123
|
lineOffsets,
|
|
128
124
|
size: stat.size,
|
|
129
125
|
mtimeMs: stat.mtimeMs,
|
|
@@ -217,8 +213,8 @@ function _metaEndLine(meta) {
|
|
|
217
213
|
: null;
|
|
218
214
|
}
|
|
219
215
|
|
|
220
|
-
function _attachIndexMetadata(filePathRel) {
|
|
221
|
-
const repo = _getRepo();
|
|
216
|
+
function _attachIndexMetadata(filePathRel, projectRoot) {
|
|
217
|
+
const repo = _getRepo(projectRoot);
|
|
222
218
|
if (!repo) return { indexed: false, chunks: [], language: null };
|
|
223
219
|
|
|
224
220
|
const rows = repo.getChunksByFilePath(filePathRel);
|
|
@@ -258,7 +254,7 @@ function _attachIndexMetadata(filePathRel) {
|
|
|
258
254
|
* @param {boolean} [req.includeMetadata=true] - attach index chunks/language
|
|
259
255
|
* @returns {Promise<Object>}
|
|
260
256
|
*/
|
|
261
|
-
|
|
257
|
+
async function _readFileUnpinned(req) {
|
|
262
258
|
const t0 = performance.now();
|
|
263
259
|
const projectRoot = req.projectRoot || process.cwd();
|
|
264
260
|
const absPath = _resolvePath(req.path, projectRoot);
|
|
@@ -291,7 +287,7 @@ export async function readFile(req) {
|
|
|
291
287
|
let chunks = [];
|
|
292
288
|
let indexed = false;
|
|
293
289
|
if (req.includeMetadata !== false) {
|
|
294
|
-
const meta = _attachIndexMetadata(relForIndex);
|
|
290
|
+
const meta = _attachIndexMetadata(relForIndex, projectRoot);
|
|
295
291
|
indexed = meta.indexed;
|
|
296
292
|
chunks = meta.chunks;
|
|
297
293
|
language = meta.language;
|
|
@@ -323,6 +319,14 @@ export async function readFile(req) {
|
|
|
323
319
|
};
|
|
324
320
|
}
|
|
325
321
|
|
|
322
|
+
/**
 * Read a single file through withPinnedRead.
 *
 * The project root is resolved once (request value, else the current working
 * directory) and used both for the pin options and for the underlying
 * _readFileUnpinned call.
 *
 * @param {Object} req - read request; `req.path` and `req.projectRoot` are read here
 * @returns {Promise<Object>} whatever _readFileUnpinned resolves to
 */
export async function readFile(req) {
  const projectRoot = req?.projectRoot || process.cwd();
  const pinOptions = {
    projectRoot,
    meta: { tool: 'read', path: req?.path ?? null, count: 1 },
  };
  const runRead = () => _readFileUnpinned({ ...req, projectRoot });
  return withPinnedRead(pinOptions, runRead);
}
|
|
329
|
+
|
|
326
330
|
/**
|
|
327
331
|
* Batch read — up to 20 files in parallel. Per-file failures are returned
|
|
328
332
|
* inline; the batch never throws unless `files` is malformed.
|
|
@@ -340,15 +344,18 @@ export async function readFiles(files, opts = {}) {
|
|
|
340
344
|
if (files.length > 20) {
|
|
341
345
|
throw new Error(`read accepts at most 20 files; got ${files.length}`);
|
|
342
346
|
}
|
|
343
|
-
const
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
347
|
+
const projectRoot = opts.projectRoot || process.cwd();
|
|
348
|
+
return withPinnedRead({ projectRoot, meta: { tool: 'read', count: files.length } }, async () => {
|
|
349
|
+
const t0 = performance.now();
|
|
350
|
+
const results = await Promise.all(files.map(f => _readFileUnpinned({
|
|
351
|
+
path: f.path,
|
|
352
|
+
startLine: f.startLine,
|
|
353
|
+
endLine: f.endLine,
|
|
354
|
+
projectRoot,
|
|
355
|
+
includeMetadata: opts.includeMetadata !== false,
|
|
356
|
+
})));
|
|
357
|
+
return { files: results, totalMs: +(performance.now() - t0).toFixed(2) };
|
|
358
|
+
});
|
|
352
359
|
}
|
|
353
360
|
|
|
354
361
|
// ---------------------------------------------------------------------------
|
|
@@ -385,12 +392,6 @@ export function formatReadResults(results, format = 'agent') {
|
|
|
385
392
|
|
|
386
393
|
// ---------------------------------------------------------------------------
|
|
387
394
|
// CLI handler
|
|
388
|
-
// Usage:
|
|
389
|
-
// sweet-search read path/to/file.ts
|
|
390
|
-
// sweet-search read path/to/file.ts --lines 45-92
|
|
391
|
-
// sweet-search read a.ts b.ts c.ts
|
|
392
|
-
// sweet-search read path/to/file.ts --json
|
|
393
|
-
// sweet-search read path/to/file.ts --raw
|
|
394
395
|
// ---------------------------------------------------------------------------
|
|
395
396
|
|
|
396
397
|
function _parseLineRange(spec) {
|
|
@@ -409,13 +410,21 @@ function _parseArgs(args) {
|
|
|
409
410
|
let startLine = null;
|
|
410
411
|
let endLine = null;
|
|
411
412
|
let includeMetadata = true;
|
|
413
|
+
let plain = false;
|
|
414
|
+
let noBanner = false;
|
|
412
415
|
for (let i = 0; i < args.length; i++) {
|
|
413
416
|
const a = args[i];
|
|
414
417
|
if (a === '--json') format = 'json';
|
|
415
418
|
else if (a === '--raw') format = 'raw';
|
|
416
419
|
else if (a === '--agent') format = 'agent';
|
|
417
420
|
else if (a === '--no-metadata') includeMetadata = false;
|
|
418
|
-
else if (a === '--
|
|
421
|
+
else if (a === '--no-banner') noBanner = true;
|
|
422
|
+
else if (a === '--format' || a.startsWith('--format=')) {
|
|
423
|
+
const v = a === '--format' ? args[++i] : a.slice('--format='.length);
|
|
424
|
+
if (v === 'json' || v === 'raw' || v === 'agent') format = v;
|
|
425
|
+
else if (v === 'plain') plain = true;
|
|
426
|
+
else throw new Error(`unknown --format value: ${v}`);
|
|
427
|
+
} else if (a === '--lines') {
|
|
419
428
|
const [s, e] = _parseLineRange(args[++i]);
|
|
420
429
|
startLine = s; endLine = e;
|
|
421
430
|
} else if (a === '--help' || a === '-h') {
|
|
@@ -427,7 +436,7 @@ function _parseArgs(args) {
|
|
|
427
436
|
positional.push(a);
|
|
428
437
|
}
|
|
429
438
|
}
|
|
430
|
-
return { positional, format, startLine, endLine, includeMetadata };
|
|
439
|
+
return { positional, format, startLine, endLine, includeMetadata, plain, noBanner };
|
|
431
440
|
}
|
|
432
441
|
|
|
433
442
|
function _printHelp() {
|
|
@@ -443,6 +452,8 @@ function _printHelp() {
|
|
|
443
452
|
' --json Emit JSON (machine-readable)',
|
|
444
453
|
' --raw Emit raw text only (no fences/headers)',
|
|
445
454
|
' --agent Default — markdown fenced block + symbol hints',
|
|
455
|
+
' --format <fmt> json | raw | agent | plain (plain = no identity line)',
|
|
456
|
+
' --no-banner Suppress the identity line',
|
|
446
457
|
' --no-metadata Skip index metadata attachment',
|
|
447
458
|
'',
|
|
448
459
|
].join('\n'));
|
|
@@ -467,6 +478,10 @@ export async function handleReadCli(args) {
|
|
|
467
478
|
endLine: wantsRange ? parsed.endLine : undefined,
|
|
468
479
|
}));
|
|
469
480
|
const out = await readFiles(files, { includeMetadata: parsed.includeMetadata });
|
|
481
|
+
if (parsed.format !== 'json') {
|
|
482
|
+
const detail = files.length === 1 ? files[0].path : `${files.length} files`;
|
|
483
|
+
emitToolIdentityAuto('read', detail, { plain: parsed.plain, noBanner: parsed.noBanner });
|
|
484
|
+
}
|
|
470
485
|
process.stdout.write(formatReadResults(out, parsed.format));
|
|
471
486
|
if (parsed.format !== 'json') process.stdout.write('\n');
|
|
472
487
|
// Non-zero exit if every file failed (so shell pipelines see the error).
|
|
@@ -477,5 +492,8 @@ export async function handleReadCli(args) {
|
|
|
477
492
|
// Test-only export — clears caches between unit tests.
|
|
478
493
|
export function __resetReadCachesForTests() {
  _cache.clear();
  // Close every cached repository that exposes close(); `false` placeholders
  // (failed opens) have no close method and are skipped by the optional call.
  _repos.forEach((repo) => {
    repo?.close?.();
  });
  _repos.clear();
}
|
|
498
|
+
|
|
499
|
+
export const __testing = { projectRelative: _projectRelative, codebasePathForProject: _codebasePathForProject };
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { DB_PATHS, PROJECT_ROOT } from '../infrastructure/config/index.js';
|
|
3
|
+
import { readManifest } from '../incremental-indexing/infrastructure/manifest.mjs';
|
|
4
|
+
import { beginRead, endRead } from '../incremental-indexing/infrastructure/reader-heartbeat.mjs';
|
|
5
|
+
|
|
6
|
+
/**
 * Name of the directory that holds the codebase DB, taken from the parent
 * directory of `DB_PATHS.codebase`.
 *
 * @returns {string} the directory name, or '.sweet-search' when the configured
 *   path yields an empty name or '.'
 */
function dataDirName() {
  const configuredDir = path.dirname(DB_PATHS.codebase || '');
  const name = path.basename(configuredDir);
  if (!name || name === '.') return '.sweet-search';
  return name;
}
|
|
10
|
+
|
|
11
|
+
/**
 * Resolve the state directory for a project root.
 *
 * @param {string} [projectRoot] - defaults to the current working directory
 * @returns {string} the directory of `DB_PATHS.codebase` when the root is the
 *   configured PROJECT_ROOT, otherwise `<root>/<dataDirName()>`
 */
export function searchStateDir(projectRoot = process.cwd()) {
  const requestedRoot = path.resolve(projectRoot || process.cwd());
  const configuredRoot = path.resolve(PROJECT_ROOT || process.cwd());
  return requestedRoot === configuredRoot
    ? path.dirname(DB_PATHS.codebase)
    : path.join(requestedRoot, dataDirName());
}
|
|
18
|
+
|
|
19
|
+
// Negative cache for stateDirs known to have no reconcile-manifest.json.
|
|
20
|
+
// 1s TTL bounds staleness if reconcile starts publishing after first probe.
|
|
21
|
+
// Cleared per-stateDir whenever a manifest is observed.
|
|
22
|
+
const _manifestAbsentAt = new Map();
|
|
23
|
+
const MANIFEST_ABSENT_TTL_MS = 1000;
|
|
24
|
+
|
|
25
|
+
/**
 * Drop every entry from the manifest-absent negative cache, so the next
 * beginPinnedRead without a caller-supplied epoch reads the manifest again.
 */
export function _resetManifestAbsentCache() {
  _manifestAbsentAt.clear();
}
|
|
28
|
+
|
|
29
|
+
/**
 * Start a pinned read against a state directory.
 *
 * @param {Object} [options]
 * @param {string} [options.projectRoot] - used to derive the state dir when `stateDir` is absent
 * @param {string} [options.stateDir] - explicit state directory
 * @param {number|null} [options.epoch] - integer: pin that epoch without reading
 *   the manifest; null: caller already knows there is no pinned epoch (no-op);
 *   anything else: the epoch is taken from the manifest
 * @param {Object} [options.meta] - passed through to beginRead
 * @returns {{stateDir: string, epoch: number, manifest: (Object|null), record: *}|null}
 *   the pin, or null when there is nothing to pin
 */
export function beginPinnedRead({ projectRoot, stateDir, epoch, meta } = {}) {
  // Explicit null means "checked, no pinned epoch": nothing to heartbeat.
  if (epoch === null) return null;

  const resolvedStateDir = stateDir || (projectRoot ? searchStateDir(projectRoot) : null);
  if (!resolvedStateDir) return null;

  let manifest = null;
  let pinnedEpoch = epoch;
  if (!Number.isInteger(epoch)) {
    // Negative cache: skip the manifest read if it was absent very recently.
    const absentAt = _manifestAbsentAt.get(resolvedStateDir);
    const recentlyAbsent =
      absentAt !== undefined && Date.now() - absentAt < MANIFEST_ABSENT_TTL_MS;
    if (recentlyAbsent) return null;

    manifest = readManifest(resolvedStateDir);
    pinnedEpoch = manifest?.epoch;
    if (!Number.isInteger(pinnedEpoch)) {
      _manifestAbsentAt.set(resolvedStateDir, Date.now());
      return null;
    }
  }

  // An epoch is known for this state dir, so clear any stale "absent" marker.
  _manifestAbsentAt.delete(resolvedStateDir);
  const record = beginRead(resolvedStateDir, pinnedEpoch, meta || {});
  return { stateDir: resolvedStateDir, epoch: pinnedEpoch, manifest, record };
}
|
|
58
|
+
|
|
59
|
+
/**
 * Finish a pinned read started by beginPinnedRead.
 *
 * @param {{stateDir: string, record: *}|null|undefined} pin - a falsy pin is a no-op
 */
export function endPinnedRead(pin) {
  if (pin) {
    endRead(pin.stateDir, pin.record);
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * Run `fn` between beginPinnedRead and endPinnedRead.
 *
 * @param {Object} options - forwarded to beginPinnedRead
 * @param {function((number|null), (Object|null)): *} fn - called with the
 *   pinned epoch (null when there is no pin) and the pin itself
 * @returns {Promise<*>} the awaited result of `fn`
 */
export async function withPinnedRead(options, fn) {
  const pin = beginPinnedRead(options);
  const pinnedEpoch = pin?.epoch ?? null;
  try {
    return await fn(pinnedEpoch, pin);
  } finally {
    // Runs whether fn resolved or threw.
    endPinnedRead(pin);
  }
}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-query parallel BM25F + Reciprocal Rank Fusion (RRF) tail fallback.
|
|
3
|
+
*
|
|
4
|
+
* Applies when the normal hybrid pipeline (lexical + semantic + CC fusion
|
|
5
|
+
* + IAR + post-fusion boosts + demotions + MMR + existing rewrite-retry)
|
|
6
|
+
* still leaves results weak — empty, low-confidence top-1, or no source
|
|
7
|
+
* file in top-3.
|
|
8
|
+
*
|
|
9
|
+
* Why this design:
|
|
10
|
+
* - Long natural-language queries get tokenized by FTS5's sanitizer
|
|
11
|
+
* into AND-of-many-tokens (`"how" "does" "Fastify" "compile" ...`),
|
|
12
|
+
* and no chunk has all those tokens. Result: zero hits.
|
|
13
|
+
* - SOTA in 2025-2026 (Cognition SWE-grep, Polarity Omnigrep, Cody
|
|
14
|
+
* Deep Search, T2-RAGBench) is multi-query parallel retrieval with
|
|
15
|
+
* RRF fusion — fire one BM25 per content keyword, fuse by rank.
|
|
16
|
+
* - RRF (Cormack 2009) is corpus-agnostic and avoids the per-keyword
|
|
17
|
+
* score-normalization trap. A chunk that ranks high in MULTIPLE
|
|
18
|
+
* per-keyword queries floats up; a chunk that only matches one
|
|
19
|
+
* noisy keyword (e.g. "time" → setTimeout) stays mid-pack because
|
|
20
|
+
* it has a single 1/(k+rank) contribution.
|
|
21
|
+
*
|
|
22
|
+
* Why NOT a hand-curated stopword list:
|
|
23
|
+
* The earlier draft (Proposal C v1) added "time", "data", "value" etc.
|
|
24
|
+
* to a stopword list because Q4 ("registration time") matched
|
|
25
|
+
* `setTimeout`. That's the Clever Hans / corpus-overfit anti-pattern
|
|
26
|
+
* per the Mitra & Craswell neural-IR survey and Vespa's WAND article.
|
|
27
|
+
* RRF handles this structurally: "time" matches noisy chunks at
|
|
28
|
+
* rank 1, but those chunks DON'T also match "compile" + "schemas",
|
|
29
|
+
* so their RRF score stays low compared to a chunk that hits all
|
|
30
|
+
* three.
|
|
31
|
+
*
|
|
32
|
+
* Disable via `ablations: new Set(['no-rrf-fallback'])`.
|
|
33
|
+
*
|
|
34
|
+
* References:
|
|
35
|
+
* - Cormack et al. "Reciprocal Rank Fusion outperforms Condorcet and
|
|
36
|
+
* individual rank learning methods", SIGIR 2009.
|
|
37
|
+
* - Cognition SWE-grep blog (Oct 2025).
|
|
38
|
+
* - T2-RAGBench multi-query+RRF, EACL 2026.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
import { detectFileKind } from '../ranking/file-kind-ranking.js';
|
|
42
|
+
|
|
43
|
+
// Question-scaffolding stopwords ONLY. Generic English nouns (time, data,
|
|
44
|
+
// state, mode, value, etc.) intentionally NOT in this list — they may be
|
|
45
|
+
// the actual concept the user is asking about, and RRF naturally demotes
|
|
46
|
+
// them when they're noise (one rank-1 hit can't beat two rank-mid hits).
|
|
47
|
+
const QUERY_SCAFFOLD_STOPWORDS = new Set([
|
|
48
|
+
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'could',
|
|
49
|
+
'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have',
|
|
50
|
+
'how', 'in', 'into', 'is', 'it', 'its', 'of', 'on', 'or', 'should',
|
|
51
|
+
'so', 'than', 'that', 'the', 'their', 'them', 'this', 'those',
|
|
52
|
+
'to', 'too', 'use', 'using', 'was', 'were', 'what', 'when', 'where',
|
|
53
|
+
'whether', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
|
|
54
|
+
'would', 'you', 'your',
|
|
55
|
+
]);
|
|
56
|
+
|
|
57
|
+
// Standard RRF k constant from Cormack 2009. Higher k flattens the
|
|
58
|
+
// rank-position curve (less weight for top hits); lower k sharpens it.
|
|
59
|
+
// 60 is the published default and what most production systems use.
|
|
60
|
+
const RRF_K = 60;
|
|
61
|
+
|
|
62
|
+
const DEFAULT_PER_KEYWORD_LIMIT = 30;
|
|
63
|
+
const DEFAULT_CONFIDENCE_FLOOR = 0.35;
|
|
64
|
+
|
|
65
|
+
// RRF scores are tiny (~0.01-0.05). Map to a [base, base+range] band so
|
|
66
|
+
// fallback candidates compete mid-pack without overwhelming a strong
|
|
67
|
+
// fused top-1 from the encoder (typically 0.4-0.86).
|
|
68
|
+
const FALLBACK_BASE = 0.40;
|
|
69
|
+
const FALLBACK_RANGE = 0.20;
|
|
70
|
+
|
|
71
|
+
/**
 * Pull candidate keywords out of a query string.
 *
 * Tokens are identifier-like runs (letter or underscore, followed by at least
 * one letter, digit or underscore). Tokens shorter than 3 characters, tokens
 * in QUERY_SCAFFOLD_STOPWORDS, and case-insensitive repeats are dropped. The
 * original casing and first-seen order are kept.
 *
 * @param {*} query - coerced with String(); falsy input yields []
 * @returns {string[]} the surviving tokens
 */
export function extractContentKeywords(query) {
  if (!query) return [];
  const candidates = String(query).match(/[A-Za-z_][A-Za-z0-9_]+/g) || [];
  const seenLower = new Set();
  return candidates.filter((token) => {
    if (token.length < 3) return false;
    const key = token.toLowerCase();
    if (QUERY_SCAFFOLD_STOPWORDS.has(key) || seenLower.has(key)) return false;
    seenLower.add(key);
    return true;
  });
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Decide whether the keyword fallback should run. Tightened (2026-05-05)
|
|
89
|
+
* after a 20-query probe found the original triggers fired too eagerly.
|
|
90
|
+
*
|
|
91
|
+
* The earlier two-clause trigger (`low_confidence` ∨ `no_source_in_top3`)
|
|
92
|
+
* caused regressions on queries where the encoder DID produce a real
|
|
93
|
+
* named source symbol just below the score floor (e.g. `getServerInstance`
|
|
94
|
+
* at score 0.32 lost to a 1-line `[typeAlias: HttpKeys]` injected by RRF).
|
|
95
|
+
*
|
|
96
|
+
* New rule: RRF fires only when top-3 has NO "good source candidate" —
|
|
97
|
+
* defined as an implementation-file chunk with a real named entity. That
|
|
98
|
+
* captures the genuine "retrieval is lost" case (only docs / tests /
|
|
99
|
+
* unlabelled chunks) without sacrificing borderline-confidence wins.
|
|
100
|
+
*
|
|
101
|
+
* - empty → fire (always)
|
|
102
|
+
* - top-1 in docs/tests AND no good source candidate → fire
|
|
103
|
+
* - all top-3 are unlabelled chunks (no symbol name) → fire
|
|
104
|
+
* - otherwise → don't fire
|
|
105
|
+
*
|
|
106
|
+
* The previous standalone `low_confidence` trigger (top-1 score < floor)
|
|
107
|
+
* was removed — encoder scores below 0.35 are common on long NL queries
|
|
108
|
+
* even when the answer IS the encoder's top-1.
|
|
109
|
+
*/
|
|
110
|
+
export function shouldRunFallback(results, opts = {}) {
  if (!Array.isArray(results) || results.length === 0) return 'empty';

  // Look for one top-3 hit that is both in an implementation file and carries
  // a non-blank name; the first such hit means the fallback is not needed.
  for (const candidate of results.slice(0, 3)) {
    const file = candidate.metadata?.file || candidate.file || candidate.file_path || '';
    if (detectFileKind(file) !== 'implementation') continue;
    const name = candidate.metadata?.name || candidate.name;
    if (name && String(name).trim().length > 0) return null;
  }
  return 'no_good_source_in_top3';
}
|
|
122
|
+
|
|
123
|
+
/**
 * Build the identity key `file|startLine|endLine` for a search hit.
 * Values under `metadata` take precedence over the top-level fields.
 *
 * @param {Object} r - search hit
 * @returns {string} the key; missing parts render as 'undefined'
 */
function chunkKey(r) {
  const meta = r.metadata || {};
  const filePath = meta.file || r.file || r.file_path;
  const firstLine = meta.startLine ?? r.startLine;
  const lastLine = meta.endLine ?? r.endLine;
  return [filePath, firstLine, lastLine].map((part) => String(part)).join('|');
}
|
|
130
|
+
|
|
131
|
+
/**
 * Compute Reciprocal Rank Fusion across per-keyword BM25 result lists.
 *
 * For each chunk, RRF score = sum over all keywords k of 1 / (RRF_K + rank_k)
 * where rank_k is the chunk's 1-indexed position in keyword k's results.
 * Chunks not present in a keyword's results contribute 0 for that keyword.
 *
 * This naturally rewards chunks that appear in MULTIPLE per-keyword queries
 * over chunks that only appear at rank 1 of a single noisy keyword.
 *
 * Chunks are identified by chunkKey(); the first hit seen for a key is the
 * one kept as `result`.
 *
 * @param {Array<Array>} perKeywordResults - one array of BM25 hits per keyword;
 *   a missing (falsy) list is treated as empty
 * @returns {Map<string, { result: Object, rrf: number, keywordsHit: Set<number> }>}
 *   map from chunk key to its fused entry; `keywordsHit` holds the indices
 *   (into perKeywordResults) of the keyword lists the chunk appeared in
 */
export function fuseRRF(perKeywordResults) {
  const acc = new Map();
  for (let kIdx = 0; kIdx < perKeywordResults.length; kIdx++) {
    const list = perKeywordResults[kIdx] || [];
    for (let pos = 0; pos < list.length; pos++) {
      const item = list[pos];
      const key = chunkKey(item);
      let entry = acc.get(key);
      if (!entry) {
        entry = { result: item, rrf: 0, keywordsHit: new Set() };
        acc.set(key, entry);
      }
      const rank = pos + 1; // 1-indexed
      entry.rrf += 1 / (RRF_K + rank);
      entry.keywordsHit.add(kIdx);
    }
  }
  return acc;
}
|
|
167
|
+
|
|
168
|
+
/**
 * Run multi-query BM25F + RRF fallback against the existing fused list.
 *
 * Fires when shouldRunFallback returns a reason. Extracts content
 * keywords, fires one BM25F query per keyword via the existing
 * `graphSearch.bm25SearchRaw`, fuses the per-keyword lists with RRF,
 * rescales RRF scores into [FALLBACK_BASE, FALLBACK_BASE + FALLBACK_RANGE],
 * and merges into the existing candidate set.
 *
 * Side effect: candidates already present in `fused` whose score is lower
 * than their fallback score are updated IN PLACE (`score`, `_rrfBoosted`,
 * `_rrfHits`). New candidates are added as shallow copies tagged with
 * `searchPath: 'rrf-fallback'`.
 *
 * A non-array `fused` (e.g. null) is treated as an empty candidate list when
 * merging; on every "nothing to do" path `fused` is returned untouched.
 *
 * @param {Array} fused - current candidate list
 * @param {string} query
 * @param {object} opts
 * @param {Set<string>|Array<string>} [opts.ablations] - contains 'no-rrf-fallback' to disable
 * @param {object} [opts.searcher] - must expose `graphSearch.bm25SearchRaw(keyword, limit)`
 * @param {number} [opts.perKeywordLimit] - hits per keyword, clamped to [10, 50]
 * @param {number} [opts.injectCap] - max fused chunks considered, clamped to [5, 30] (default 20)
 * @returns {Promise<{ results: Array, stats: object }>} stats carries
 *   `reason`, `keywords`, `injected`, `boosted` and `fusedCount`
 */
export async function runRRFFallback(fused, query, opts = {}) {
  // Shape of every "candidate list left as-is" outcome; callers override the
  // stats fields that are known at the point of the early return.
  const unchanged = (stats = {}) => ({
    results: fused,
    stats: { reason: null, keywords: [], injected: 0, boosted: 0, fusedCount: 0, ...stats },
  });

  const ablations = opts.ablations;
  const disabled = ablations instanceof Set
    ? ablations.has('no-rrf-fallback')
    : Array.isArray(ablations) && ablations.includes('no-rrf-fallback');
  if (disabled) return unchanged();

  const reason = shouldRunFallback(fused, opts);
  if (!reason) return unchanged();

  // A single keyword gives RRF nothing to fuse, so bail out.
  const keywords = extractContentKeywords(query);
  if (keywords.length < 2) return unchanged({ reason, keywords });

  const graphSearch = opts.searcher?.graphSearch;
  if (!graphSearch || typeof graphSearch.bm25SearchRaw !== 'function') {
    return unchanged({ reason, keywords });
  }

  // Fire BM25F per keyword in parallel. The existing bm25SearchRaw
  // handles the AND/prefix/trigram cascade for each individual keyword,
  // and uses the 4-column BM25F (`bm25(entities_fts, 10.0, 4.0, 5.0, 1.0)`).
  const perKeywordLimit = Math.max(10, Math.min(50, opts.perKeywordLimit ?? DEFAULT_PER_KEYWORD_LIMIT));
  const perKeyword = await Promise.all(
    keywords.map(async (kw) => {
      try {
        const r = await graphSearch.bm25SearchRaw(kw, perKeywordLimit);
        return r?.results || [];
      } catch {
        // Best-effort: a failing keyword query simply contributes no hits.
        return [];
      }
    })
  );

  const fusedMap = fuseRRF(perKeyword);
  if (fusedMap.size === 0) return unchanged({ reason, keywords });

  // Sort by RRF score descending; cap the number we inject to avoid
  // flooding the candidate set when many chunks have small RRF scores.
  const ranked = [...fusedMap.values()].sort((a, b) => b.rrf - a.rrf);
  const injectCap = Math.max(5, Math.min(30, opts.injectCap ?? 20));
  const top = ranked.slice(0, injectCap);

  // Normalize RRF scores to [FALLBACK_BASE, FALLBACK_BASE + FALLBACK_RANGE]
  const maxRrf = top[0]?.rrf || 0;
  if (maxRrf <= 0) return unchanged({ reason, keywords, fusedCount: fusedMap.size });

  // shouldRunFallback reports 'empty' for a non-array `fused`, so the list
  // may be null/undefined here; iterate an empty list instead of throwing.
  const existing = Array.isArray(fused) ? fused : [];
  const existingByKey = new Map();
  for (const r of existing) existingByKey.set(chunkKey(r), r);

  let injected = 0;
  let boosted = 0;
  const additions = [];

  for (const { result, rrf, keywordsHit } of top) {
    const fallbackScore = FALLBACK_BASE + FALLBACK_RANGE * (rrf / maxRrf);
    const exists = existingByKey.get(chunkKey(result));

    if (exists) {
      // Only ever raise an existing candidate's score, never lower it.
      if ((exists.score || 0) < fallbackScore) {
        exists.score = fallbackScore;
        exists._rrfBoosted = true;
        exists._rrfHits = keywordsHit.size;
        boosted++;
      }
      continue;
    }

    additions.push({
      ...result,
      searchPath: 'rrf-fallback',
      score: fallbackScore,
      _rrfFallback: true,
      _rrfHits: keywordsHit.size,
      _rrfRaw: rrf,
    });
    injected++;
  }

  if (injected === 0 && boosted === 0) {
    return unchanged({ reason, keywords, fusedCount: fusedMap.size });
  }

  const merged = [...existing, ...additions].sort((a, b) => (b.score || 0) - (a.score || 0));
  return {
    results: merged,
    stats: { reason, keywords, injected, boosted, fusedCount: fusedMap.size },
  };
}
|