sweet-search 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -30,10 +30,13 @@
|
|
|
30
30
|
*/
|
|
31
31
|
|
|
32
32
|
import path from 'node:path';
|
|
33
|
+
import fs from 'node:fs';
|
|
33
34
|
import { CodebaseRepository } from '../infrastructure/codebase-repository.js';
|
|
34
|
-
import { DB_PATHS, LATE_INTERACTION_CONFIG } from '../infrastructure/config/index.js';
|
|
35
|
+
import { DB_PATHS, LATE_INTERACTION_CONFIG, PROJECT_ROOT } from '../infrastructure/config/index.js';
|
|
35
36
|
import { applyPersistedLiModel } from '../infrastructure/init-config.js';
|
|
36
37
|
import { readFile as readFileExact } from './search-read.js';
|
|
38
|
+
import { withPinnedRead } from './search-reader-pin.js';
|
|
39
|
+
import { emitToolIdentityAuto } from './cli-decoration.js';
|
|
37
40
|
|
|
38
41
|
// Applies the user's persisted LI model exactly once per (projectRoot, env)
|
|
39
42
|
// pair so encodeQuery/_getLateInteractionIndex below see the right variant.
|
|
@@ -62,6 +65,21 @@ const DEFAULTS = {
|
|
|
62
65
|
lexicalWeight: 1.0,
|
|
63
66
|
symbolWeight: 1.5, // symbol-name hits are stronger evidence per-file
|
|
64
67
|
maxsimWeight: 1.6, // late interaction wins ties
|
|
68
|
+
// Demotion factors applied to the final re-rank score (after MaxSim re-rank).
|
|
69
|
+
// Stage 3 diagnosis (2026-05-13, PHASE6_REDO ss-semantic) found:
|
|
70
|
+
// - chunks with null/unknown symbol metadata frequently win top-1 when
|
|
71
|
+
// they're really file-header fragments or unnamed code blocks
|
|
72
|
+
// (CPP-002, RB-001, C-005, PY-004 dev failures)
|
|
73
|
+
// - tiny chunks (≤ 5 lines) inflate MaxSim by concentrating literal
|
|
74
|
+
// token presence in a small window (RB-001 `module Sinatra`,
|
|
75
|
+
// C-005 single-line `redisContext *redisConnectWithOptions(...)`).
|
|
76
|
+
// Multiplicative demotion at the final-rank stage is conservative: the
|
|
77
|
+
// chunk is still returned, just less likely to be top-1. Tunable; 0.85
|
|
78
|
+
// was chosen by inspecting per-failure score margins (typical wrong-vs-
|
|
79
|
+
// gold gap is 0.01-0.04, so 0.85 reliably flips the cases identified).
|
|
80
|
+
unsymboledDemote: 0.85,
|
|
81
|
+
smallChunkDemote: 0.85,
|
|
82
|
+
smallChunkMaxLines: 5,
|
|
65
83
|
};
|
|
66
84
|
|
|
67
85
|
const APPROX_CHARS_PER_TOKEN = 4;
|
|
@@ -70,36 +88,155 @@ const APPROX_CHARS_PER_TOKEN = 4;
|
|
|
70
88
|
// Module-level lazy singletons
|
|
71
89
|
// ---------------------------------------------------------------------------
|
|
72
90
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
91
|
+
const RECONCILE_MANIFEST_FILENAME = 'reconcile-manifest.json';
|
|
92
|
+
|
|
93
|
+
function _projectKey(projectRoot) {
|
|
94
|
+
return path.resolve(projectRoot || PROJECT_ROOT || process.cwd());
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
function _dataDirName() {
|
|
98
|
+
const dir = path.basename(path.dirname(DB_PATHS.codebase || ''));
|
|
99
|
+
return dir && dir !== '.' ? dir : '.sweet-search';
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
function _stateDirForProject(projectRoot) {
|
|
103
|
+
const root = _projectKey(projectRoot);
|
|
104
|
+
if (root === path.resolve(PROJECT_ROOT)) return path.dirname(DB_PATHS.codebase);
|
|
105
|
+
return path.join(root, _dataDirName());
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
function _codebasePathForProject(projectRoot, manifest = null) {
|
|
109
|
+
const descriptor = manifest?.vectors?.path || manifest?.vectors?.dbPath;
|
|
110
|
+
if (descriptor) {
|
|
111
|
+
return _resolveStatePath(projectRoot, descriptor);
|
|
112
|
+
}
|
|
113
|
+
return _defaultCodebasePathForProject(projectRoot);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
function _defaultCodebasePathForProject(projectRoot) {
|
|
117
|
+
const root = _projectKey(projectRoot);
|
|
118
|
+
if (root === path.resolve(PROJECT_ROOT)) return DB_PATHS.codebase;
|
|
119
|
+
return path.join(_stateDirForProject(root), 'codebase.db');
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
function _readReconcileManifest(projectRoot) {
|
|
123
|
+
try {
|
|
124
|
+
const manifest = JSON.parse(
|
|
125
|
+
fs.readFileSync(path.join(_stateDirForProject(projectRoot), RECONCILE_MANIFEST_FILENAME), 'utf-8'),
|
|
126
|
+
);
|
|
127
|
+
return Number.isInteger(manifest?.epoch) ? manifest : null;
|
|
128
|
+
} catch {
|
|
129
|
+
return null;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
function _resolveStatePath(projectRoot, filePath) {
|
|
134
|
+
if (!filePath) return null;
|
|
135
|
+
if (path.isAbsolute(filePath)) return filePath;
|
|
136
|
+
return path.join(_stateDirForProject(projectRoot), filePath);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
function _lateInteractionIndexPath(projectRoot, manifest) {
|
|
140
|
+
const descriptor = manifest?.lateInteraction?.path
|
|
141
|
+
|| manifest?.lateInteraction?.indexPath
|
|
142
|
+
|| manifest?.lateInteraction?.manifest;
|
|
143
|
+
if (descriptor) {
|
|
144
|
+
const resolved = _resolveStatePath(projectRoot, descriptor);
|
|
145
|
+
const segmentDir = path.dirname(resolved);
|
|
146
|
+
return segmentDir.endsWith('.segments')
|
|
147
|
+
? segmentDir.slice(0, -'.segments'.length)
|
|
148
|
+
: resolved;
|
|
149
|
+
}
|
|
150
|
+
const root = _projectKey(projectRoot);
|
|
151
|
+
if (root === path.resolve(PROJECT_ROOT)) return DB_PATHS.lateInteraction;
|
|
152
|
+
if (DB_PATHS.lateInteraction && fs.existsSync(DB_PATHS.lateInteraction)) {
|
|
153
|
+
return DB_PATHS.lateInteraction;
|
|
78
154
|
}
|
|
79
|
-
return
|
|
155
|
+
return path.join(_stateDirForProject(root), path.basename(DB_PATHS.lateInteraction));
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
function _sourceStaleness(projectRoot, filePathRel) {
|
|
159
|
+
const manifest = _readReconcileManifest(projectRoot);
|
|
160
|
+
const publishedMs = Date.parse(manifest?.publishedAt || '');
|
|
161
|
+
if (!Number.isFinite(publishedMs)) return null;
|
|
162
|
+
try {
|
|
163
|
+
const abs = path.isAbsolute(filePathRel)
|
|
164
|
+
? filePathRel
|
|
165
|
+
: path.resolve(projectRoot, filePathRel);
|
|
166
|
+
const stat = fs.statSync(abs);
|
|
167
|
+
if (stat.mtimeMs <= publishedMs) return null;
|
|
168
|
+
return {
|
|
169
|
+
stale: true,
|
|
170
|
+
indexEpoch: manifest.epoch,
|
|
171
|
+
indexPublishedAt: manifest.publishedAt,
|
|
172
|
+
sourceMtime: stat.mtime.toISOString(),
|
|
173
|
+
warning: 'source file is newer than the semantic index; spans were selected from stale index metadata and text was reread from disk',
|
|
174
|
+
};
|
|
175
|
+
} catch {
|
|
176
|
+
return null;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
const _repos = new Map();
|
|
181
|
+
function _getRepo(projectRoot) {
|
|
182
|
+
const key = _projectKey(projectRoot);
|
|
183
|
+
const manifest = _readReconcileManifest(projectRoot);
|
|
184
|
+
const dbPath = _codebasePathForProject(projectRoot, manifest);
|
|
185
|
+
const baseDbPath = _defaultCodebasePathForProject(projectRoot);
|
|
186
|
+
let entry = _repos.get(key);
|
|
187
|
+
if (!entry || entry.dbPath !== dbPath || entry.baseDbPath !== baseDbPath) {
|
|
188
|
+
entry?.repo?.close?.();
|
|
189
|
+
try {
|
|
190
|
+
entry = { dbPath, baseDbPath, repo: new CodebaseRepository(baseDbPath) };
|
|
191
|
+
_repos.set(key, entry);
|
|
192
|
+
} catch {
|
|
193
|
+
return null;
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
const repo = entry.repo;
|
|
197
|
+
repo.refreshManifestEpoch?.();
|
|
198
|
+
return repo;
|
|
80
199
|
}
|
|
81
200
|
|
|
82
201
|
let _liIndex = null;
|
|
83
202
|
let _liInitPromise = null;
|
|
84
|
-
|
|
85
|
-
|
|
203
|
+
let _liProjectKey = null;
|
|
204
|
+
let _liManifestEpoch = null;
|
|
205
|
+
async function _getLateInteractionIndex(projectRoot) {
|
|
206
|
+
const projectKey = _projectKey(projectRoot);
|
|
207
|
+
const manifest = _readReconcileManifest(projectRoot);
|
|
208
|
+
const manifestEpoch = Number.isInteger(manifest?.epoch) ? manifest.epoch : null;
|
|
209
|
+
const samePin = _liProjectKey === projectKey && _liManifestEpoch === manifestEpoch;
|
|
210
|
+
if (_liIndex !== null && samePin) return _liIndex || null;
|
|
211
|
+
if (_liIndex !== null && !samePin) {
|
|
212
|
+
_liIndex = null;
|
|
213
|
+
_liInitPromise = null;
|
|
214
|
+
}
|
|
86
215
|
if (_liInitPromise) return _liInitPromise;
|
|
87
216
|
if (!LATE_INTERACTION_CONFIG?.enabled) return null;
|
|
88
217
|
_liInitPromise = (async () => {
|
|
89
218
|
try {
|
|
90
219
|
const { LateInteractionIndex } = await import('../ranking/late-interaction-index.js');
|
|
91
|
-
const idx = new LateInteractionIndex({
|
|
220
|
+
const idx = new LateInteractionIndex({
|
|
221
|
+
indexPath: _lateInteractionIndexPath(projectRoot, manifest),
|
|
222
|
+
});
|
|
92
223
|
await idx.init();
|
|
93
224
|
// If the index is empty (no segments, no docs), treat as unavailable —
|
|
94
225
|
// saves a noisy warning later when scoreWithLateInteraction runs.
|
|
95
226
|
if (!idx.documents || idx.documents.size === 0) {
|
|
96
227
|
_liIndex = false;
|
|
228
|
+
_liProjectKey = projectKey;
|
|
229
|
+
_liManifestEpoch = manifestEpoch;
|
|
97
230
|
return null;
|
|
98
231
|
}
|
|
99
232
|
_liIndex = idx;
|
|
233
|
+
_liProjectKey = projectKey;
|
|
234
|
+
_liManifestEpoch = manifestEpoch;
|
|
100
235
|
return idx;
|
|
101
236
|
} catch {
|
|
102
237
|
_liIndex = false;
|
|
238
|
+
_liProjectKey = projectKey;
|
|
239
|
+
_liManifestEpoch = manifestEpoch;
|
|
103
240
|
return null;
|
|
104
241
|
} finally {
|
|
105
242
|
_liInitPromise = null;
|
|
@@ -130,7 +267,26 @@ function _projectRelative(absOrRelPath, projectRoot) {
|
|
|
130
267
|
? absOrRelPath
|
|
131
268
|
: path.resolve(root, absOrRelPath);
|
|
132
269
|
const rel = path.relative(root, abs);
|
|
133
|
-
|
|
270
|
+
const normalized = _normalizeRelativePath(rel);
|
|
271
|
+
if (normalized) return normalized;
|
|
272
|
+
try {
|
|
273
|
+
const realRel = path.relative(
|
|
274
|
+
fs.realpathSync.native(root),
|
|
275
|
+
fs.realpathSync.native(abs),
|
|
276
|
+
);
|
|
277
|
+
return _normalizeRelativePath(realRel) || abs;
|
|
278
|
+
} catch {
|
|
279
|
+
return abs;
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
function _normalizeRelativePath(rel) {
|
|
284
|
+
const normalized = rel.replace(/\\/g, '/').replace(/^\.\//, '');
|
|
285
|
+
if (!normalized || normalized === '..' || normalized.startsWith('../') || normalized.includes('/../')) {
|
|
286
|
+
return null;
|
|
287
|
+
}
|
|
288
|
+
if (path.isAbsolute(normalized)) return null;
|
|
289
|
+
return normalized;
|
|
134
290
|
}
|
|
135
291
|
|
|
136
292
|
function _parseMeta(rawMeta) {
|
|
@@ -176,7 +332,7 @@ function _escapeRegex(s) {
|
|
|
176
332
|
// ---------------------------------------------------------------------------
|
|
177
333
|
|
|
178
334
|
async function _loadFileChunks(filePathRel, projectRoot) {
|
|
179
|
-
const repo = _getRepo();
|
|
335
|
+
const repo = _getRepo(projectRoot);
|
|
180
336
|
if (!repo) return { chunks: [], language: null };
|
|
181
337
|
const rows = repo.getChunksByFilePath(filePathRel);
|
|
182
338
|
if (rows.length === 0) return { chunks: [], language: null };
|
|
@@ -268,24 +424,38 @@ function _scoreSymbol(chunks, queryTerms, queryRaw) {
|
|
|
268
424
|
const sym = (c.symbol || '').toLowerCase();
|
|
269
425
|
if (!sym) continue;
|
|
270
426
|
let s = 0;
|
|
271
|
-
|
|
427
|
+
// Word-boundary match prevents short symbols from collecting +2 just for
|
|
428
|
+
// being a substring of an unrelated longer token in the query. Stage 3
|
|
429
|
+
// PHASE6_REDO diagnosis (2026-05-13) found two ss-semantic dev FAILs
|
|
430
|
+
// (JV-004 `Show getType` → `get` chunk got +2 from "get" ⊂ "gettype";
|
|
431
|
+
// LU-001 `trace _class metatable` → `class` chunk got +2 from "class"
|
|
432
|
+
// ⊂ "_class") where this substring-rule over-credited the wrong chunk.
|
|
433
|
+
// The word-boundary form still credits genuine mentions (e.g., "query"
|
|
434
|
+
// as a real query token still matches `query`-symbol chunks — ZG-001
|
|
435
|
+
// ambiguity is preserved). Structural rule, no per-language signal,
|
|
436
|
+
// no stopword growth.
|
|
437
|
+
const reBoundary = new RegExp(`(?:^|[^a-zA-Z0-9_])${_escapeRegex(sym)}(?=[^a-zA-Z0-9_]|$)`, 'i');
|
|
438
|
+
if (sym && reBoundary.test(lowerRaw)) s += 2; // query mentions symbol as a word
|
|
272
439
|
for (const t of queryTerms) {
|
|
273
440
|
if (sym === t) s += 3; // exact name match
|
|
274
|
-
else if (sym.includes(t)) s += 1; // substring
|
|
441
|
+
else if (sym.includes(t)) s += 1; // substring (chunk symbol contains query token)
|
|
275
442
|
}
|
|
276
443
|
if (s > 0) scores.set(c.id, s);
|
|
277
444
|
}
|
|
278
445
|
return scores;
|
|
279
446
|
}
|
|
280
447
|
|
|
281
|
-
async function _scoreLateInteraction(chunks, query) {
|
|
448
|
+
async function _scoreLateInteraction(chunks, query, projectRoot) {
|
|
282
449
|
if (chunks.length === 0) return { scores: new Map(), ran: false };
|
|
283
|
-
const liIndex = await _getLateInteractionIndex();
|
|
450
|
+
const liIndex = await _getLateInteractionIndex(projectRoot);
|
|
284
451
|
if (!liIndex) return { scores: new Map(), ran: false };
|
|
285
452
|
|
|
286
|
-
// Only score chunks whose IDs actually appear in the LI index.
|
|
453
|
+
// Only score chunks whose IDs actually appear in the LI index. Use the
|
|
454
|
+
// public availability API so alias pointers and live tombstone sidecars
|
|
455
|
+
// share the same visibility contract as normal search.
|
|
456
|
+
const available = liIndex.hasTokens(chunks.map(c => c.id));
|
|
287
457
|
const candidates = chunks
|
|
288
|
-
.filter(c =>
|
|
458
|
+
.filter(c => available.has(c.id))
|
|
289
459
|
.map(c => ({ id: c.id, score: 0 }));
|
|
290
460
|
if (candidates.length === 0) return { scores: new Map(), ran: false };
|
|
291
461
|
|
|
@@ -457,7 +627,7 @@ function _fallbackSpanFromText(fileText, totalLines, maxChars) {
|
|
|
457
627
|
* @param {boolean} [req.verbose=false] - include timings + signal contributions
|
|
458
628
|
* @returns {Promise<Object>}
|
|
459
629
|
*/
|
|
460
|
-
|
|
630
|
+
async function _readSemanticUnpinned(req) {
|
|
461
631
|
const t0 = performance.now();
|
|
462
632
|
if (!req || !req.path) throw new Error('path is required');
|
|
463
633
|
if (!req.query || !String(req.query).trim()) throw new Error('query is required');
|
|
@@ -465,6 +635,7 @@ export async function readSemantic(req) {
|
|
|
465
635
|
const projectRoot = req.projectRoot || process.cwd();
|
|
466
636
|
_ensurePersistedLiModelApplied(projectRoot);
|
|
467
637
|
const filePathRel = _projectRelative(req.path, projectRoot);
|
|
638
|
+
const staleness = _sourceStaleness(projectRoot, filePathRel);
|
|
468
639
|
|
|
469
640
|
const topK = req.topK ?? DEFAULTS.topK;
|
|
470
641
|
const threshold = req.threshold ?? DEFAULTS.threshold;
|
|
@@ -493,6 +664,7 @@ export async function readSemantic(req) {
|
|
|
493
664
|
spans: fallback.ok ? [_fallbackSpanFromRead(fallback, maxChars)] : [],
|
|
494
665
|
charsReturned: fallback.ok ? Math.min((fallback.text || '').length, maxChars) : 0,
|
|
495
666
|
approxTokensReturned: fallback.ok ? Math.ceil(Math.min((fallback.text || '').length, maxChars) / APPROX_CHARS_PER_TOKEN) : 0,
|
|
667
|
+
...(staleness ? { staleness, warnings: [staleness.warning] } : {}),
|
|
496
668
|
timings: { totalMs: +(performance.now() - t0).toFixed(2) },
|
|
497
669
|
};
|
|
498
670
|
}
|
|
@@ -514,7 +686,7 @@ export async function readSemantic(req) {
|
|
|
514
686
|
const tLex1 = performance.now();
|
|
515
687
|
|
|
516
688
|
const tLi0 = performance.now();
|
|
517
|
-
const { scores: maxsimScores, ran: liRan } = await _scoreLateInteraction(chunks, req.query);
|
|
689
|
+
const { scores: maxsimScores, ran: liRan } = await _scoreLateInteraction(chunks, req.query, projectRoot);
|
|
518
690
|
const tLi1 = performance.now();
|
|
519
691
|
|
|
520
692
|
// Threshold gate on MaxSim — drop chunks whose LI score is too low. This
|
|
@@ -550,6 +722,7 @@ export async function readSemantic(req) {
|
|
|
550
722
|
charsReturned: Math.min(fileText.length, maxChars),
|
|
551
723
|
approxTokensReturned: Math.ceil(Math.min(fileText.length, maxChars) / APPROX_CHARS_PER_TOKEN),
|
|
552
724
|
signals: verbose ? { liRan, lexicalHits: 0, symbolHits: 0, maxsimHits: 0 } : undefined,
|
|
725
|
+
...(staleness ? { staleness, warnings: [staleness.warning] } : {}),
|
|
553
726
|
timings: verbose ? {
|
|
554
727
|
loadMs: +(tLoad1 - tLoad0).toFixed(2),
|
|
555
728
|
lexicalMs: +(tLex1 - tLex0).toFixed(2),
|
|
@@ -568,12 +741,39 @@ export async function readSemantic(req) {
|
|
|
568
741
|
// Final re-rank: prefer late-interaction score when LI ran; otherwise the
|
|
569
742
|
// RRF score is the authority. This mirrors the SOTA pattern (cheap candidate
|
|
570
743
|
// pool → expensive LI re-rank on the survivors).
|
|
744
|
+
//
|
|
745
|
+
// Multiplicative score demotions on null/unknown-symbol chunks and on tiny
|
|
746
|
+
// chunks are applied here so the re-rank below sees the corrected score
|
|
747
|
+
// (Stage 3 PHASE6_REDO ss-semantic, 2026-05-13). Demotion is intentionally
|
|
748
|
+
// applied AFTER the MaxSim re-rank threshold gate above — chunks still
|
|
749
|
+
// survive into the result, they're just less likely to win top-1.
|
|
750
|
+
const unsymDemote = req.unsymboledDemote ?? DEFAULTS.unsymboledDemote;
|
|
751
|
+
const smallDemote = req.smallChunkDemote ?? DEFAULTS.smallChunkDemote;
|
|
752
|
+
const smallChunkMaxLines = req.smallChunkMaxLines ?? DEFAULTS.smallChunkMaxLines;
|
|
753
|
+
|
|
571
754
|
const ranked = fusedTop
|
|
572
755
|
.map(([id, fusedScore]) => {
|
|
573
756
|
const c = idToChunk.get(id);
|
|
574
757
|
if (!c) return null;
|
|
575
758
|
const li = maxsimScores.get(id);
|
|
576
|
-
const
|
|
759
|
+
const baseScore = liRan && li != null ? li : fusedScore;
|
|
760
|
+
// Stage 3 PHASE6_REDO ss-semantic (2026-05-13): demote only the
|
|
761
|
+
// INTERSECTION of (null-or-unknown symbol) AND (≤ smallChunkMaxLines).
|
|
762
|
+
// Earlier OR-form regressed typescript-lib (interface declarations
|
|
763
|
+
// are legitimately small AND symboled; OR-rule demoted them too).
|
|
764
|
+
// The intersection targets exactly the RB-001 pattern — short
|
|
765
|
+
// unnamed code fragments that win MaxSim by concentrated literal
|
|
766
|
+
// tokens (e.g., 3-line `module Sinatra` decl beating the 24-line
|
|
767
|
+
// Base class body). Multiplicative composition gives 0.85*0.85=0.7225x
|
|
768
|
+
// when both conditions fire.
|
|
769
|
+
const symMeta = c.symbol;
|
|
770
|
+
const isUnsymboled = !symMeta || symMeta === 'unknown';
|
|
771
|
+
const chunkLines = c.endLine - c.startLine + 1;
|
|
772
|
+
const isSmall = chunkLines <= smallChunkMaxLines;
|
|
773
|
+
const demoteFactor = (isUnsymboled && isSmall)
|
|
774
|
+
? unsymDemote * smallDemote
|
|
775
|
+
: 1;
|
|
776
|
+
const finalScore = baseScore * demoteFactor;
|
|
577
777
|
return {
|
|
578
778
|
id,
|
|
579
779
|
symbol: c.symbol,
|
|
@@ -586,6 +786,8 @@ export async function readSemantic(req) {
|
|
|
586
786
|
symbol: symbolScores.get(id) || 0,
|
|
587
787
|
maxsim: liRan ? (maxsimScores.get(id) ?? null) : null,
|
|
588
788
|
fused: fusedScore,
|
|
789
|
+
baseScore,
|
|
790
|
+
demoteFactor,
|
|
589
791
|
},
|
|
590
792
|
};
|
|
591
793
|
})
|
|
@@ -607,6 +809,7 @@ export async function readSemantic(req) {
|
|
|
607
809
|
spans,
|
|
608
810
|
charsReturned: charsUsed,
|
|
609
811
|
approxTokensReturned: Math.ceil(charsUsed / APPROX_CHARS_PER_TOKEN),
|
|
812
|
+
...(staleness ? { staleness, warnings: [staleness.warning] } : {}),
|
|
610
813
|
signals: verbose ? {
|
|
611
814
|
liRan,
|
|
612
815
|
lexicalHits: lexicalScores.size,
|
|
@@ -624,6 +827,21 @@ export async function readSemantic(req) {
|
|
|
624
827
|
};
|
|
625
828
|
}
|
|
626
829
|
|
|
830
|
+
export async function readSemantic(req) {
|
|
831
|
+
const projectRoot = req?.projectRoot || process.cwd();
|
|
832
|
+
return withPinnedRead(
|
|
833
|
+
{
|
|
834
|
+
projectRoot,
|
|
835
|
+
meta: {
|
|
836
|
+
tool: 'read-semantic',
|
|
837
|
+
path: req?.path ?? null,
|
|
838
|
+
query: req?.query ? String(req.query).slice(0, 200) : null,
|
|
839
|
+
},
|
|
840
|
+
},
|
|
841
|
+
() => _readSemanticUnpinned({ ...req, projectRoot }),
|
|
842
|
+
);
|
|
843
|
+
}
|
|
844
|
+
|
|
627
845
|
// ---------------------------------------------------------------------------
|
|
628
846
|
// Formatting
|
|
629
847
|
// ---------------------------------------------------------------------------
|
|
@@ -640,6 +858,9 @@ export function formatReadSemanticResult(result, format = 'agent') {
|
|
|
640
858
|
lines.push(`[error]`);
|
|
641
859
|
return lines.join('\n');
|
|
642
860
|
}
|
|
861
|
+
for (const warning of result.warnings || []) {
|
|
862
|
+
lines.push(`[warning] ${warning}`);
|
|
863
|
+
}
|
|
643
864
|
for (const span of result.spans) {
|
|
644
865
|
const label = span.symbols && span.symbols.length
|
|
645
866
|
? `${span.symbols.join(', ')} (lines ${span.startLine}-${span.endLine})`
|
|
@@ -663,10 +884,18 @@ function _parseArgs(args) {
|
|
|
663
884
|
const positional = [];
|
|
664
885
|
let format = 'agent';
|
|
665
886
|
let topK; let threshold; let contextLines; let maxChars; let maxTokens; let verbose = false;
|
|
887
|
+
let plain = false; let noBanner = false;
|
|
666
888
|
for (let i = 0; i < args.length; i++) {
|
|
667
889
|
const a = args[i];
|
|
668
890
|
if (a === '--json') format = 'json';
|
|
669
891
|
else if (a === '--agent') format = 'agent';
|
|
892
|
+
else if (a === '--no-banner') noBanner = true;
|
|
893
|
+
else if (a === '--format' || a.startsWith('--format=')) {
|
|
894
|
+
const v = a === '--format' ? args[++i] : a.slice('--format='.length);
|
|
895
|
+
if (v === 'json' || v === 'agent') format = v;
|
|
896
|
+
else if (v === 'plain') plain = true;
|
|
897
|
+
else throw new Error(`unknown --format value: ${v}`);
|
|
898
|
+
}
|
|
670
899
|
else if (a === '--verbose') verbose = true;
|
|
671
900
|
else if (a === '--top' || a === '--top-k' || a === '-k') topK = +args[++i];
|
|
672
901
|
else if (a === '--threshold') threshold = +args[++i];
|
|
@@ -677,7 +906,7 @@ function _parseArgs(args) {
|
|
|
677
906
|
else if (a.startsWith('--')) throw new Error(`unknown flag: ${a}`);
|
|
678
907
|
else positional.push(a);
|
|
679
908
|
}
|
|
680
|
-
return { positional, format, topK, threshold, contextLines, maxChars, maxTokens, verbose };
|
|
909
|
+
return { positional, format, topK, threshold, contextLines, maxChars, maxTokens, verbose, plain, noBanner };
|
|
681
910
|
}
|
|
682
911
|
|
|
683
912
|
function _printHelp() {
|
|
@@ -694,6 +923,8 @@ function _printHelp() {
|
|
|
694
923
|
' --max-chars <n> Hard cap on returned text (default: 8000)',
|
|
695
924
|
' --max-tokens <n> Convenience cap (~chars/4)',
|
|
696
925
|
' --json Emit JSON',
|
|
926
|
+
' --format <fmt> json | agent | plain (plain = no identity line)',
|
|
927
|
+
' --no-banner Suppress the identity line',
|
|
697
928
|
' --verbose Include timings + per-signal scores',
|
|
698
929
|
'',
|
|
699
930
|
].join('\n'));
|
|
@@ -719,6 +950,9 @@ export async function handleReadSemanticCli(args) {
|
|
|
719
950
|
maxTokens: parsed.maxTokens,
|
|
720
951
|
verbose: parsed.verbose,
|
|
721
952
|
});
|
|
953
|
+
if (parsed.format !== 'json') {
|
|
954
|
+
emitToolIdentityAuto('read-semantic', `${file} · "${query}"`, { plain: parsed.plain, noBanner: parsed.noBanner });
|
|
955
|
+
}
|
|
722
956
|
process.stdout.write(formatReadSemanticResult(result, parsed.format));
|
|
723
957
|
if (parsed.format !== 'json') process.stdout.write('\n');
|
|
724
958
|
process.exit(result.ok ? 0 : 1);
|
|
@@ -726,9 +960,12 @@ export async function handleReadSemanticCli(args) {
|
|
|
726
960
|
|
|
727
961
|
// Test-only export — clears caches between unit tests.
|
|
728
962
|
export function __resetReadSemanticCachesForTests() {
|
|
729
|
-
|
|
963
|
+
for (const entry of _repos.values()) entry?.repo?.close?.();
|
|
964
|
+
_repos.clear();
|
|
730
965
|
_liIndex = null;
|
|
731
966
|
_liInitPromise = null;
|
|
967
|
+
_liProjectKey = null;
|
|
968
|
+
_liManifestEpoch = null;
|
|
732
969
|
_encodeQueryFn = null;
|
|
733
970
|
_appliedLiPerRoot.clear();
|
|
734
971
|
}
|