sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
import { readFileSync } from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
|
|
1
4
|
/**
|
|
2
5
|
* Intent-aware file-kind ranking (conservative variant).
|
|
3
6
|
*
|
|
@@ -40,8 +43,62 @@
|
|
|
40
43
|
*/
|
|
41
44
|
|
|
42
45
|
const DOCS_RE = /\.md$|\.mdx$|\.rst$|(?:^|\/)docs?\//i;
|
|
43
|
-
|
|
46
|
+
// Optional leading underscore covers Go's `_examples/` build-excluded
|
|
47
|
+
// convention (go-chi, gin, gorilla, etc. — `go build` ignores any path
|
|
48
|
+
// component starting with `_`). The non-underscore form catches the
|
|
49
|
+
// generic `examples/`/`example/` directory shape across all languages.
|
|
50
|
+
// Verified across the AST-tester repo set: only `_examples/` directories
|
|
51
|
+
// follow the examples-pollution pattern; other leading-underscore dirs
|
|
52
|
+
// (Sphinx `_static`, Rust `_typeshed`, test fixture `_unrelated`, etc.)
|
|
53
|
+
// fall under existing docs / test demotion or are legitimate sources.
|
|
54
|
+
const EXAMPLES_RE = /(?:^|\/)_?examples?\//i;
|
|
55
|
+
// Tests directory patterns. Includes the standard tests?/spec/__tests__/__mocks__
|
|
56
|
+
// plus integration/, e2e/, fixtures?/, cypress/, playwright/ — common test-fixture
|
|
57
|
+
// directory conventions across JS/Python/Rust/Go that shipped without TESTS_RE
|
|
58
|
+
// catching them (e.g. fastify integration/server.js was mis-classified as
|
|
59
|
+
// 'implementation' until this update 2026-05-07).
|
|
60
|
+
// 2026-05-13: added `(?:^|/)test\.(c|cpp|cc|cxx|h|hpp|hh|py|rb|go|rs|js|ts|jsx|tsx|java|kt|scala|php|lua|zig|dart|ex|exs|swift|mjs|cjs|cs)$`
|
|
61
|
+
// to catch bare `test.<ext>` files at repo root or any directory level
|
|
62
|
+
// (hiredis convention — C-005 in ast-tester probes). Restricted to code
|
|
63
|
+
// extensions only so `test.html`, `test.json`, `test.yaml` (legitimate
|
|
64
|
+
// fixtures) aren't misclassified.
|
|
65
|
+
const TESTS_RE = /(?:^|\/)(?:tests?|spec|integration|e2e|fixtures?|__tests__|__mocks__|cypress|playwright)\/|\.test\.[a-z0-9]+$|_test\.[a-z0-9]+$|\.spec\.[a-z0-9]+$|_spec\.[a-z0-9]+$|\.e2e\.[a-z0-9]+$|_e2e\.[a-z0-9]+$|(?:^|\/)test\.(?:c|cpp|cc|cxx|h|hpp|hh|py|rb|go|rs|js|ts|jsx|tsx|java|kt|scala|php|lua|zig|dart|ex|exs|swift|mjs|cjs|cs)$/i;
|
|
44
66
|
const TYPES_RE = /\.d\.ts$|(?:^|\/)types\//i;
|
|
67
|
+
// Ancillary files: configuration, lockfiles, CI manifests, container build
|
|
68
|
+
// definitions. 2026-05-07 added Dockerfile / Containerfile / .dockerignore
|
|
69
|
+
// after FreshStack uv UV-FLOW-2 surfaced a Dockerfile as top-1 for "what
|
|
70
|
+
// happens end-to-end when I run uv sync". Containerfile descriptors are not
|
|
71
|
+
// implementation code; demote consistently with .yaml/.toml/Cargo.lock
|
|
72
|
+
// siblings.
|
|
73
|
+
//
|
|
74
|
+
// NOTE: Deliberately NOT including `Makefile` / `GNUmakefile` here even
|
|
75
|
+
// though they are also build-orchestration. Probe S6-Q6 (gin) regressed
|
|
76
|
+
// PASS→PARTIAL when gin's `Makefile` was demoted: classifying it shifted
|
|
77
|
+
// the file-kind window's `demotableCount`, which cascaded through the
|
|
78
|
+
// rerank into a different gin.go top-1 chunk pick. Treating Makefile as
|
|
79
|
+
// implementation is the safer default — it rarely competes with real source
|
|
80
|
+
// for top-1 anyway. Re-evaluate if a future probe shows Makefile actually
|
|
81
|
+
// poisoning a top-1 result.
|
|
82
|
+
const ANCILLARY_RE = /(?:^|\/)\.(?:github|gitlab|circleci|vscode|cursor)\/|(?:^|\/)(?:package(?:-lock)?\.json|pnpm-lock\.yaml|yarn\.lock|Cargo\.lock|Gemfile\.lock|Dockerfile(?:\.[\w.-]+)?|Containerfile|\.dockerignore)$|\.(?:ya?ml|jsonc?|toml|ini|cfg|conf|lock|xml|csv|dockerfile)$/i;
|
|
83
|
+
const DECLARATION_RE = /\b(function|class|struct|interface|enum|trait|fn\s+\w+|def\s+\w+|const\s+k[A-Z])\b|\btype\s+\w+\s*=/;
|
|
84
|
+
const EXECUTABLE_DECLARATION_RE = /\b(function|class|struct|interface|enum|trait|fn\s+\w+|def\s+\w+|func\s+\w+)\b/;
|
|
85
|
+
const STOPWORDS = new Set([
|
|
86
|
+
'and', 'are', 'does', 'for', 'from', 'how', 'into', 'is', 'the', 'this',
|
|
87
|
+
'that', 'what', 'when', 'where', 'which', 'with', 'why',
|
|
88
|
+
]);
|
|
89
|
+
const LANG_KEYWORDS = new Set([
|
|
90
|
+
'class', 'const', 'def', 'enum', 'fn', 'function', 'impl', 'import',
|
|
91
|
+
'interface', 'let', 'package', 'pub', 'struct', 'trait', 'type', 'use',
|
|
92
|
+
]);
|
|
93
|
+
|
|
94
|
+
const ENTITY_KIND_KEYWORDS = {
|
|
95
|
+
enum: ['enum'],
|
|
96
|
+
struct: ['struct'],
|
|
97
|
+
interface: ['interface', 'trait'],
|
|
98
|
+
trait: ['trait'],
|
|
99
|
+
class: ['class'],
|
|
100
|
+
type: ['type', 'typeAlias', 'enum', 'struct', 'trait', 'class', 'interface'],
|
|
101
|
+
};
|
|
45
102
|
|
|
46
103
|
// Strong implementation-seeking signals. A query that fires one of these is
|
|
47
104
|
// confidently asking for source code; anything else is treated as `'unknown'`.
|
|
@@ -50,16 +107,20 @@ const TYPES_RE = /\.d\.ts$|(?:^|\/)types\//i;
|
|
|
50
107
|
const IMPL_INTENT_RE = new RegExp(
|
|
51
108
|
'\\b(' + [
|
|
52
109
|
// English wh-questions about location/behaviour
|
|
53
|
-
'where', 'how does', 'how do',
|
|
110
|
+
'where', 'what', 'how does', 'how do',
|
|
111
|
+
'when',
|
|
54
112
|
// Definition / implementation phrasing
|
|
55
113
|
'implements?', 'implementation', 'defines?', 'definition', 'declared?',
|
|
114
|
+
'decides?',
|
|
56
115
|
// Code-structure nouns
|
|
57
116
|
'function', 'functions', 'method', 'methods', 'class', 'classes',
|
|
58
117
|
'constructor', 'module', 'library', 'crate', 'package',
|
|
59
118
|
// Verbs that strongly signal a code unit
|
|
60
119
|
'dispatch(?:es|er)?', 'handles?', 'handler', 'handlers',
|
|
61
|
-
'
|
|
120
|
+
'bind(?:s|ing)?',
|
|
121
|
+
'parses?', 'parsed', 'parser', 'parsers',
|
|
62
122
|
'router?', 'routes?', 'routing',
|
|
123
|
+
'redirect(?:s|ed|ing)?',
|
|
63
124
|
'register(?:s|ed|ing)?',
|
|
64
125
|
'builds?', 'builder', 'builders',
|
|
65
126
|
'generat(?:es?|or|ors|ed|ing)',
|
|
@@ -98,18 +159,31 @@ const IMPL_INTENT_RE = new RegExp(
|
|
|
98
159
|
|
|
99
160
|
const DOCS_INTENT_RE = /\b(doc|docs|documentation|readme|guide|tutorial|reference|example)\b/i;
|
|
100
161
|
const TESTS_INTENT_RE = /\b(test|tests|spec|specs|fixture|fixtures|mock|mocks)\b/i;
|
|
101
|
-
const TYPES_INTENT_RE = /\b(
|
|
162
|
+
const TYPES_INTENT_RE = /\b(types|interface|declaration|signature|typings|typedef)\b|\btype\s+(?:alias|declaration|definition|interface|signature)\b/i;
|
|
163
|
+
const ANCILLARY_INTENT_RE = /\b(config|configuration|manifest|workflow|ci|github action|labeler|toml|lockfile|package\.json)\b/i;
|
|
102
164
|
|
|
103
165
|
/**
|
|
104
166
|
* Detect the file kind from a result path.
|
|
105
|
-
* @returns {'docs'|'tests'|'types'|'implementation'}
|
|
167
|
+
* @returns {'docs'|'examples'|'tests'|'types'|'ancillary'|'implementation'}
|
|
106
168
|
*/
|
|
107
|
-
export function detectFileKind(filePath) {
|
|
169
|
+
export function detectFileKind(filePath, opts) {
  // Anything that is not a non-empty string is treated as ordinary source.
  if (typeof filePath !== 'string' || filePath.length === 0) return 'implementation';

  // Optional per-call memo (a Map-like object supplied by the caller on
  // `opts._fileKindCache`): the verdict depends only on the path, so a
  // previously computed kind can be returned as-is.
  const memo = opts && opts._fileKindCache;
  if (memo && memo.has(filePath)) return memo.get(filePath);

  // First matching rule wins; the order below is significant
  // (docs → examples → tests → types → ancillary → implementation).
  let kind = 'implementation';
  if (DOCS_RE.test(filePath)) {
    kind = 'docs';
  } else if (EXAMPLES_RE.test(filePath)) {
    kind = 'examples';
  } else if (TESTS_RE.test(filePath) || isTestSupportFile(filePath)) {
    // isTestSupportFile is called without content here, so only its
    // path rules can fire.
    kind = 'tests';
  } else if (TYPES_RE.test(filePath)) {
    kind = 'types';
  } else if (ANCILLARY_RE.test(filePath)) {
    kind = 'ancillary';
  }

  if (memo) memo.set(filePath, kind);
  return kind;
}
|
|
114
188
|
|
|
115
189
|
/**
|
|
@@ -118,7 +192,7 @@ export function detectFileKind(filePath) {
|
|
|
118
192
|
* `'unknown'`, and the helper treats `'unknown'` as a no-op (just like the
|
|
119
193
|
* docs/tests/types intents).
|
|
120
194
|
*
|
|
121
|
-
* @returns {'docs'|'tests'|'types'|'implementation'|'unknown'}
|
|
195
|
+
* @returns {'docs'|'tests'|'types'|'ancillary'|'implementation'|'unknown'}
|
|
122
196
|
*/
|
|
123
197
|
export function classifyFileKindIntent(query) {
|
|
124
198
|
const q = (query || '').toLowerCase();
|
|
@@ -127,6 +201,7 @@ export function classifyFileKindIntent(query) {
|
|
|
127
201
|
if (TYPES_INTENT_RE.test(q)) return 'types';
|
|
128
202
|
if (DOCS_INTENT_RE.test(q)) return 'docs';
|
|
129
203
|
if (TESTS_INTENT_RE.test(q)) return 'tests';
|
|
204
|
+
if (ANCILLARY_INTENT_RE.test(q)) return 'ancillary';
|
|
130
205
|
if (IMPL_INTENT_RE.test(q)) return 'implementation';
|
|
131
206
|
return 'unknown';
|
|
132
207
|
}
|
|
@@ -141,6 +216,2083 @@ function resolveFilePath(r) {
|
|
|
141
216
|
|| '';
|
|
142
217
|
}
|
|
143
218
|
|
|
219
|
+
/**
 * Estimate how many lines a search result covers.
 *
 * Uses the result's start/end line numbers when both are finite numbers and
 * form a valid range; otherwise counts the lines of the first non-empty
 * inline text field (`text`, `content`, `code`, `snippet`).
 *
 * @param {object} r - Search result; line fields may live on the result or on `r.metadata`.
 * @returns {number} Line count, or `Infinity` when nothing usable is present.
 */
function inferLineCount(r) {
  const meta = r?.metadata || {};
  const isPresent = (value) => value !== null && value !== undefined;

  // First non-nullish candidate wins, in the same precedence for both ends.
  const firstLine = [r?.startLine, r?.start_line, meta.startLine, meta.start_line].find(isPresent);
  const lastLine = [r?.endLine, r?.end_line, meta.endLine, meta.end_line].find(isPresent);

  const hasValidRange = Number.isFinite(firstLine)
    && Number.isFinite(lastLine)
    && lastLine >= firstLine;
  if (hasValidRange) return lastLine - firstLine + 1;

  const body = r?.text || r?.content || r?.code || r?.snippet || '';
  if (typeof body !== 'string' || body.length === 0) return Infinity;
  return body.split(/\r?\n/).length;
}
|
|
234
|
+
|
|
235
|
+
/**
 * Read the source lines a result points at directly from disk.
 *
 * The returned span starts up to two lines before the result's start line
 * (clamped to line 1) and runs through its end line. When no end line is
 * present the start line is used as the end.
 *
 * @param {object} r - Search result carrying a file path and 1-based line numbers
 *   (on the result itself or on `r.metadata`).
 * @param {object} [opts]
 * @param {string} [opts.projectRoot] - Directory that result paths are resolved against.
 * @returns {string} The joined lines, or `''` when the project root, path or
 *   line range is missing/invalid, the path resolves outside the project
 *   root, or the file cannot be read.
 */
function readResultSpan(r, opts = {}) {
  if (!opts.projectRoot) return '';
  const file = resolveFilePath(r);
  if (!file) return '';
  const meta = r?.metadata || {};
  const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;
  if (!Number.isFinite(start) || !Number.isFinite(end) || end < start) return '';
  try {
    const root = path.resolve(opts.projectRoot);
    const abs = path.resolve(root, file);
    // Containment check via path.relative rather than a string prefix:
    // `root + path.sep` becomes '//' when the root is the filesystem root,
    // which made the prefix test reject every file under it.
    const rel = path.relative(root, abs);
    const outsideRoot = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
    if (outsideRoot) return '';
    const lines = readFileSync(abs, 'utf8').split('\n');
    const contextStart = Math.max(1, start - 2);
    return lines.slice(contextStart - 1, end).join('\n');
  } catch {
    // Best-effort read: a missing or unreadable file yields an empty span.
    return '';
  }
}
|
|
254
|
+
|
|
255
|
+
/**
 * Return the text of a result, preferring text already attached to it.
 *
 * Falls back to reading the span from disk via readResultSpan. When the
 * caller supplies `opts._resultTextCache` (a Map-like object), disk reads are
 * memoized under a `file|start|end` key so repeated lookups for the same
 * chunk within one call do not re-read the file.
 *
 * @param {object} r - Search result.
 * @param {object} [opts] - Forwarded to readResultSpan; may carry `_resultTextCache`.
 * @returns {*} The first truthy inline field (`content`, `text`, `code`,
 *   `snippet`), otherwise the string produced by readResultSpan.
 */
function resolveResultText(r, opts = {}) {
  const embedded = r?.content || r?.text || r?.code || r?.snippet;
  if (embedded) return embedded;

  const memo = opts._resultTextCache;
  if (!memo) return readResultSpan(r, opts);

  const file = resolveFilePath(r);
  const meta = r?.metadata || {};
  const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;

  // Without a file and a numeric start line there is no stable key; read uncached.
  if (!file || !Number.isFinite(start)) return readResultSpan(r, opts);

  const endForKey = Number.isFinite(end) ? end : start;
  const key = `${file}|${start}|${endForKey}`;
  if (memo.has(key)) return memo.get(key);

  const span = readResultSpan(r, opts);
  memo.set(key, span);
  return span;
}
|
|
280
|
+
|
|
281
|
+
/**
 * Pick the symbol name of a result: `metadata.name` when truthy, else
 * `name`, else the empty string.
 */
function resolveResultName(r) {
  const fromMetadata = r?.metadata?.name;
  if (fromMetadata) return fromMetadata;
  return r?.name || '';
}
|
|
284
|
+
|
|
285
|
+
/**
 * Pick the entity type of a result: `metadata.type` when truthy, else
 * `type`, else the empty string.
 */
function resolveResultType(r) {
  const fromMetadata = r?.metadata?.type;
  if (fromMetadata) return fromMetadata;
  return r?.type || '';
}
|
|
288
|
+
|
|
289
|
+
/**
 * Lower-case a type label; any falsy input becomes the empty string.
 */
function normalizeType(type) {
  const label = type ? String(type) : '';
  return label.toLowerCase();
}
|
|
292
|
+
|
|
293
|
+
/**
 * Report whether the named ablation is present.
 *
 * @param {Set|Array|*} ablations - A Set or an array of ablation names; any other value means "none".
 * @param {*} name - Ablation name to look for.
 * @returns {boolean}
 */
function hasAblation(ablations, name) {
  if (ablations instanceof Set) return ablations.has(name);
  if (Array.isArray(ablations)) return ablations.includes(name);
  return false;
}
|
|
296
|
+
|
|
297
|
+
// Removed (2026-05-05): the standalone tiny-ancillary-chunk floor became
|
|
298
|
+
// redundant once cAST sibling-merge was confirmed in tree-sitter-provider.js
|
|
299
|
+
// (recursiveChunk merges adjacent siblings up to MAX_CHUNK_SIZE so tiny
|
|
300
|
+
// chunks don't enter the index as standalone retrieval units), and the
|
|
301
|
+
// range-preservation invariant in applyResultDemotions stopped entity
|
|
302
|
+
// adoption from shrinking already-merged chunks. Kept the per-ancillary-file
|
|
303
|
+
// hard tiny factor (`tinyAncillaryFactor` in applyFileKindRanking) since
|
|
304
|
+
// that's a sub-rule of doc/test demotion, not a general size penalty.
|
|
305
|
+
|
|
306
|
+
/**
 * Decide whether a result chunk is test code.
 *
 * The actual decision is made by isTestChunkUncached; this wrapper only adds
 * optional memoization. When `opts._isTestChunkCache` (a Map-like object) is
 * supplied and the chunk has a file path and a numeric start line, the
 * boolean verdict is stored under a `file|start|end` key and reused.
 *
 * @param {object} r - Search result.
 * @param {object} [opts] - Options forwarded to isTestChunkUncached.
 * @returns {boolean}
 */
export function isTestChunk(r, opts = {}) {
  const filePath = resolveFilePath(r);
  const memo = opts._isTestChunkCache;

  let key = null;
  if (memo && filePath) {
    const meta = r?.metadata || {};
    const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
    if (Number.isFinite(start)) {
      const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;
      key = `${filePath}|${start}|${Number.isFinite(end) ? end : start}`;
      if (memo.has(key)) return memo.get(key);
    }
  }

  const verdict = isTestChunkUncached(r, opts, filePath);
  // `key` is only ever set when a cache exists, so `memo` is safe to use here.
  if (key !== null) memo.set(key, verdict);
  return verdict;
}
|
|
327
|
+
|
|
328
|
+
// Line-anchored markers of test code inside a chunk body:
//   #[cfg(test)] / #[test]      (Rust attributes)
//   func Test<X>                (Go testing)
//   def test_<X>                (Python unittest/pytest)
//   it/test/describe('...       (JS/TS suite frameworks)
// Kept as one alternation so the chunk text is tested with a single regex.
const TEST_CHUNK_BODY_RE = /^\s*(?:#\[(?:cfg\s*\(\s*test\s*\)|test)\]|func\s+Test[A-Z]|def\s+test_|(?:it|test|describe)\s*\(\s*['"])/m;
// Symbol names that look like tests: `test_*`, `Test<Upper>*`, or `*_test`.
const TEST_CHUNK_NAME_RE = /^(?:test_|Test[A-Z])|_test$/;

/**
 * Uncached test-chunk decision. Checks, in order:
 *   1. the file kind derived from the path is 'tests';
 *   2. the file is a test-support file (skipped when the
 *      'no-test-support-detection' ablation is active);
 *   3. the chunk text contains a test marker (TEST_CHUNK_BODY_RE);
 *   4. the chunk's symbol name looks like a test (TEST_CHUNK_NAME_RE).
 *
 * @param {object} r - Search result.
 * @param {object} opts - Options; may carry `ablations` and `_isTestSupportCache`.
 * @param {string} filePath - Path already resolved by the caller.
 * @returns {boolean}
 */
function isTestChunkUncached(r, opts, filePath) {
  if (detectFileKind(filePath, opts) === 'tests') return true;

  const supportDetectionEnabled = !hasAblation(opts.ablations, 'no-test-support-detection');
  if (supportDetectionEnabled) {
    // The support verdict is memoized per file path when a cache is provided,
    // so the content scan runs at most once per path for that cache.
    const perFile = opts._isTestSupportCache;
    let isSupport;
    if (perFile && perFile.has(filePath)) {
      isSupport = perFile.get(filePath);
    } else {
      // Content is passed as a thunk so nothing is read when path rules decide.
      const lazyContent = () => resolveFullFileText(r, opts) || resolveResultText(r, opts);
      isSupport = isTestSupportFile(filePath, lazyContent);
      if (perFile) perFile.set(filePath, isSupport);
    }
    if (isSupport) return true;
  }

  const chunkText = resolveResultText(r, opts);
  if (TEST_CHUNK_BODY_RE.test(chunkText)) return true;

  return TEST_CHUNK_NAME_RE.test(resolveResultName(r));
}
|
|
373
|
+
|
|
374
|
+
/**
 * Read the entire file a result belongs to.
 *
 * When `opts._fullFileTextCache` (a Map-like object) is supplied, the outcome
 * is memoized by the result's file path — including the empty-string outcome
 * for paths outside the project root and for unreadable files.
 *
 * @param {object} r - Search result carrying a file path.
 * @param {object} [opts]
 * @param {string} [opts.projectRoot] - Directory that result paths are resolved against.
 * @returns {string} File contents, or `''` when the project root or path is
 *   missing, the path resolves outside the project root, or the read fails.
 */
function resolveFullFileText(r, opts = {}) {
  if (!opts.projectRoot) return '';
  const file = resolveFilePath(r);
  if (!file) return '';

  const cache = opts._fullFileTextCache;
  if (cache && cache.has(file)) return cache.get(file);

  let text = '';
  try {
    const root = path.resolve(opts.projectRoot);
    const abs = path.resolve(root, file);
    // Containment check via path.relative rather than a string prefix:
    // `root + path.sep` becomes '//' when the root is the filesystem root,
    // which made the prefix test reject every file under it.
    const rel = path.relative(root, abs);
    const outsideRoot = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
    if (!outsideRoot) text = readFileSync(abs, 'utf8');
  } catch {
    // Best-effort read: a missing or unreadable file is treated as empty.
    text = '';
  }

  if (cache) cache.set(file, text);
  return text;
}
|
|
400
|
+
|
|
401
|
+
// Path shapes that mark a file as test support regardless of its content.
// Hoisted so the regexes are created once rather than on every call.
const TEST_SUPPORT_PATH_RULES = [
  /(^|\/)(testutil|test_util|test_utils|test_helper|test_helpers|testing_support|spec_helper)\.[a-z]+$/i,
  /(^|\/)(test|tests|spec|__tests__|__mocks__)\/[^/]*(util|helper|fixture|mock|stub|setup|harness)/i,
  /(^|\/)(testdata|fixtures|__fixtures__|test_data)\//i,
  /(^|\/)conftest\.py$/i,
  /\.test-d\.[tj]sx?$/i,
];

// Assertion-call detectors. A trailing `\b` is applied only to alternatives
// that end in a word character: after `!`, `(` or `)` a word boundary would
// require a word character to follow, so `assert_eq!(a, b)`, `expect('x')`
// and a bare `t.Helper()` would never match.
const TEST_SUPPORT_ASSERTION_RE = /\b(?:assert(?:_eq|_ne)?!|assertEquals?\b|t\.(?:Errorf|Fatalf)\b|t\.Helper\(\))/;
// Superset used when the file has JS test context: also counts `expect(`
// and `require.<x>` / `assert.<x>` member calls.
const TEST_SUPPORT_JS_ASSERTION_RE = /\b(?:assert(?:_eq|_ne)?!|expect\(|assertEquals?\b|t\.(?:Errorf|Fatalf)\b|t\.Helper\(\)|(?:require|assert)\.\w+)/;

/**
 * Decide whether a file is test-support code (helpers, fixtures, harnesses).
 *
 * Path rules are checked first. Only if none match is the content consulted:
 * a crate-level `#![cfg(test)]` attribute marks the file as test support
 * outright; otherwise the file must have at least 8 non-blank lines and more
 * than 30% of them must contain an assertion call.
 *
 * @param {string} filePath - Path of the file being classified.
 * @param {string|function(): string} [content=''] - File text, or a thunk
 *   returning it. The thunk is only invoked when the path rules do not
 *   already decide the answer.
 * @returns {boolean}
 */
export function isTestSupportFile(filePath, content = '') {
  if (!filePath) return false;
  if (TEST_SUPPORT_PATH_RULES.some((rule) => rule.test(filePath))) return true;

  const text = typeof content === 'function' ? content() : content;
  if (!text) return false;
  if (/^\s*#!\[cfg\s*\(\s*test\s*\)/m.test(text)) return true;

  const lines = text.split('\n').filter((line) => line.trim());
  if (lines.length < 8) return false;

  // `expect(` / `assert.x` / `require.x` only count as assertions when the
  // file sits in a test directory or has a describe/it/test call at line start.
  const hasJsTestContext = /(^|\/)(test|tests|spec|__tests__)\//i.test(filePath)
    || /^\s*(describe|it|test)\s*\(/m.test(text);
  const assertionRe = hasJsTestContext ? TEST_SUPPORT_JS_ASSERTION_RE : TEST_SUPPORT_ASSERTION_RE;

  const assertLines = lines.filter((line) => assertionRe.test(line)).length;
  return assertLines / lines.length > 0.30;
}
|
|
429
|
+
|
|
430
|
+
/**
 * Normalize query tokens into a Set.
 *
 * @param {string} query - Raw query; only used when `queryTokens` is neither a Set nor an array.
 * @param {Set|Array} [queryTokens] - A Set is returned untouched; an array is
 *   stringified and lower-cased element by element.
 * @returns {Set<string>} Otherwise the query is lower-cased, split on runs of
 *   underscores / non-word characters, and pieces shorter than 3 characters are dropped.
 */
function queryTokenSet(query, queryTokens) {
  if (queryTokens instanceof Set) return queryTokens;

  if (Array.isArray(queryTokens)) {
    const lowered = new Set();
    for (const token of queryTokens) lowered.add(String(token).toLowerCase());
    return lowered;
  }

  const words = String(query || '').toLowerCase().split(/[_\W]+/);
  return new Set(words.filter((word) => word.length >= 3));
}
|
|
435
|
+
|
|
436
|
+
/**
 * Fraction of a result's name tokens that also appear in the query tokens.
 *
 * The name is split at camelCase boundaries, letter→digit boundaries, and
 * runs of underscores / non-word characters; pieces shorter than 3 characters
 * are ignored.
 *
 * @param {object} r - Search result whose name is obtained via resolveResultName.
 * @param {Set<string>} queryTokens - Query tokens; compared against the
 *   lower-cased name tokens, so they are expected to be lower-case.
 * @returns {number} A ratio in [0, 1]; 0 when the name is empty or yields no usable tokens.
 */
export function testNameQueryOverlap(r, queryTokens) {
  const rawName = String(resolveResultName(r));
  if (!rawName) return 0;

  // Split on the original casing and lower-case afterwards: lower-casing
  // first would erase every camelCase boundary before the split could see it.
  const nameTokens = rawName
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    .replace(/([a-zA-Z])(\d)/g, '$1 $2')
    .toLowerCase()
    .split(/[_\W]+/)
    .filter((token) => token.length >= 3);
  if (nameTokens.length === 0) return 0;

  const hits = nameTokens.filter((token) => queryTokens.has(token)).length;
  return hits / nameTokens.length;
}
|
|
452
|
+
|
|
453
|
+
/**
 * Find which entity-kind bucket a query mentions.
 *
 * Buckets of ENTITY_KIND_KEYWORDS are scanned in declaration order and the
 * first bucket with a keyword appearing in the query as a whole word
 * (case-insensitive) wins.
 *
 * @param {string} query - Raw query text.
 * @returns {string|null} The bucket name, or `null` when no keyword matches.
 */
export function entityKindPreferenceFromQuery(query) {
  const haystack = String(query || '').toLowerCase();
  const isMentioned = (keyword) => new RegExp(`\\b${keyword.toLowerCase()}\\b`, 'i').test(haystack);

  const match = Object.entries(ENTITY_KIND_KEYWORDS)
    .find(([, keywords]) => keywords.some((keyword) => isMentioned(keyword)));
  return match ? match[0] : null;
}
|
|
462
|
+
|
|
463
|
+
/**
 * Collect identifier-like tokens from a query that could be symbol names.
 *
 * A token is kept when it is at least 3 characters long, is not an exact
 * (case-sensitive) LANG_KEYWORDS entry, is not a STOPWORDS entry when
 * lower-cased, and either contains an upper-case letter or is at least
 * 4 characters long.
 *
 * @param {string} query - Raw query text.
 * @returns {Set<string>} Hints in their original casing, in order of first appearance.
 */
export function extractNameHints(query) {
  const candidates = String(query || '').match(/[A-Za-z_][A-Za-z0-9_]+/g) || [];

  const isHint = (token) => {
    if (token.length < 3) return false;
    if (LANG_KEYWORDS.has(token)) return false;
    if (STOPWORDS.has(token.toLowerCase())) return false;
    return /[A-Z]/.test(token) || token.length >= 4;
  };

  return new Set(candidates.filter(isHint));
}
|
|
474
|
+
|
|
475
|
+
/**
 * Break an identifier into lower-case words.
 *
 * A boundary is inserted wherever a lower-case letter or digit is followed by
 * an upper-case letter; the result is then split on runs of underscores /
 * non-word characters and empty pieces are discarded.
 *
 * @param {*} name - Identifier; falsy values yield an empty list.
 * @returns {string[]}
 */
function splitIdentifierName(name) {
  const spaced = String(name || '').replace(/([a-z0-9])([A-Z])/g, '$1 $2');
  const words = [];
  for (const piece of spaced.split(/[_\W]+/)) {
    if (piece) words.push(piece.toLowerCase());
  }
  return words;
}
|
|
482
|
+
|
|
483
|
+
/**
 * Resolve the entity (or at least the entity kind) a result belongs to.
 *
 * Lookup order:
 *   1. `opts.codeGraphRepo.findEnclosingEntity` for the result span, then for
 *      the start line alone;
 *   2. `findFirstEntityInRange` (when the repo provides it and the end line
 *      is finite);
 *   3. text-based inference via inferEntityKindFromText, yielding `{ type }`.
 *
 * @param {object} r - Search result; line span is read from the result or its metadata.
 * @param {object} [opts] - May carry `codeGraphRepo` and `_entityKindCache` (a Map).
 * @returns {object|null} Entity-like object with a `type`, or null.
 */
function resolveEntityKindInfo(r, opts = {}) {
  const file = resolveFilePath(r);
  const meta = r?.metadata || {};
  const start = r?.startLine ?? r?.start_line ?? meta.startLine ?? meta.start_line;
  const end = r?.endLine ?? r?.end_line ?? meta.endLine ?? meta.end_line ?? start;
  const hasLocation = Boolean(file) && Number.isFinite(start);
  const endIsFinite = Number.isFinite(end);
  const spanEnd = endIsFinite ? end : start;

  // Intra-call memoization: several multipliers call this 4-7x per result,
  // which (per the original note) meant 400-1400 SQLite round-trips for
  // ~100 results. Entries are keyed by (file, start, end).
  const cache = opts._entityKindCache;
  const cacheKey = cache && hasLocation ? `${file}|${start}|${spanEnd}` : null;
  if (cacheKey !== null && cache.has(cacheKey)) return cache.get(cacheKey);

  let resolved = null;
  const repo = opts.codeGraphRepo;
  if (repo && hasLocation) {
    try {
      const enclosing = repo.findEnclosingEntity(file, start, spanEnd)
        || repo.findEnclosingEntity(file, start, start);
      if (enclosing?.type) {
        resolved = enclosing;
      } else if (typeof repo.findFirstEntityInRange === 'function' && endIsFinite) {
        const firstInRange = repo.findFirstEntityInRange(file, start, end);
        if (firstInRange?.type) resolved = firstInRange;
      }
    } catch {
      // Best effort: a failing graph lookup falls through to text inference.
    }
  }

  if (!resolved) {
    const inferredKind = inferEntityKindFromText(resolveResultText(r, opts));
    resolved = inferredKind ? { type: inferredKind } : null;
  }

  // Null results are cached too, so misses are not recomputed.
  if (cacheKey !== null) cache.set(cacheKey, resolved);
  return resolved;
}
|
|
521
|
+
|
|
522
|
+
// Boost magnitudes are env-tunable so we can ablate without re-deploying.
|
|
523
|
+
// Defaults softened (2026-05-05) from (1.25, 0.85, 1.20, 1.05) to
|
|
524
|
+
// (1.10, 0.90, 1.10, 1.03) after a 16-query 3-config ablation showed
|
|
525
|
+
// 15 of 16 top-1 results unchanged at the lower magnitudes — less
|
|
526
|
+
// leverage = less interaction risk with name-precision and other
|
|
527
|
+
// signals, with no observed quality loss. The stronger old values
|
|
528
|
+
// remain reachable via env vars if a future probe shows they help.
|
|
529
|
+
/**
 * Read a strictly positive, finite number from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} dflt - Value returned when the variable is unset, empty,
 *   non-numeric, non-finite, or not greater than zero.
 * @returns {number} Parsed value or `dflt`.
 */
function envFloat(name, dflt) {
  const raw = process.env[name];
  if (raw === undefined || raw === null || raw === '') return dflt;
  const parsed = Number(raw);
  if (!Number.isFinite(parsed) || parsed <= 0) return dflt;
  return parsed;
}
|
|
535
|
+
|
|
536
|
+
/**
 * Score multiplier rewarding results whose entity kind matches the kind the
 * query asked for, and mildly demoting callable kinds when it asked for
 * something other than a function.
 *
 * @param {object} r - Search result.
 * @param {string|null} preferred - Bucket key from ENTITY_KIND_KEYWORDS, or falsy for "no preference".
 * @param {object} [opts] - Passed through to resolveEntityKindInfo.
 * @returns {number} SWEET_SEARCH_KIND_BOOST (default 1.10), SWEET_SEARCH_KIND_DEMOTE (default 0.90), or 1.
 */
function entityKindMultiplier(r, preferred, opts = {}) {
  if (!preferred) return 1;
  const boostFactor = envFloat('SWEET_SEARCH_KIND_BOOST', 1.10);
  const demoteFactor = envFloat('SWEET_SEARCH_KIND_DEMOTE', 0.90);
  const wanted = new Set((ENTITY_KIND_KEYWORDS[preferred] || []).map(normalizeType));
  const inferredType = resolveEntityKindInfo(r, opts)?.type || '';
  const recordedType = normalizeType(resolveResultType(r));

  // A recorded type of 'code' / 'chunk' (or none) carries no kind
  // information, so the graph/text-inferred type is used instead.
  const recordedIsGeneric = !recordedType || recordedType === 'code' || recordedType === 'chunk';
  const type = recordedIsGeneric ? normalizeType(inferredType) : recordedType;

  const matchesPreference = wanted.has(type) || (type === 'typealias' && preferred === 'type');
  if (matchesPreference) return boostFactor;

  const isCallableKind = type === 'impl' || type === 'method' || type === 'function';
  return isCallableKind && preferred !== 'function' ? demoteFactor : 1;
}
|
|
548
|
+
|
|
549
|
+
/**
 * Score multiplier for results of the preferred kind whose name matches a
 * query name hint, either exactly or on one of its identifier parts.
 *
 * @param {object} r - Search result.
 * @param {string|null} preferred - Bucket key from ENTITY_KIND_KEYWORDS.
 * @param {Set<string>} nameHintsLower - Lower-cased name hints from the query.
 * @param {object} [opts] - Passed through to resolveEntityKindInfo.
 * @returns {number} SWEET_SEARCH_NAME_EXACT_BOOST (default 1.10) on an exact
 *   name match, SWEET_SEARCH_NAME_SUBSTR_BOOST (default 1.03) when a hint
 *   equals one identifier part, otherwise 1.
 */
function namePrecisionMultiplier(r, preferred, nameHintsLower, opts = {}) {
  if (!preferred || nameHintsLower.size === 0) return 1;
  const exactBoost = envFloat('SWEET_SEARCH_NAME_EXACT_BOOST', 1.10);
  const substrBoost = envFloat('SWEET_SEARCH_NAME_SUBSTR_BOOST', 1.03);
  const wanted = new Set((ENTITY_KIND_KEYWORDS[preferred] || []).map(normalizeType));
  const entityInfo = resolveEntityKindInfo(r, opts);
  const recordedType = normalizeType(resolveResultType(r));

  // Generic recorded types ('code' / 'chunk' / none) defer to the resolved entity.
  let type = recordedType;
  if (!recordedType || recordedType === 'code' || recordedType === 'chunk') {
    type = normalizeType(entityInfo?.type);
  }
  const kindMatches = wanted.has(type) || (type === 'typealias' && preferred === 'type');
  if (!kindMatches) return 1;

  const name = resolveResultName(r) || entityInfo?.name || '';
  if (!name) return 1;
  if (nameHintsLower.has(name.toLowerCase())) return exactBoost;

  // Part-level match: some identifier part equals some hint.
  const sharesPart = splitIdentifierName(name).some(part => nameHintsLower.has(part));
  return sharesPart ? substrBoost : 1;
}
|
|
570
|
+
|
|
571
|
+
/**
 * Look up a code-graph entity of the preferred kind that is named exactly
 * like one of the query's name hints and lives in the same file as `r`.
 *
 * Uses `findEntitiesByNamesCaseInsensitive` when the repo offers it,
 * otherwise `findEntitiesByNames` with the original-case hints. Any lookup
 * failure is treated as "no match".
 *
 * @param {object} r - Search result (only its file path is used).
 * @param {string|null} preferred - Bucket key from ENTITY_KIND_KEYWORDS.
 * @param {Iterable<string>} nameHints - Name hints in original casing.
 * @param {Set<string>} nameHintsLower - The same hints, lower-cased.
 * @param {object} [opts] - Must carry `codeGraphRepo` for a lookup to happen.
 * @returns {object|null} The first matching entity, or null.
 */
function exactNamedEntityForResult(r, preferred, nameHints, nameHintsLower, opts = {}) {
  const repo = opts.codeGraphRepo;
  if (!repo || !preferred || nameHintsLower.size === 0) return null;
  const file = resolveFilePath(r);
  if (!file) return null;
  const lookupOptions = { types: ENTITY_KIND_KEYWORDS[preferred] || [], limit: 16 };
  try {
    const supportsCaseInsensitive = typeof repo.findEntitiesByNamesCaseInsensitive === 'function';
    const found = supportsCaseInsensitive
      ? repo.findEntitiesByNamesCaseInsensitive([...nameHintsLower], lookupOptions)
      : repo.findEntitiesByNames([...nameHints], lookupOptions);
    const match = (found || []).find((entity) => {
      const entityFile = entity.filePath || entity.file;
      return entityFile === file && nameHintsLower.has(String(entity.name || '').toLowerCase());
    });
    return match || null;
  } catch {
    // Best effort: a failing repo lookup simply means no exact-name entity.
    return null;
  }
}
|
|
594
|
+
|
|
595
|
+
/**
 * Guess an entity kind from a chunk's source text by looking for a
 * declaration keyword at the start of any line.
 *
 * Patterns are tried in the listed order and the first hit wins, so a chunk
 * containing both an `enum` and a `struct` reports 'enum'.
 *
 * @param {string} text - Chunk source text.
 * @returns {string} One of 'enum', 'struct', 'trait', 'impl', 'class',
 *   'interface', 'typealias', or '' when nothing matches / text is empty.
 */
function inferEntityKindFromText(text) {
  if (!text) return '';
  const declarationPatterns = [
    ['enum', /^\s*(?:pub(?:\([^)]*\))?\s+)?enum\s+\w+/m],
    ['struct', /^\s*(?:pub(?:\([^)]*\))?\s+)?struct\s+\w+/m],
    ['trait', /^\s*(?:pub(?:\([^)]*\))?\s+)?trait\s+\w+/m],
    ['impl', /^\s*impl(?:\s*<[^>]+>)?\s+\w+/m],
    ['class', /^\s*(?:export\s+)?(?:abstract\s+)?class\s+\w+/m],
    ['interface', /^\s*(?:export\s+)?interface\s+\w+/m],
    ['typealias', /^\s*(?:export\s+)?type\s+\w+\s*=/m],
  ];
  const hit = declarationPatterns.find(([, pattern]) => pattern.test(text));
  return hit ? hit[0] : '';
}
|
|
606
|
+
|
|
607
|
+
// Declarative / doc-string-heavy chunk demotion (added 2026-05-05).
|
|
608
|
+
//
|
|
609
|
+
// Three narrow, independent content-shape triggers — each catches a specific
|
|
610
|
+
// failure shape observed in the May-05 novel-probe analysis:
|
|
611
|
+
//
|
|
612
|
+
// T1. Declarative-entity demotion. When the chunk's primary entity type is
|
|
613
|
+
// `namespace`, `interface`, or `typeAlias`, the chunk is by definition
|
|
614
|
+
// a declaration block — signatures / property decls without behaviour.
|
|
615
|
+
// Such chunks should not outrank `function`/`impl` chunks for
|
|
616
|
+
// procedural queries. Catches the .d.ts namespace / interface case.
|
|
617
|
+
//
|
|
618
|
+
// T2. Raw-string-dominant impl. When > 50 % of an `impl` chunk's non-blank
|
|
619
|
+
// characters live inside Rust raw-string literals (`r#"..."#`,
|
|
620
|
+
// `r"..."`), the chunk is mostly documentation. Catches clap-style
|
|
621
|
+
// flag impls whose `doc_long()` returns a 30-line description (e.g.
|
|
622
|
+
// `impl Flag for SearchZip`).
|
|
623
|
+
//
|
|
624
|
+
// T3. Stub-impl. Multiple `fn` definitions in an `impl` chunk with avg
|
|
625
|
+
// body line count < 4. Catches clap-style impls whose individual
|
|
626
|
+
// `doc_long()` is small enough to escape T2 but whose methods are
|
|
627
|
+
// still mostly 1-line literal returns (e.g. `impl Flag for
|
|
628
|
+
// CaseSensitive`).
|
|
629
|
+
//
|
|
630
|
+
// All three triggers are intent-gated to `implementation` queries, so a
|
|
631
|
+
// phrasing like "what is the FastifyInstance interface" — which legitimately
|
|
632
|
+
// wants a declaration — is unaffected. T2/T3 are also restricted to chunks
|
|
633
|
+
// whose primary entity type is `impl` to avoid touching anything outside
|
|
634
|
+
// the Rust idiom we're targeting.
|
|
635
|
+
//
|
|
636
|
+
// Defaults are conservative. An earlier "execution density" heuristic
|
|
637
|
+
// (penalise any chunk with low control-flow ratio) over-fired on data-
|
|
638
|
+
// declaration chunks like `lib/errors.js` constant tables, which are the
|
|
639
|
+
// genuinely-correct answer for "how does Fastify handle errors". The
|
|
640
|
+
// triggers here are shape-specific instead of density-specific.
|
|
641
|
+
//
|
|
642
|
+
// Disable everything with `ablations: 'no-body-density'` or
|
|
643
|
+
// SWEET_SEARCH_BODY_DENSITY=0; per-trigger overrides via
|
|
644
|
+
// SWEET_SEARCH_DECLARATIVE_FACTOR / SWEET_SEARCH_RAWSTRING_FACTOR /
|
|
645
|
+
// SWEET_SEARCH_STUB_FACTOR.
|
|
646
|
+
// Entity types treated as declaration-only by trigger T1 in the comment
// above. Entries use the all-lowercase spelling ('typealias', not
// 'typeAlias'), the same form the kind checks elsewhere in this file compare
// against.
const DECLARATIVE_ENTITY_TYPES = new Set(['namespace', 'interface', 'typealias']);
|
|
647
|
+
|
|
648
|
+
/**
 * Read a factor in the closed range [0, 1] from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} dflt - Value returned when the variable is unset, empty,
 *   non-numeric, non-finite, or outside [0, 1].
 * @returns {number} Parsed value or `dflt`.
 */
function envFloatRange(name, dflt) {
  const raw = process.env[name];
  if (raw === undefined || raw === null || raw === '') return dflt;
  const parsed = Number(raw);
  const inUnitInterval = Number.isFinite(parsed) && parsed >= 0 && parsed <= 1;
  return inUnitInterval ? parsed : dflt;
}
|
|
654
|
+
|
|
655
|
+
/**
 * Estimate how "stub-like" a Rust `impl` chunk is from the size of its fn
 * bodies. Catches two patterns:
 *
 * (A) MULTI-METHOD stubs (original ac280d4 case): clap-style flag-arg impls
 *     where every method is a 1-line literal return (e.g. `impl Flag for
 *     CaseSensitive` whose 6 methods total ~6 body lines), independent of
 *     whether `doc_long` carries a big raw-string description.
 *
 * (B) SINGLE-METHOD trivial-body stubs (added 2026-05-07 — FreshStack uv
 *     UV-FLOW-8 diagnosis): derive-equivalent impls like
 *     `impl Clone for X { fn clone(&self) -> Self { Self {...} } }`. The
 *     original rule required ≥2 fns and missed these. A single fn only
 *     counts as a stub when its body has at most 1.5 (i.e. ≤ 1) non-blank
 *     lines; anything longer reports `Infinity` so real single-method impls
 *     (Display::fmt is usually 3+ lines) are never flagged.
 *
 * Body extraction walks braces from the first `{` after each fn signature,
 * skipping string and char literals. A `'` only opens a literal when it
 * really starts a char literal (`'x'`, `'\n'`); Rust lifetimes and loop
 * labels (`&'static str`, `'outer:`) are ignored, otherwise they would
 * swallow the rest of the chunk and silently drop that fn from the average.
 *
 * @param {string} text - Chunk source text.
 * @returns {number} Average non-blank body lines per counted fn (for a
 *   single fn: its line count when ≤ 1.5, else `Infinity`), or `Infinity`
 *   when no fn body could be measured. Lower = more stub-like.
 */
export function avgFnBodyLines(text) {
  if (typeof text !== 'string' || text.length === 0) return Infinity;
  const fnRe = /^\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+|const\s+|unsafe\s+)*fn\s+\w+/gm;
  let totalBodyLines = 0;
  let counted = 0;
  for (const match of text.matchAll(fnRe)) {
    // Find the opening `{` after this fn signature.
    const openIdx = text.indexOf('{', match.index);
    if (openIdx === -1) continue;
    const closeIdx = findRustBlockEnd(text, openIdx);
    if (closeIdx === -1) continue; // unbalanced (chunk cut mid-body) — skip
    const body = text.slice(openIdx + 1, closeIdx);
    totalBodyLines += body.split('\n').filter(l => l.trim().length > 0).length;
    counted++;
  }
  if (counted === 0) return Infinity;
  // Single-fn impls with ≤1.5 substantive body lines are derive-equivalent
  // stubs (UV-FLOW-8 case); longer single fns are reported as non-stubs.
  // Multi-fn impls keep the original average-body rule.
  if (counted === 1) {
    return totalBodyLines <= 1.5 ? totalBodyLines : Infinity;
  }
  return totalBodyLines / counted;
}

// Index of the `}` matching the `{` at `openIdx`, or -1 when unbalanced.
function findRustBlockEnd(text, openIdx) {
  let depth = 1;
  let quote = null;
  for (let j = openIdx + 1; j < text.length; j++) {
    const ch = text[j];
    if (quote !== null) {
      if (ch === '\\') j++; // skip the escaped character
      else if (ch === quote) quote = null;
      continue;
    }
    if (ch === '"' || (ch === "'" && isRustCharLiteralStart(text, j))) {
      quote = ch;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return j;
    }
  }
  return -1;
}

// True when the `'` at `quoteIdx` opens a char literal rather than a lifetime/label.
function isRustCharLiteralStart(text, quoteIdx) {
  const next = text[quoteIdx + 1];
  if (next === undefined) return false;
  if (next === '\\') return true; // escaped char literal such as '\n' or '\''
  if (text[quoteIdx + 2] === "'") return true; // single-unit literal such as 'x'
  // A code point outside the BMP occupies two UTF-16 units.
  const code = next.charCodeAt(0);
  return code >= 0xd800 && code <= 0xdbff && text[quoteIdx + 3] === "'";
}
|
|
722
|
+
|
|
723
|
+
/**
 * Estimate the fraction of a chunk's non-whitespace characters that live
 * inside Rust raw-string literals. Returns a number in [0, 1].
 *
 * Heuristic: scan the text once tracking entry into `r"` / `r#"` regions
 * (any number of `#`) and exit at the matching `"` / `"#`. Only the inner
 * payload characters count as "inside"; the `#`/`"` delimiters are counted
 * in neither the numerator nor the denominator. An unterminated raw string
 * counts the rest of the text as payload.
 *
 * The `r` must begin a token: it may not be preceded by an identifier
 * character, except for the `b` / `c` of a `br"…"` / `cr"…"` prefix. Without
 * that guard the closing quote of ordinary literals such as `"error"` or
 * `"user"` was read as a raw-string opener and everything up to the next
 * `"` was counted as raw-string payload.
 *
 * Known limitation: ordinary string literals are not tracked, so a literal
 * whose last word is a lone `r` (e.g. `"press r"`) can still be misread.
 *
 * @param {string} text - Chunk source text.
 * @returns {number} inside / total over non-whitespace characters; 0 for
 *   empty or non-string input.
 */
export function rawStringDensity(text) {
  if (typeof text !== 'string' || text.length === 0) return 0;
  let i = 0;
  let inside = 0;
  let total = 0;
  const len = text.length;
  while (i < len) {
    if (!/\s/.test(text[i])) total++;
    // Detect `r#*"` opener at a token boundary.
    const maybeOpener = text[i] === 'r' && (text[i + 1] === '"' || text[i + 1] === '#');
    if (maybeOpener && isRawStringPrefixAt(text, i)) {
      let j = i + 1;
      let hashCount = 0;
      while (text[j] === '#') { hashCount++; j++; }
      if (text[j] === '"') {
        // We're inside a raw string. Find the matching close.
        const closeNeedle = '"' + '#'.repeat(hashCount);
        const closeAt = text.indexOf(closeNeedle, j + 1);
        // Unterminated — count the rest of the text as inside.
        const payloadEnd = closeAt === -1 ? len : closeAt;
        for (let k = j + 1; k < payloadEnd; k++) {
          if (!/\s/.test(text[k])) { inside++; total++; }
        }
        if (closeAt === -1) return total === 0 ? 0 : inside / total;
        i = closeAt + closeNeedle.length;
        continue;
      }
      // `r#ident` (raw identifier): not a string, fall through.
    }
    i++;
  }
  return total === 0 ? 0 : inside / total;
}

// True when the `r` at `rIdx` can start a raw-string token (`r`, `br`, `cr`).
function isRawStringPrefixAt(text, rIdx) {
  const isWordChar = (ch) => ch !== undefined && /\w/.test(ch);
  const prev = text[rIdx - 1];
  if (!isWordChar(prev)) return true;
  return (prev === 'b' || prev === 'c') && !isWordChar(text[rIdx - 2]);
}
|
|
765
|
+
|
|
766
|
+
/**
|
|
767
|
+
* Mega-chunk size penalty (added 2026-05-07 — 60-probe diagnosis).
|
|
768
|
+
*
|
|
769
|
+
* Long candidate chunks (entire 1500-line classes, 700-line module
|
|
770
|
+
* functions) systematically outscore precise 30-line chunks even when the
|
|
771
|
+
* latter contain the actual answer. The dense bi-encoder doesn't penalise
|
|
772
|
+
* length the way BM25's `b` parameter does, so a mega-chunk that touches
|
|
773
|
+
* many topics earns a moderate similarity to many queries.
|
|
774
|
+
*
|
|
775
|
+
* SOTA precedent: BM25 length normalization (Robertson & Zaragoza 2009),
|
|
776
|
+
* subsequently incorporated as length penalties in dense rerankers
|
|
777
|
+
* (ColBERTv2 token-budget caps, MS-MARCO-tuned cross-encoders). Soft
|
|
778
|
+
* piecewise-linear here rather than `1/(1 + b·L/L_avg)` because (a) we
|
|
779
|
+
* lack a per-corpus L_avg estimate at query time and (b) BM25-style
|
|
780
|
+
* normalization is too aggressive for long behavioural-flow chunks where
|
|
781
|
+
* length carries some signal.
|
|
782
|
+
*
|
|
783
|
+
* Tuning floor/slope to be PERMISSIVE — ONLY truly mega chunks lose score:
|
|
784
|
+
* - L ≤ 500 lines → factor 1.0 (no penalty — every reasonable function chunk)
|
|
785
|
+
* - L = 800 → ~0.91 (typical large class)
|
|
786
|
+
* - L = 1000 → ~0.85
|
|
787
|
+
* - L ≥ 1500 → 0.80 (floor — entire-file chunks)
|
|
788
|
+
*
|
|
789
|
+
* Tightened cutoff from 200 → 500 after S6-Q6 gin regression: a 40-line
|
|
790
|
+
* `New` function had been the right top-1, but penalising 200+ chunks
|
|
791
|
+
* shifted the within-file ranking. 500-line cutoff exempts every legit
|
|
792
|
+
* function/method chunk and only demotes whole-class megachunks.
|
|
793
|
+
*
|
|
794
|
+
* Override via env: SWEET_SEARCH_MEGA_CHUNK_CUTOFF (default 500),
|
|
795
|
+
* SWEET_SEARCH_MEGA_CHUNK_SLOPE (default 0.0003 per-line),
|
|
796
|
+
* SWEET_SEARCH_MEGA_CHUNK_FLOOR (default 0.80). Disable via
|
|
797
|
+
* SWEET_SEARCH_MEGA_CHUNK_FLOOR=1 (no-op) or
|
|
798
|
+
* `ablations: ['no-mega-chunk-penalty']`.
|
|
799
|
+
*
|
|
800
|
+
* Diagnosed cases (60-probe new-set):
|
|
801
|
+
* - S5-Q10 flask: 1516-line `class Flask` chunk beat 30-line `abort` fn
|
|
802
|
+
* - S4-Q2 fastify: 735-line `function fastify` chunk beat 1-line
|
|
803
|
+
* `kRouteContext` symbol declaration
|
|
804
|
+
*/
|
|
805
|
+
/**
|
|
806
|
+
* Symbol-exact-match boost for definition-style queries.
|
|
807
|
+
*
|
|
808
|
+
* Added 2026-05-07 — both diagnoses (FreshStack uv #1, 60-probe new-set #1)
|
|
809
|
+
* converged on this as the highest-impact fix. When a query has the shape
|
|
810
|
+
* "show me X struct/enum/class/function/...", chunks where the symbol name
|
|
811
|
+
* EQUALS X (case-insensitive, after stemming s/es/ing suffixes) should
|
|
812
|
+
* dominate the lexical-collision sibling chunk that the encoder happens
|
|
813
|
+
* to score nearby.
|
|
814
|
+
*
|
|
815
|
+
* Diagnosed cases (combined): Cache vs CacheArgs (UV-DEF-1), Resolver vs
|
|
816
|
+
* Resolution (UV-DEF-4), ContentTypeParser vs ContentType (S6-Q2),
|
|
817
|
+
* Flask vs App (S6-Q9), buildErrorHandler vs setErrorHeaders (S6-Q3),
|
|
818
|
+
* Set method vs Value method (S3-Q6), get_send_file_max_age vs
|
|
819
|
+
* send_static_file (S3-Q9). 8+ failures in the new-probe set, 4 in
|
|
820
|
+
* FreshStack — strong evidence of a real systematic gap.
|
|
821
|
+
*
|
|
822
|
+
* SOTA precedent: BM25F field-weighted boosting on the symbol field
|
|
823
|
+
* (canonical IR move when one field carries decisive signal); ColBERTv2
|
|
824
|
+
* "expansion-aware reranking" with identifier prior; Sourcegraph Cody's
|
|
825
|
+
* "hint" tokens that bias toward exact symbol matches in graph-aware
|
|
826
|
+
* retrieval (Cody arXiv 2408.05344).
|
|
827
|
+
*
|
|
828
|
+
* Trigger pattern (conservative — only fires on UNAMBIGUOUS definition
|
|
829
|
+
* queries):
|
|
830
|
+
* /\b(show|give|find|describe|display|fetch).+?(?:the\s+)?(\w+)\s+
|
|
831
|
+
* (struct|enum|class|fn|function|method|trait|type|interface|impl|
|
|
832
|
+
* definition|signature|prototype|constructor)\b/i
|
|
833
|
+
*
|
|
834
|
+
* Plus a "WHAT IS X TYPE" alternate trigger:
|
|
835
|
+
* /\bwhat\s+(?:is|does)\s+(?:the\s+)?(\w+)\s+
|
|
836
|
+
* (struct|enum|class|function|method|type)\b/i
|
|
837
|
+
*
|
|
838
|
+
* Boost: 1.30× when chunk.symbol case-insensitive-equals the captured
|
|
839
|
+
* identifier. Capped at 1.30 (mild — definition queries account for ≤25%
|
|
840
|
+
* of probe traffic so a stronger boost risks breaking non-DEF queries).
|
|
841
|
+
*
|
|
842
|
+
* Override env: SWEET_SEARCH_SYMBOL_EXACT_BOOST (default 1.30, set to 1.0
|
|
843
|
+
* to disable). `ablations: ['no-symbol-exact-boost']` also disables.
|
|
844
|
+
*/
|
|
845
|
+
// Lazy quantifier on the prefix so the capture greedily prefers an
|
|
846
|
+
// identifier-like noun (buildErrorHandler) over a keyword that happens
|
|
847
|
+
// to also be in the trailing list (function/definition). Verified
|
|
848
|
+
// 2026-05-07: greedy version captured "function" for
|
|
849
|
+
// "show me the buildErrorHandler function definition in full",
|
|
850
|
+
// missing the contained-entity boost on S6-Q3. But lazy also fails on
|
|
851
|
+
// "show me the full Engine struct" (captures "the"/"full") — which is
|
|
852
|
+
// why extractSymbolDefinitionTarget tries lazy first and falls back to
|
|
853
|
+
// greedy when the lazy capture is a stopword.
|
|
854
|
+
// Verb-anchored definition query, LAZY filler: the up-to-five filler words
// after the verb are matched as few as possible before the capture.
const SYMBOL_DEFN_QUERY_RE = new RegExp(
  [
    String.raw`\b(?:show|give|find|describe|display|fetch|see)`,
    String.raw`(?:\s+\w+){0,5}?\s+`,
    String.raw`(?:the\s+)?`,
    String.raw`(\w+)`,
    String.raw`(?:\s+\w+)?\s+`,
    String.raw`(?:struct|enum|class|fn|function|method|trait|type|interface|impl|`,
    String.raw`definition|signature|prototype|constructor)\b`,
  ].join(''),
  'i'
);
// Same pattern with a GREEDY filler (as many filler words as possible).
const SYMBOL_DEFN_QUERY_RE_GREEDY = new RegExp(
  [
    String.raw`\b(?:show|give|find|describe|display|fetch|see)`,
    String.raw`(?:\s+\w+){0,5}\s+`,
    String.raw`(?:the\s+)?`,
    String.raw`(\w+)`,
    String.raw`(?:\s+\w+)?\s+`,
    String.raw`(?:struct|enum|class|fn|function|method|trait|type|interface|impl|`,
    String.raw`definition|signature|prototype|constructor)\b`,
  ].join(''),
  'i'
);
// "what is/does/are [the] X <kind>" — the kind list here also accepts
// role nouns (renderer, handler, component, ...).
const SYMBOL_WHATIS_QUERY_RE = new RegExp(
  [
    String.raw`\bwhat\s+(?:is|does|are)\s+(?:the\s+)?`,
    String.raw`(\w+)\s+`,
    String.raw`(?:struct|enum|class|function|method|type|trait|interface|`,
    String.raw`renderer|handler|component|service|module|controller|provider|builder)\b`,
  ].join(''),
  'i'
);
// "where is the X function/method/struct" pattern — captures probe-style queries
// like S3-Q4 "where is the Default function..." and S3-Q6 "where is the Set
// method on Context...". Added 2026-05-07 after F7 trace showed extractSymbolDefinitionTarget
// returned null for these queries, missing the contained-entity boost.
const SYMBOL_WHERE_QUERY_RE = new RegExp(
  [
    String.raw`\bwhere\s+(?:is|does)\s+(?:the\s+)?`,
    String.raw`(\w+)\s+`,
    String.raw`(?:struct|enum|class|fn|function|method|trait|type|interface|impl|`,
    String.raw`definition|signature|prototype|constructor)\b`,
  ].join(''),
  'i'
);
|
|
892
|
+
|
|
893
|
+
// Identifier-shape heuristic: code identifiers across all languages
|
|
894
|
+
// commonly use one of: uppercase letters (PascalCase / camelCase),
|
|
895
|
+
// underscores (snake_case), hyphens (kebab-case), or digits. Plain
|
|
896
|
+
// English adjectives / determiners ("the", "complete", "every") fall
|
|
897
|
+
// outside this shape. This is more principled than a curated stopword
|
|
898
|
+
// list — it generalizes to non-English languages, avoids removing
|
|
899
|
+
// real lowercase identifiers like Rust `lock` / Python `commit` (which
|
|
900
|
+
// stay as final-fallback when no identifier-shape candidate exists),
|
|
901
|
+
// and doesn't require maintaining a word list. Long-term, swap for a
|
|
902
|
+
// small POS classifier if false-positive identifier captures appear.
|
|
903
|
+
function looksLikeIdentifier(name) {
  // Needs 3+ characters and at least one uppercase letter, underscore,
  // hyphen, or digit; plain lowercase words fail the shape test.
  return Boolean(name) && name.length >= 3 && /[A-Z_\-0-9]/.test(name);
}
|
|
907
|
+
|
|
908
|
+
/**
 * Extract the symbol a definition-style query is asking for.
 *
 * Every query pattern (lazy, greedy, what-is, where-is) is tried and its
 * first capture group collected when it is at least 3 characters long. The
 * first identifier-shaped capture wins; when none is identifier-shaped the
 * first capture is returned, which keeps lowercase identifiers such as Rust
 * `lock` or Python `commit` reachable.
 *
 * @param {string} query - Raw user query.
 * @returns {string|null} Captured symbol name, or null when no pattern matches.
 */
function extractSymbolDefinitionTarget(query) {
  if (typeof query !== 'string' || query === '') return null;
  const patterns = [
    SYMBOL_DEFN_QUERY_RE,
    SYMBOL_DEFN_QUERY_RE_GREEDY,
    SYMBOL_WHATIS_QUERY_RE,
    SYMBOL_WHERE_QUERY_RE,
  ];
  const captures = patterns
    .map(pattern => query.match(pattern)?.[1])
    .filter(capture => capture && capture.length >= 3);
  if (captures.length === 0) return null;
  return captures.find(looksLikeIdentifier) || captures[0];
}
|
|
923
|
+
|
|
924
|
+
/**
|
|
925
|
+
* Strict identifier-shape filter for query-token-to-symbol-name matching.
|
|
926
|
+
*
|
|
927
|
+
* Distinguishes "code identifier" from "English word that happens to start
|
|
928
|
+
* uppercase". A token is strict-identifier-shaped if it is ≥4 chars AND
|
|
929
|
+
* has one of:
|
|
930
|
+
* - digit (Vec128, AVX2, std::int32_t)
|
|
931
|
+
* - underscore (HWY_DLLEXPORT, snake_case_thing, _Alignas)
|
|
932
|
+
* - internal uppercase beyond the first character (FunctionCache,
|
|
933
|
+
* AlignedDeleter, DetectTargets — true camelCase / PascalCase)
|
|
934
|
+
*
|
|
935
|
+
* This excludes single-cap English nouns: "Type", "Class", "Vector", "Map",
|
|
936
|
+
* "Set", "Function", "Method", "Component" — none have a digit,
|
|
937
|
+
* underscore, or internal uppercase, so they fail the structural check.
|
|
938
|
+
* Excludes 3-char tokens like "SSE", "x86", "AVX" — these are domain
|
|
939
|
+
* acronyms that risk false-matching unrelated short symbols. The 1.15× boost
|
|
940
|
+
* trades off vs the 1.30× of the verb-anchored extractor: lower precision,
|
|
941
|
+
* higher recall on noun-anchored probe queries.
|
|
942
|
+
*/
|
|
943
|
+
function looksLikeStrictIdentifier(token) {
  if (!token || token.length < 4) return false;
  // A digit or underscore anywhere is enough.
  if (/[\d_]/.test(token)) return true;
  // Otherwise require an ASCII uppercase letter at any position after the
  // first, so a lone leading capital ("Vector") does not qualify while
  // "FunctionCache" and all-caps tokens such as "HTTP" do.
  return /[A-Z]/.test(token.slice(1));
}
|
|
955
|
+
|
|
956
|
+
// =============================================================================
|
|
957
|
+
// F9 (2026-05-12): additional_symbols re-anchoring for cAST sibling-merged
|
|
958
|
+
// chunks. JS/TS-gated. Pure ranking-time metadata fix — no chunk regeneration,
|
|
959
|
+
// no reindex required.
|
|
960
|
+
//
|
|
961
|
+
// Motivation: cAST sibling-merge collapses ≥2 top-level boundaries into one
|
|
962
|
+
// chunk and attributes the chunk to the FIRST boundary's name. Probe failures
|
|
963
|
+
// like TS-006 (chunk named `SlashCommand` but expected `slashCommands`) and
|
|
964
|
+
// TS-008 (chunk named `regularPrompt` but expected `systemPrompt`) are
|
|
965
|
+
// structurally PARTIAL — file is correct, symbol is the wrong sibling.
|
|
966
|
+
//
|
|
967
|
+
// The chunker already records secondary boundary names in
|
|
968
|
+
// `metadata.additional_symbols` (tree-sitter-provider.js:928-934). F9
|
|
969
|
+
// promotes the best-matching sibling to the chunk's primary label when the
|
|
970
|
+
// query references it more strongly than the original primary.
|
|
971
|
+
//
|
|
972
|
+
// SOTA references:
|
|
973
|
+
// - Sourcegraph BM25F (2025): "treat symbols as a multi-valued field"
|
|
974
|
+
// - Supermemory code-chunk: explicit scope-tree carries secondary entities
|
|
975
|
+
// per chunk, not just the head boundary
|
|
976
|
+
// - cAST (arXiv 2506.15655): acknowledges the sibling-merge attribution
|
|
977
|
+
// gap as a known limitation
|
|
978
|
+
//
|
|
979
|
+
// Pilot scope: JS/TS/TSX/JSX only. The mechanism generalizes to every
|
|
980
|
+
// language but per-language gating limits the validation surface for the
|
|
981
|
+
// initial rollout. Promote to additional languages once probes confirm gain
|
|
982
|
+
// on JS/TS with zero regressions on JS/TS + GCSN.
|
|
983
|
+
// =============================================================================
|
|
984
|
+
|
|
985
|
+
// Values of `metadata.language` that mark a result as JavaScript/TypeScript.
const JSTS_LANGS = new Set(['javascript', 'typescript', 'tsx', 'jsx']);
// File extensions (leading dot included, lower-case) accepted by the
// path-based fallback in isJsTsResult when a result has no metadata.language.
const JSTS_EXTENSIONS = new Set(['.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx', '.mts', '.cts']);
|
|
987
|
+
|
|
988
|
+
/**
 * Decide whether a result comes from a JavaScript/TypeScript file.
 *
 * `metadata.language` is authoritative when present and listed in
 * JSTS_LANGS. Otherwise the extension of the first available path field is
 * compared (lower-cased) with JSTS_EXTENSIONS.
 *
 * @param {object} result - Search result or expansion entity.
 * @returns {boolean} True for JS/TS results.
 */
function isJsTsResult(result) {
  const meta = result?.metadata ?? {};
  if (meta.language && JSTS_LANGS.has(meta.language)) return true;
  // Expansion entities from graph-expansion.js carry no metadata.language;
  // sniff the file extension instead so F9 can still process them.
  const candidatePath = result?.file_path || result?.file || meta.file || meta.path || result?.filePath || '';
  const dotAt = candidatePath ? candidatePath.lastIndexOf('.') : -1;
  return dotAt >= 0 && JSTS_EXTENSIONS.has(candidatePath.slice(dotAt).toLowerCase());
}
|
|
999
|
+
|
|
1000
|
+
/**
|
|
1001
|
+
* Split a camelCase/PascalCase/snake_case/kebab-case identifier into
|
|
1002
|
+
* lowercased sub-tokens. Filters out very short fragments.
|
|
1003
|
+
*
|
|
1004
|
+
* slashCommands → ['slash', 'commands']
|
|
1005
|
+
* SlashCommand → ['slash', 'command']
|
|
1006
|
+
* entitlementsByUserType → ['entitlements', 'by', 'user', 'type']
|
|
1007
|
+
* $ZodTypeInternals → ['zod', 'type', 'internals'] ($ stripped)
|
|
1008
|
+
* HTTPSConnection → ['https', 'connection']
|
|
1009
|
+
*/
|
|
1010
|
+
function splitCamelCaseTokens(name) {
  if (!name) return [];
  // 1. drop `$` (zod-style prefix)
  const withoutDollar = String(name).replace(/\$/g, '');
  // 2. camelCase boundary: lower/digit followed by upper
  const camelSplit = withoutDollar.replace(/([a-z0-9])([A-Z])/g, '$1 $2');
  // 3. acronym run followed by a capitalised word (HTTPSConnection)
  const runSplit = camelSplit.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
  // 4. snake_case / kebab-case separators become spaces
  const words = runSplit.replace(/[_\-]/g, ' ').toLowerCase().split(/\s+/);
  // Single-character fragments are noise.
  return words.filter(word => word.length > 1);
}
|
|
1021
|
+
|
|
1022
|
+
/**
|
|
1023
|
+
* Match a name's camelCase token against a query word. Handles plural/
|
|
1024
|
+
* singular and prefix variants with a length threshold to avoid noise.
|
|
1025
|
+
*
|
|
1026
|
+
* tokenMatches('error', 'errors') → true (prefix, both ≥4)
|
|
1027
|
+
* tokenMatches('type', 'type') → true (exact)
|
|
1028
|
+
* tokenMatches('by', 'by') → true (exact, short OK)
|
|
1029
|
+
* tokenMatches('to', 'tokenize') → false (prefix but shorter is <3)
|
|
1030
|
+
*/
|
|
1031
|
+
function tokenMatches(token, queryWord) {
  if (!token || !queryWord) return false;
  if (token === queryWord) return true;
  const [shorter, longer] = token.length <= queryWord.length
    ? [token, queryWord]
    : [queryWord, token];
  // Prefix matching only applies when the shorter word has 4+ characters,
  // so 'try'/'tried'/'trie' do not alias; shorter words need the exact
  // match handled above.
  return shorter.length >= 4 && longer.startsWith(shorter);
}
|
|
1041
|
+
|
|
1042
|
+
/**
 * Grade how strongly a candidate name matches the query, as a tier.
 *
 *   tier 2 (literal): the full lowercased name (with or without `$`) is one
 *                     of the query words — the query names the identifier
 *                     directly. Strong signal.
 *   tier 1 (tokens):  EVERY camelCase token of the name is covered by some
 *                     query word under `tokenMatches` (prefix/plural rules) —
 *                     the query describes the identifier compositionally.
 *                     Medium signal.
 *   tier 0:           not all tokens covered — abstain.
 *
 * The result is `{tier, tokens}` so callers can compare tiers explicitly.
 * Comparing raw token counts inside one tier is noisy: siblings often
 * outscore the primary by one token without reflecting intent. For example
 * "codeArtifact definition including its onStreamPart handler" mentions BOTH
 * names literally, yet the user means codeArtifact. Relabel rules should
 * therefore require sibling.tier > primary.tier strictly.
 *
 * @param {string} name - candidate identifier.
 * @param {string[]} queryWordsArr - query words, scanned for token coverage.
 * @param {Set<string>} queryWordsSet - same words, for literal lookups.
 * @returns {{tier: number, tokens: number}} tier plus the name's token count
 *   (tokens is 0 whenever tier is 0).
 */
function scoreNameMatchTiered(name, queryWordsArr, queryWordsSet) {
  const abstain = () => ({ tier: 0, tokens: 0 });

  if (!name) return abstain();
  const nameTokens = splitCamelCaseTokens(name);
  if (nameTokens.length === 0) return abstain();

  // Tier 2: literal full-name match. The `$`-preserving form is tried first
  // so a query mentioning "$ZodType" exact-matches v4/core/$ZodType while
  // v4/classic/ZodType only token-matches (F10).
  const lowered = String(name).toLowerCase();
  const loweredNoDollar = lowered.replace(/\$/g, '');
  const isLiteralMention = queryWordsSet.has(lowered) || queryWordsSet.has(loweredNoDollar);
  if (isLiteralMention) {
    return { tier: 2, tokens: nameTokens.length };
  }

  // Tier 1: every camelCase token must be covered by at least one query word.
  for (const part of nameTokens) {
    const covered = queryWordsArr.some((word) => tokenMatches(part, word));
    if (!covered) return abstain();
  }
  return { tier: 1, tokens: nameTokens.length };
}
|
|
1081
|
+
|
|
1082
|
+
/**
 * Find a sibling entity in the chunk's range that beats the primary on
 * query match. Returns the matching code-graph entity (for F8-style label
 * adoption) or null.
 *
 * Queries the code graph for ALL entities declared in the chunk's range
 * (every top-level boundary cAST merged in), scores each entity name
 * against the query and picks the best match — but only relabels if its
 * tier STRICTLY beats the primary's tier. Ties keep primary (avoid noise).
 *
 * Language-agnostic since 2026-05-13. The original `isJsTsResult(result)`
 * gate was a pilot scope, not a semantic requirement: the cAST sibling-merge
 * phenomenon F9 addresses is universal — Java/C#/Lua/Python classes and
 * namespaces also produce one-chunk-per-N-siblings outputs labelled with the
 * first entity. Stage 7 audit (stage7-deep-diagnosis.md) confirmed ~8
 * Java/C#/C-family PARTIALs sitting on this exact mechanism. Validate
 * against post-perf-60 + GCSN + 18-language probes — revert/narrow if any
 * regression.
 *
 * Returns null unless ALL of the following hold:
 *   - `opts.codeGraphRepo.findEntitiesInRange` is available;
 *   - the chunk already carries a primary name from the indexer;
 *   - the file path and line range resolve;
 *   - the range contains at least two entities;
 *   - the best strict-identifier sibling reaches a higher tier than the
 *     primary (a strong primary match means the encoder + indexer agreed —
 *     don't second-guess them at relabel time).
 *
 * @param {object} result - search result; name and range are read from the
 *   result itself or from `result.metadata`.
 * @param {string[]} queryWordsArr - query words (array form).
 * @param {Set<string>} queryWordsSet - query words (set form).
 * @param {object} opts - reads `codeGraphRepo` and the optional per-query
 *   `_entityNameCache` (Map-like: has/get/set).
 * @returns {object|null} the winning sibling entity, or null.
 */
function findAdditionalSymbolRelabel(result, queryWordsArr, queryWordsSet, opts) {
  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.findEntitiesInRange !== 'function') {
    return null;
  }

  const meta = result?.metadata ?? {};

  // Critical gate: F9 only operates on chunks that ALREADY have a primary
  // name attributed by the indexer (cAST sibling-merge case). When primary
  // is null/missing, the chunk is an anonymous code-block whose label will
  // be resolved by context-expander via findFirstEntityInRange at result
  // presentation time — F9 must not preempt that path (regresses TS-004:
  // every artifact client.tsx chunk with name=null was being relabeled to
  // the inner `onStreamPart` arrow function the query mentions).
  // Checked BEFORE the code-graph lookup so anonymous chunks never pay for
  // a query whose result would be thrown away.
  const primaryName = meta.symbol || meta.name || result?.name || result?.symbol || null;
  if (!primaryName) return null;

  const fp = resolveFilePath(result);
  const sl = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const el = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  if (!fp || !Number.isFinite(sl) || !Number.isFinite(el)) return null;

  // Cache per (file, range) — entities don't change within a query call.
  const cache = opts._entityNameCache;
  const entitiesCacheKey = cache ? `${fp}|${sl}|${el}|f9-entities` : null;
  let entities;
  if (entitiesCacheKey && cache.has(entitiesCacheKey)) {
    entities = cache.get(entitiesCacheKey);
  } else {
    try {
      entities = repo.findEntitiesInRange(fp, sl, el);
    } catch {
      // Best-effort: a failed lookup is treated as "no siblings" so a
      // relabel heuristic can never break the search itself.
      entities = [];
    }
    if (entitiesCacheKey) cache.set(entitiesCacheKey, entities);
  }
  if (!Array.isArray(entities) || entities.length < 2) return null;

  const primaryNameLower = String(primaryName).toLowerCase();
  const primaryMatch = scoreNameMatchTiered(primaryName, queryWordsArr, queryWordsSet);

  let bestEntity = null;
  let bestMatch = { tier: 0, tokens: 0 };
  for (const ent of entities) {
    if (!ent?.name) continue;
    // The primary itself is not a sibling.
    if (String(ent.name).toLowerCase() === primaryNameLower) continue;
    // Require strict-identifier shape on the candidate. Avoids relabeling
    // to a common English-word entity captured by tree-sitter — rare, but
    // cheap to guard.
    if (!looksLikeStrictIdentifier(ent.name)) continue;
    const m = scoreNameMatchTiered(ent.name, queryWordsArr, queryWordsSet);
    // Higher tier wins; token count only breaks ties between siblings.
    if (m.tier > bestMatch.tier || (m.tier === bestMatch.tier && m.tokens > bestMatch.tokens)) {
      bestMatch = m;
      bestEntity = ent;
    }
  }

  // Only relabel when sibling tier STRICTLY beats primary tier. Same-tier
  // ties keep primary — when both are literal-name matches in the query
  // (TS-004: codeArtifact + onStreamPart both literal), the chunker's
  // primary attribution wins because the encoder already ranked the chunk
  // on that signal.
  if (!bestEntity || bestMatch.tier === 0 || bestMatch.tier <= primaryMatch.tier) return null;
  return bestEntity;
}
|
|
1170
|
+
|
|
1171
|
+
/**
 * Collect the strict-identifier-shaped tokens mentioned in a query.
 *
 * Used by identifierMentionBoost as the noun-anchored complement to
 * extractSymbolDefinitionTarget: the verb-anchored extractor handles
 * "show me X struct" phrasings, while this one handles probe-style
 * "X with Y characteristic" phrasings ("Vec128 SSE vector class template",
 * "AlignedDeleter RAII class", "FunctionCache template struct").
 *
 * @param {string} query - raw query text.
 * @returns {Set<string>|null} the mentions in first-seen order (plain
 *   identifiers first, then dotted names), or null when the query is not a
 *   non-empty string or yields no mentions.
 */
function extractIdentifierMentions(query) {
  if (!query || typeof query !== 'string') return null;

  // 2026-05-14: a leading `$` is preserved so identifiers like `$ZodType` /
  // `$ZodTypeInternals` (zod v4/core public-API convention — structural
  // interfaces are $-prefixed while runtime classes are not) round-trip
  // through tokenization. This mirrors F10's fix for the F9 path: the same
  // parser inconsistency made the identifier-mention boost fire on wrong
  // chunks (v3/types.ts::ZodType) while missing the right one
  // (v4/core/schemas.ts containing the $-prefixed entity). Path 2
  // (`findEntityWithNameInRange`) is insensitive to `_`/`-` but NOT to `$`,
  // so the mention must keep the `$` to find the entity.
  const candidates = query.match(/\$?[A-Za-z_][A-Za-z0-9_]*\b/g) || [];
  const plainMentions = candidates.filter((word) => looksLikeStrictIdentifier(word));

  // Dotted compound identifiers are captured as a whole. Lua
  // (`tablex.deepcopy`), Python (`os.path.exists`) and Ruby's
  // `Module.method` style produce code-graph entity names with embedded `.`;
  // the single-token pass splits them into `tablex` / `deepcopy`, which never
  // equals the entity name verbatim. Dotted forms skip
  // `looksLikeStrictIdentifier` because an `a.b` shape is inherently
  // identifier-like (no English word contains a `.`).
  const dottedMentions =
    query.match(/\b[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+\b/g) || [];

  const found = new Set([...plainMentions, ...dottedMentions]);
  return found.size > 0 ? found : null;
}
|
|
1208
|
+
|
|
1209
|
+
/**
 * Mild symbol-name boost triggered by strict-identifier mentions in the
 * query (complement to symbolExactMatchBoost which uses the verb-anchored
 * `extractSymbolDefinitionTarget`). Same BM25F field-weighted principle:
 * when the query explicitly names an identifier, prefer chunks whose
 * symbol matches that name.
 *
 * Calibrated lower (×1.15 default vs ×1.30 for the verb path) because the
 * noun-anchored trigger has lower precision: a query mentioning "Vec128"
 * may or may not be asking about that specific symbol. Verb-anchored
 * queries like "show me X struct" are unambiguous; noun-anchored mentions
 * are weaker signals.
 *
 * Skips the verb-anchored target to avoid double-counting: if
 * extractSymbolDefinitionTarget already returned "FunctionCache" and the
 * symbol matches that, symbolExactMatchBoost handles it; this boost
 * only fires on DIFFERENT mentions, or on mentions where the verb path
 * did not fire (target was null).
 *
 * Format-gated via the caller (only fires when isAgentFormat=true).
 *
 * @param {object} result - search result; the symbol is read from `name`,
 *   `metadata.name`, `entity.name` or `symbol`; the line range from
 *   `startLine`/`endLine` on the result or its metadata, with
 *   `metadata.line_start`/`line_end` as a final fallback.
 * @param {Set<string>|null} mentions - output of extractIdentifierMentions.
 * @param {object} [opts] - reads `identifierMentionBoostFactor`,
 *   `_symbolExactTarget`, `codeGraphRepo` and `_entityNameCache`.
 *   The SWEET_SEARCH_IDENTIFIER_MENTION_BOOST env var overrides the factor
 *   when it parses to a number in [1.0, 2.0].
 * @returns {number} the boost factor, or 1.0 when nothing matches.
 */
function identifierMentionBoost(result, mentions, opts = {}) {
  if (!mentions || mentions.size === 0) return 1.0;
  const raw = process.env.SWEET_SEARCH_IDENTIFIER_MENTION_BOOST;
  let boost = opts.identifierMentionBoostFactor ?? 1.15;
  if (raw != null && raw !== '') {
    const n = Number.parseFloat(raw);
    if (Number.isFinite(n) && n >= 1.0 && n <= 2.0) boost = n;
  }
  if (boost === 1.0) return 1.0;

  const symbol = result?.name
    || result?.metadata?.name
    || result?.entity?.name
    || result?.symbol
    || '';
  const skipTarget = opts._symbolExactTarget ? String(opts._symbolExactTarget).toLowerCase() : '';
  // Separator-insensitive form: snake_case / kebab-case compare equal.
  const norm = (s) => s.replace(/[_-]/g, '').toLowerCase();

  // Path 1 — direct symbol comparison. Fires when the chunk has a populated
  // label that lexically matches a query mention.
  if (symbol) {
    const symLower = String(symbol).toLowerCase();
    const symNorm = norm(symLower);
    for (const mention of mentions) {
      const mLower = mention.toLowerCase();
      if (skipTarget && mLower === skipTarget) continue;
      if (symLower === mLower || symNorm === norm(mLower)) {
        return boost;
      }
    }
    // No Path 1 match — fall through to Path 2 (code-graph fallback). The
    // chunk's labeled symbol didn't match any mention, but a contained
    // sibling entity might (cAST sibling-merge case: chunk labeled with
    // the FIRST entity in the merged group while query references a
    // later sibling, e.g. Java chunk 121-168 labeled `verifyNoTypeVariable`
    // but containing the gold's `getType` at 166-168).
  }

  // Path 2 — code-graph fallback (2026-05-13). Two scenarios:
  //   1. Null-name LI metadata — some indexes were built without populated
  //      `metadata.name` (e.g. the typescript ast-tester repo has every doc
  //      carrying `name: null`). Path 1 is skipped via `if (symbol)`
  //      and we land here directly.
  //   2. Sibling-merge mislabel — chunk has a populated label that doesn't
  //      match any mention, but the chunk's range contains a sibling entity
  //      that does. The fall-through above lets us still apply the boost.
  // Either way we look up each mention via `findEntityWithNameInRange` —
  // same code-graph signal that F8's exactSymbolTargetEntity uses — and
  // apply the same boost factor when a match exists. Cached per
  // (file, startLine, endLine, mention).
  //
  // Format-gated by the caller (identifierMentions is built only when
  // isAgentFormat is true), so this path is dormant on GCSN benchmark
  // traffic.
  if (!opts.codeGraphRepo || typeof opts.codeGraphRepo.findEntityWithNameInRange !== 'function') {
    return 1.0;
  }
  const file = resolveFilePath(result);
  const meta = result?.metadata ?? {};
  // Accept the snake_case `line_start` / `line_end` metadata fields too, as
  // the other range readers in this file do; without them, results that only
  // carry those fields would silently skip this fallback.
  const sl = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const el = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  if (!file || !Number.isFinite(sl) || !Number.isFinite(el)) return 1.0;
  const cache = opts._entityNameCache;
  for (const mention of mentions) {
    const mLower = mention.toLowerCase();
    if (skipTarget && mLower === skipTarget) continue;
    const cacheKey = cache ? `${file}|${sl}|${el}|mention:${mention}` : null;
    let resolved;
    if (cacheKey && cache.has(cacheKey)) {
      resolved = cache.get(cacheKey);
    } else {
      try {
        resolved = opts.codeGraphRepo.findEntityWithNameInRange(file, sl, el, mention);
      } catch {
        // Best-effort: a failed lookup simply means "no boost".
        resolved = null;
      }
      if (cacheKey) cache.set(cacheKey, resolved);
    }
    if (resolved) return boost;
  }
  return 1.0;
}
|
|
1310
|
+
|
|
1311
|
+
/**
 * Path-token boost (added 2026-05-07 — 60-probe diagnosis NEW pattern).
 *
 * When a query mentions a crate / module / package name (e.g. "in globset",
 * "in render package", "from binding/json"), boost candidates whose file
 * path contains that token. Same Sourcegraph BM25F principle as the
 * symbol boost: filename matches are a strong field-level signal that
 * dense embedding alone underweights.
 *
 * SOTA: BM25F filename field weighting (Sourcegraph "Keeping it boring..."
 * April 2025). Quote: "we should be able to use these indexes to reward
 * symbol and FILENAME matches... think of contents, symbols, and filenames
 * as different 'fields' within a file." See docs/SOTA_RESEARCH_2026_FIXES.md.
 *
 * Diagnosed cases (60-probe new-set #4): ripgrep S6-Q8 (two `Glob` structs
 * in different crates — symbol-exact alone CANNOT disambiguate; the query
 * said "in globset" so paths containing /globset/ should win).
 *
 * Trigger pattern: a path-like token following one of the prepositions
 * in / from / inside / under / within / of. The token starts with a letter,
 * continues with word characters or `-`, and may contain `/`-separated
 * segments (see PATH_TOKEN_QUERY_RE).
 *
 * Only tokens of length ≥ 4 that are not common English stopwords are kept,
 * lowercased. Boost: 1.20× when the path contains the token as a
 * separator-bounded component (see pathTokenBoost). Mild magnitude
 * because path tokens are softer signals than symbol-exact matches.
 *
 * Override env: SWEET_SEARCH_PATH_TOKEN_BOOST (default 1.20). Disable
 * with `ablations: ['no-path-token-boost']`.
 */
// `/` is the ONLY segment separator in the token pattern. An earlier form
// also listed `-` as a separator even though `-` is already accepted inside
// a segment; that ambiguity let a run of dashes ("in abcd--------…") be split
// in exponentially many ways before the trailing `\b` failed (catastrophic
// backtracking on user-supplied query text). Both forms accept exactly the
// same tokens and capture the same text.
const PATH_TOKEN_QUERY_RE = /\b(?:in|from|inside|under|within|of)\s+([a-z][\w-]*(?:\/[\w-]+)*)\b/gi;
const PATH_TOKEN_STOPWORDS = new Set([
  'the', 'this', 'that', 'these', 'those', 'them', 'their', 'they',
  'when', 'while', 'where', 'with', 'without', 'have', 'been', 'each',
  'and', 'but', 'for', 'all', 'any', 'some', 'can', 'will', 'would',
  'fact', 'case', 'order', 'time', 'turn',
]);

/**
 * Extract lowercased path-like tokens that follow a path preposition.
 *
 * @param {string} query - raw query text.
 * @returns {string[]} tokens in order of appearance (duplicates kept);
 *   empty when the query is not a non-empty string or nothing qualifies.
 */
function extractPathTokens(query) {
  if (!query || typeof query !== 'string') return [];
  const tokens = [];
  // The regex is a shared /g instance: reset it so a scan never starts from
  // a stale lastIndex.
  PATH_TOKEN_QUERY_RE.lastIndex = 0;
  for (const match of query.matchAll(PATH_TOKEN_QUERY_RE)) {
    const tok = match[1];
    if (!tok || tok.length < 4) continue;
    const lowered = tok.toLowerCase();
    if (PATH_TOKEN_STOPWORDS.has(lowered)) continue;
    tokens.push(lowered);
  }
  return tokens;
}
|
|
1361
|
+
|
|
1362
|
+
/**
 * Boost a result whose file path contains one of the query's path tokens as
 * a separator-bounded component.
 *
 * @param {object} result - search result; the path is read from `file` or
 *   `metadata.file`.
 * @param {string[]} pathTokens - lowercased tokens from extractPathTokens.
 * @param {object} [opts] - `pathTokenBoost` sets the factor (default 1.20);
 *   the SWEET_SEARCH_PATH_TOKEN_BOOST env var overrides it when it parses
 *   to a number in [1.0, 2.0].
 * @returns {number} the boost factor, or 1.0 when nothing matches.
 */
function pathTokenBoost(result, pathTokens, opts = {}) {
  if (!pathTokens || pathTokens.length === 0) return 1.0;

  let factor = opts.pathTokenBoost ?? 1.20;
  const envValue = process.env.SWEET_SEARCH_PATH_TOKEN_BOOST;
  if (envValue != null && envValue !== '') {
    const parsed = Number.parseFloat(envValue);
    if (Number.isFinite(parsed) && parsed >= 1.0 && parsed <= 2.0) factor = parsed;
  }
  if (factor === 1.0) return 1.0;

  const lowerPath = String(result?.file || result?.metadata?.file || '').toLowerCase();
  if (!lowerPath) return 1.0;

  // The token must sit between path separators (or string ends) so that a
  // token like "iter" does not match inside "literator".
  const appearsAsComponent = (tok) => {
    const escaped = tok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp('(^|[/_-])' + escaped + '($|[/_.-])').test(lowerPath);
  };
  return pathTokens.some(appearsAsComponent) ? factor : 1.0;
}
|
|
1381
|
+
|
|
1382
|
+
/**
 * Boost a result whose symbol is exactly the verb-anchored definition target
 * of the query, or whose line range contains an entity with that name.
 *
 * @param {object} result - search result; the symbol is read from `name`,
 *   `metadata.name`, `entity.name` or `symbol`; the line range from
 *   `startLine`/`endLine` on the result or its metadata, with
 *   `metadata.line_start`/`line_end` as a final fallback.
 * @param {string|null} target - identifier extracted from the query; a falsy
 *   target disables the boost.
 * @param {object} [opts] - `symbolExactBoost` sets the factor (default 1.30);
 *   `codeGraphRepo.hasEntityWithNameInRange` enables the contained-entity
 *   check. The SWEET_SEARCH_SYMBOL_EXACT_BOOST env var overrides the factor
 *   when it parses to a number in [1.0, 2.0].
 * @returns {number} the boost factor, or 1.0 when nothing matches.
 */
function symbolExactMatchBoost(result, target, opts = {}) {
  if (!target) return 1.0;
  const raw = process.env.SWEET_SEARCH_SYMBOL_EXACT_BOOST;
  let boost = opts.symbolExactBoost ?? 1.30;
  if (raw != null && raw !== '') {
    const n = Number.parseFloat(raw);
    if (Number.isFinite(n) && n >= 1.0 && n <= 2.0) boost = n;
  }
  if (boost === 1.0) return 1.0;

  const symbol = result?.name
    || result?.metadata?.name
    || result?.entity?.name
    || result?.symbol
    || '';
  const tLower = target.toLowerCase();
  // Separator-insensitive form: snake_case / kebab-case compare equal.
  const norm = (s) => s.replace(/[_-]/g, '').toLowerCase();
  if (symbol) {
    const sLower = String(symbol).toLowerCase();
    if (sLower === tLower) return boost;
    if (norm(sLower) === norm(tLower)) return boost;
  }
  // F7 (2026-05-07): chunk's labeled symbol may not match the target, but a
  // sibling entity contained inside the chunk range may match. Diagnosed from
  // S6-Q3 (fastify): chunk 103-150 contains both fallbackErrorHandler AND
  // buildErrorHandler at lines 142-150; chunker labeled it fallbackErrorHandler.
  // Query "show me the buildErrorHandler function definition" extracts target
  // "buildErrorHandler" — the contained-entity check finds it and applies the
  // same 1.30× boost so the chunk wins over adjacent setErrorHeaders chunk.
  if (opts.codeGraphRepo && typeof opts.codeGraphRepo.hasEntityWithNameInRange === 'function') {
    const filePath = resolveFilePath(result);
    const meta = result?.metadata ?? {};
    // Accept the snake_case `line_start` / `line_end` metadata fields too, as
    // the other range readers in this file do; without them, results that
    // only carry those fields would silently skip the contained-entity check.
    const startLine = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
    const endLine = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
    if (filePath && Number.isFinite(startLine) && Number.isFinite(endLine)) {
      try {
        if (opts.codeGraphRepo.hasEntityWithNameInRange(filePath, startLine, endLine, target)) {
          return boost;
        }
      } catch { /* best-effort: a failed lookup means no boost */ }
    }
  }
  return 1.0;
}
|
|
1426
|
+
|
|
1427
|
+
/**
|
|
1428
|
+
* Demote anomalous chunks: anonymous (symbol==null) AND symbolType==='code',
|
|
1429
|
+
* AND ANY of:
|
|
1430
|
+
* (a) file-header — startLine===1 (e.g. file-imports leak)
|
|
1431
|
+
* (b) tiny span — endLine-startLine<5 (e.g. bare impl-header text)
|
|
1432
|
+
* (c) preprocessor-dense — mid-file anonymous chunk where ≥50% of non-blank
|
|
1433
|
+
* lines are preprocessor directives (#include/#define/#ifdef/etc.) and
|
|
1434
|
+
* no real declaration appears. Covers C/C++ "include cluster" + "macro
|
|
1435
|
+
* wall" chunks that the (a)/(b) predicates miss because they sit in the
|
|
1436
|
+
* middle of a header and span 10-100 lines (CPP-002 / CPP-003 / CPP-008
|
|
1437
|
+
* root cause: anonymous code chunks dominated by HWY_ATTAINABLE_*,
|
|
1438
|
+
* HWY_HAVE_RUNTIME_DISPATCH_*, #include "hwy/base.h" etc., winning
|
|
1439
|
+
* NL queries on token density).
|
|
1440
|
+
*
|
|
1441
|
+
* These chunks bypass the entity DB (sparse/grep fallback or chunker leak)
|
|
1442
|
+
* and shouldn't surface as top-1.
|
|
1443
|
+
*
|
|
1444
|
+
* Predicate (a)+(b) verified 2026-05-07 against live probe + FreshStack
|
|
1445
|
+
* PARTIALs: legitimate symbol-mislabel cases (S3-Q2, S4-Q1, S6-Q4, S3-Q8)
|
|
1446
|
+
* all have span >20 lines and startLine deep in file — they pass through
|
|
1447
|
+
* unaffected by (a)+(b). Predicate (c) is narrower: requires zero real
|
|
1448
|
+
* declarations AND ≥50% preprocessor lines, so a real code chunk with a
|
|
1449
|
+
* couple of #includes at the top is NOT affected.
|
|
1450
|
+
*
|
|
1451
|
+
* Demote (×0.10) rather than filter so a single-anomalous-result fallback
|
|
1452
|
+
* still surfaces the chunk if nothing else matches.
|
|
1453
|
+
*/
|
|
1454
|
+
const PREPROC_LINE_RE = /^\s*#\s*(?:include|define|undef|ifdef|ifndef|if|else|elif|endif|pragma|error|warning|line)\b/;

/**
 * Decide whether a chunk's text is a "wall" of preprocessor directives.
 *
 * A chunk qualifies when it has at least 50 characters and 5 non-blank
 * lines, no non-directive line matching DECL_KEYWORD_RE, and at least half
 * of its non-blank lines are preprocessor lines. Lines continuing a
 * directive via a trailing backslash count as preprocessor lines.
 *
 * @param {object} result - search result whose text is resolved via
 *   resolveResultText.
 * @param {object} opts - passed through to resolveResultText.
 * @returns {boolean} true when the chunk is preprocessor-dense.
 */
function isPreprocDenseAnonymousChunk(result, opts) {
  const body = resolveResultText(result, opts);
  if (!body || body.length < 50) return false;

  let contentLines = 0;
  let directiveLines = 0;
  // A `#define` (or `#pragma`/etc.) ending in `\` continues onto the next
  // line. Continuation lines belong to the directive, so they are counted as
  // preprocessor lines rather than "real code". Otherwise a long multi-line
  // macro such as `#define HWY_BASELINE_TARGETS (FLAG_A | FLAG_B | \\` ...
  // `FLAG_Z)` would dilute the density below 50% even though the whole span
  // is one macro.
  let continuing = false;

  for (const rawLine of body.split(/\r?\n/)) {
    const stripped = rawLine.trim();
    if (stripped === '') {
      // A blank line always ends a continuation and is not counted.
      continuing = false;
      continue;
    }
    contentLines += 1;

    const countsAsDirective = continuing || PREPROC_LINE_RE.test(rawLine);
    if (!countsAsDirective) {
      // Any real declaration outside a directive disqualifies the chunk.
      if (DECL_KEYWORD_RE.test(stripped)) return false;
      continuing = false;
      continue;
    }

    directiveLines += 1;
    // A directive (or continuation) line ending in `\` extends to the next line.
    continuing = rawLine.replace(/\s+$/, '').endsWith('\\');
  }

  return contentLines >= 5 && directiveLines / contentLines >= 0.5;
}
|
|
1492
|
+
|
|
1493
|
+
/**
 * Score multiplier that demotes anomalous chunks (see the predicate list in
 * the comment block above PREPROC_LINE_RE).
 *
 * @param {object} result - search result; range, symbol and type are read
 *   from the result or its metadata.
 * @param {object} [opts] - reads `ablations`, `_isAgentFormat` and
 *   `anomalousChunkFactor` (default 0.10).
 * @returns {number} the demotion factor, or 1.0 when the chunk is left alone.
 */
function anomalousChunkDemotion(result, opts = {}) {
  // Off switches, checked in order: env var, ablation, then the format gate.
  // Format-gated because GCSN-style NL queries hit many file-start anonymous
  // code chunks that are actually correct answers; ungated, this demotion
  // drops GCSN dev MRR by ~27pp. Agent-format queries (probes/FreshStack)
  // don't expect file-header content as the answer.
  const disabled = process.env.SWEET_SEARCH_NO_ANOMALOUS_CHUNK_DEMOTION === '1'
    || hasAblation(opts.ablations, 'no-anomalous-chunk-demotion')
    || !opts._isAgentFormat;
  if (disabled) return 1.0;

  const meta = result?.metadata ?? {};
  const firstLine = Number(result?.startLine ?? meta.startLine ?? meta.line_start);
  const lastLine = Number(result?.endLine ?? meta.endLine ?? meta.line_end);
  if (!Number.isFinite(firstLine) || !Number.isFinite(lastLine)) return 1.0;

  const lineSpan = lastLine - firstLine;
  const kind = result?.symbolType ?? result?.type ?? meta.type ?? null;
  const label = result?.symbol ?? meta.symbol ?? meta.name ?? null;
  const demoted = opts.anomalousChunkFactor ?? 0.10;

  // PATH 1 — anonymous code chunks (no symbol, type 'code'): demote on
  // file-header (starts at line 1) OR tiny span (<5 lines) OR
  // preprocessor density >=50% with no declarations. The text scan only
  // runs when the two cheap predicates did not already decide.
  const unnamed = label === null || label === '';
  if (unnamed && kind === 'code') {
    if (firstLine === 1 || lineSpan < 5) return demoted;
    return isPreprocDenseAnonymousChunk(result, opts) ? demoted : 1.0;
  }

  // PATH 2 — macro-wall chunks (type 'macro', span >= 5, preprocessor-dense).
  // Adopted from the entity DB during search, these chunks have a non-null
  // `symbol` like HWY_BASELINE_TARGETS even though their underlying chunk
  // metadata is anonymous-code. Functionally identical to PATH 1: a wall of
  // #defines that happens to include one extractable macro entity. Single
  // small macros (span < 5) are NOT demoted — they may be the correct answer
  // for a query targeting that macro.
  const isMacroWall = kind === 'macro'
    && lineSpan >= 5
    && isPreprocDenseAnonymousChunk(result, opts);
  return isMacroWall ? demoted : 1.0;
}
|
|
1538
|
+
|
|
1539
|
+
/**
|
|
1540
|
+
* Mega-entity penalty (F1, 2026-05-07): when a chunk's enclosing entity
|
|
1541
|
+
* (e.g. function fastify @ 735 lines, Flask App class @ 1516 lines) exceeds
|
|
1542
|
+
* a configurable cap, demote the chunk's score. The fix targets the post-
|
|
1543
|
+
* retrieval envelope-bloat pattern from the taxonomy: small chunks score
|
|
1544
|
+
* highly because they're packed with token-dense surfaces from a mega-fn,
|
|
1545
|
+
* and presentation later expands them into a 700+ line envelope.
|
|
1546
|
+
*
|
|
1547
|
+
* Format-gated (agent only): GCSN single-function NL queries shouldn't
|
|
1548
|
+
* be affected by entity envelope sizes.
|
|
1549
|
+
*
|
|
1550
|
+
* On by default with a 500-line cap; override via the SWEET_SEARCH_MAX_ENVELOPE_LINES
|
|
1551
|
+
* env var or opts.maxEnvelopeLines.
|
|
1552
|
+
*/
|
|
1553
|
+
// Loop-invariant resolution of the env-controlled envelope cap. Computed
|
|
1554
|
+
// once per applyResultDemotions call (see ruleOpts setup) and stashed on
|
|
1555
|
+
// opts._megaEnvelopeMax to avoid the env+parseInt+default lookup per
|
|
1556
|
+
// result. Resolver returns -1 to mean "skip the rule entirely" (when the
|
|
1557
|
+
// env var is set to a non-positive/non-finite value).
|
|
1558
|
+
/**
 * Resolve the mega-entity line cap.
 *
 * Precedence: a numeric `opts._megaEnvelopeMax` (pre-resolved by the caller),
 * then the SWEET_SEARCH_MAX_ENVELOPE_LINES env var, then
 * `opts.maxEnvelopeLines`, then 500.
 *
 * @param {object} opts - rule options.
 * @returns {number} the cap in lines; -1 when the env var is set to a value
 *   that does not parse to a positive integer (meaning "skip the rule").
 */
function resolveMaxEnvelopeLines(opts) {
  const { _megaEnvelopeMax: precomputed } = opts;
  if (typeof precomputed === 'number') return precomputed;

  const envValue = process.env.SWEET_SEARCH_MAX_ENVELOPE_LINES;
  const envProvided = envValue != null && envValue !== '';
  if (!envProvided) {
    // Default 500: calibrated 2026-05-07 on 60-probe + FreshStack uv + GCSN
    // dev/held-out. Cap=500 yields +1 PASS on probes (S5-Q9 Flask Scaffold)
    // and +1 FAIL→PARTIAL on FreshStack uv (UV-NL-2 do_lock) with zero
    // regression on GCSN. Smaller caps regressed FreshStack; larger caps
    // yielded no further gain.
    return opts.maxEnvelopeLines ?? 500;
  }

  const parsed = Number.parseInt(envValue, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : -1;
}
|
|
1572
|
+
|
|
1573
|
+
/**
 * Score multiplier that demotes chunks whose enclosing entity is longer
 * than the configured envelope cap.
 *
 * @param {object} result - search result.
 * @param {object} [opts] - reads `_isAgentFormat`, `ablations`,
 *   `codeGraphRepo` and `megaEntityFactor` (default 0.85); the cap comes
 *   from resolveMaxEnvelopeLines.
 * @returns {number} the penalty factor, or 1.0 when the rule does not apply.
 */
function megaEntityPenalty(result, opts = {}) {
  // Agent-format only, and switchable off via ablation.
  if (!opts._isAgentFormat || hasAblation(opts.ablations, 'no-mega-entity-penalty')) return 1.0;

  // Only a positive, finite cap enables the rule.
  const cap = resolveMaxEnvelopeLines(opts);
  const capUsable = Number.isFinite(cap) && cap > 0;
  if (!capUsable) return 1.0;

  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.findEnclosingEntity !== 'function') return 1.0;

  // Routed through resolveEntityKindInfo so the search-scoped
  // _entityKindCache is hit instead of going to SQLite again. The cached
  // entity carries startLine/endLine, which is all this rule needs.
  const enclosing = resolveEntityKindInfo(result, opts);
  const hasRange = Boolean(enclosing)
    && Number.isFinite(enclosing.startLine)
    && Number.isFinite(enclosing.endLine);
  if (!hasRange) return 1.0;

  const height = enclosing.endLine - enclosing.startLine + 1;
  return height > cap ? (opts.megaEntityFactor ?? 0.85) : 1.0;
}
|
|
1591
|
+
|
|
1592
|
+
/**
 * Demotes chunks that consist almost entirely of doc-comments (F6, 2026-05-07).
 *
 * Motivating case — S3-Q8 on ripgrep (walk.rs:434-469): the chunker split
 * WalkBuilder's 48-line docstring across two chunks, and the docstring-only
 * chunk lexically matched "WalkBuilder" + "directory iterator", out-ranking
 * the chunk that actually held the `pub struct WalkBuilder` declaration.
 *
 * A chunk is demoted only when ALL of the following hold:
 *   - its resolved text is at least 80 characters long;
 *   - it has at least 5 non-blank lines;
 *   - no non-doc line matches DECL_KEYWORD_RE (struct/enum/trait/impl/mod,
 *     or fn/class/def/function/interface followed by a name);
 *   - doc-comment lines / non-blank lines >= 0.85.
 *
 * Format-gated to agent output (opts._isAgentFormat): GCSN-style queries
 * don't reliably target docs vs code, and over-demoting comment-heavy chunks
 * could regress, so the gate keeps the rule safe per the CLAUDE.md
 * format-gating principle. Can also be switched off with the
 * 'no-doc-comment-demote' ablation.
 *
 * Returns 1.0 when the rule does not apply, otherwise
 * `opts.docCommentOnlyFactor` (default 0.70).
 */
const DOC_COMMENT_LINE_RE = /^\s*(?:\/\/[\/!]|\/\*\*?|\*\s|"""|'''|#'\s|#\s|--\s|--\|)/;
const DECL_KEYWORD_RE = /\b(?:pub\s+)?(?:struct|enum|trait|impl|mod)\b|\bfn\s+\w|\bclass\s+\w|\bdef\s+\w|\bfunction\s+\w|\binterface\s+\w|^\s*(?:export\s+)?(?:async\s+)?function\b/;
function docCommentOnlyDemotion(result, opts = {}) {
  const NEUTRAL = 1.0;

  // Gate first: the ablation lookup only happens for agent formats.
  if (!opts._isAgentFormat || hasAblation(opts.ablations, 'no-doc-comment-demote')) {
    return NEUTRAL;
  }

  const chunkText = resolveResultText(result, opts);
  if (!chunkText || chunkText.length < 80) return NEUTRAL;

  let docCount = 0;
  let contentCount = 0;
  for (const rawLine of chunkText.split(/\r?\n/)) {
    const stripped = rawLine.trim();
    if (stripped.length === 0) continue; // blank lines don't count either way
    contentCount += 1;

    if (DOC_COMMENT_LINE_RE.test(rawLine)) {
      docCount += 1;
      continue;
    }
    // A declaration on a non-doc line means the chunk carries real code:
    // never demote it, regardless of the comment ratio.
    if (DECL_KEYWORD_RE.test(stripped)) return NEUTRAL;
  }

  if (contentCount < 5) return NEUTRAL;
  if (docCount / contentCount < 0.85) return NEUTRAL;
  return opts.docCommentOnlyFactor ?? 0.70;
}
|
|
1636
|
+
|
|
1637
|
+
/**
 * Linear size penalty for very long chunks.
 *
 * For a chunk whose inferred line count exceeds `cutoff`, the multiplier is
 * `1 - slope * (lineCount - cutoff)`, never lower than `floor`. Chunks at or
 * below the cutoff (or with an unknown line count) are left untouched.
 *
 * Each tunable is resolved the same way: a valid environment variable wins,
 * otherwise the matching `opts` field, otherwise the built-in default.
 *   - floor  : SWEET_SEARCH_MEGA_CHUNK_FLOOR  / opts.megaChunkFloor  / 0.80  (env accepted in [0, 1])
 *   - cutoff : SWEET_SEARCH_MEGA_CHUNK_CUTOFF / opts.megaChunkCutoff / 500   (env accepted when > 0)
 *   - slope  : SWEET_SEARCH_MEGA_CHUNK_SLOPE  / opts.megaChunkSlope  / 0.0003 (env accepted in [0, 0.01])
 *
 * A floor of 1.0 or more disables the rule.
 *
 * @param {object} result  search result passed to inferLineCount
 * @param {object} [opts]  optional overrides (see above)
 * @returns {number} score multiplier; 1.0 means "no penalty"
 */
function megaChunkSizePenalty(result, opts = {}) {
  // Shared resolver for the three tunables. An unset/empty or rejected env
  // value falls back to the opts override, then to the built-in default.
  const tunable = (envName, parse, accepts, override, builtin) => {
    const fallback = override ?? builtin;
    const text = process.env[envName];
    if (text == null || text === '') return fallback;
    const value = parse(text);
    return Number.isFinite(value) && accepts(value) ? value : fallback;
  };

  const floor = tunable(
    'SWEET_SEARCH_MEGA_CHUNK_FLOOR',
    (s) => Number.parseFloat(s),
    (v) => v >= 0 && v <= 1,
    opts.megaChunkFloor,
    0.80,
  );
  if (floor >= 1.0) return 1.0; // disabled

  const cutoff = tunable(
    'SWEET_SEARCH_MEGA_CHUNK_CUTOFF',
    (s) => Number.parseInt(s, 10),
    (v) => v > 0,
    opts.megaChunkCutoff,
    500,
  );
  const slope = tunable(
    'SWEET_SEARCH_MEGA_CHUNK_SLOPE',
    (s) => Number.parseFloat(s),
    (v) => v >= 0 && v <= 0.01,
    opts.megaChunkSlope,
    0.0003,
  );

  const lineCount = inferLineCount(result);
  const overCutoff = Number.isFinite(lineCount) && lineCount > cutoff;
  if (!overCutoff) return 1.0;

  return Math.max(floor, 1.0 - slope * (lineCount - cutoff));
}
|
|
1662
|
+
|
|
1663
|
+
/**
 * Mild demotion for chunks that carry little executable body, applied only
 * to implementation-intent queries.
 *
 * Disabled entirely when SWEET_SEARCH_BODY_DENSITY is '0' or 'false'.
 * Queries such as "what is the X interface" must not penalise declaration
 * chunks, so any intent other than 'implementation' returns 1.
 *
 * Three independent triggers multiply into the result:
 *
 *   1. Declarative entity — the chunk's type is in DECLARATIVE_ENTITY_TYPES.
 *      Cheap: uses already-known metadata.
 *      Factor: SWEET_SEARCH_DECLARATIVE_FACTOR (default 0.85).
 *
 *   2. Raw-string-dominant `impl` — the raw-string density of the text is
 *      above SWEET_SEARCH_RAWSTRING_THRESHOLD (default 0.50). Targets impls
 *      whose `doc_long()` is a large `r#"..."#` description
 *      (e.g. `impl Flag for SearchZip`).
 *      Factor: SWEET_SEARCH_RAWSTRING_FACTOR (default 0.85).
 *
 *   3. Stub `impl` — average fn body length is below
 *      SWEET_SEARCH_STUB_MAX_LINES (default 4.0). The default was chosen to
 *      catch ripgrep's CaseSensitive impl (avg body ≈ 2.6 incl. raw-string
 *      lines), SearchZip impl (avg body ≈ 3.8) and other clap-style flag-arg
 *      impls with mostly 1-line literal returns, while leaving real impls
 *      alone — Display/Iterator/Builder typically average ≥ 5 lines.
 *      Factor: SWEET_SEARCH_STUB_FACTOR (default 0.85).
 *
 * Triggers 2 and 3 only look at `impl` chunks longer than 200 characters;
 * they target a Rust idiom (clap-style flag-arg impls) that doesn't exist in
 * JS/TS/Go/Python. They may stack (~0.72 combined), which is still milder
 * than the existing doc/test demotion (0.35), so a genuine impl chunk that
 * wrongly trips one of them can recover via other signals.
 *
 * @param {object} result  search result under evaluation
 * @param {object} [opts]  may carry `intent`, `query`, and lookup caches
 * @returns {number} score multiplier; 1 means "unchanged"
 */
function bodyDensityMultiplier(result, opts = {}) {
  const killSwitch = process.env.SWEET_SEARCH_BODY_DENSITY;
  if (killSwitch === '0' || killSwitch === 'false') return 1;

  // Procedural-intent gate.
  const queryIntent = opts.intent || classifyFileKindIntent(opts.query || '');
  if (queryIntent !== 'implementation') return 1;

  // Trust the recorded type unless it is missing or one of the generic
  // placeholders; in that case fall back to the enclosing entity's type.
  const declaredType = normalizeType(resolveResultType(result));
  const isGeneric = !declaredType || declaredType === 'code' || declaredType === 'chunk';
  const entityType = isGeneric
    ? normalizeType(resolveEntityKindInfo(result, opts)?.type)
    : declaredType;

  let factor = 1;

  // Trigger 1: declarative entity types.
  if (DECLARATIVE_ENTITY_TYPES.has(entityType)) {
    factor *= envFloatRange('SWEET_SEARCH_DECLARATIVE_FACTOR', 0.85);
  }

  // Triggers 2 & 3 are text-derived and restricted to `impl` chunks.
  if (entityType !== 'impl') return factor;

  const body = resolveResultText(result, opts);
  if (!body || !(body.length > 200)) return factor;

  // Trigger 2: raw-string-dominant impl.
  const rawShare = rawStringDensity(body);
  if (rawShare > envFloatRange('SWEET_SEARCH_RAWSTRING_THRESHOLD', 0.50)) {
    factor *= envFloatRange('SWEET_SEARCH_RAWSTRING_FACTOR', 0.85);
  }

  // Trigger 3: stub impl (tiny average fn body).
  const meanBodyLines = avgFnBodyLines(body);
  if (meanBodyLines < envFloatRange('SWEET_SEARCH_STUB_MAX_LINES', 4.0)) {
    factor *= envFloatRange('SWEET_SEARCH_STUB_FACTOR', 0.85);
  }

  return factor;
}
|
|
1729
|
+
|
|
1730
|
+
// Reference-count boost (added 2026-05-05). Aider-style behavioural-graph
|
|
1731
|
+
// signal: chunks whose primary entity is invoked from many call sites get
|
|
1732
|
+
// a small log-scaled boost, capped low enough that it can't dominate
|
|
1733
|
+
// embedding scores.
|
|
1734
|
+
//
|
|
1735
|
+
// Why this matters. The bi-encoder ranks `lib/decorate.js`'s `decorate` fn
|
|
1736
|
+
// purely on text similarity, where doc-rich `.d.ts` namespace blocks or
|
|
1737
|
+
// generic helpers can outrank it. The call graph encodes that `decorate`
|
|
1738
|
+
// is invoked 41 times across the codebase while the namespace declaration
|
|
1739
|
+
// is referenced almost exclusively from imports (4 hits). That's a strong
|
|
1740
|
+
// behavioural signal: this entity is structurally important.
|
|
1741
|
+
//
|
|
1742
|
+
// Restrictions:
|
|
1743
|
+
// - Only fires on `function` / `method` / `impl` entities. Declarative
|
|
1744
|
+
// types are handled by T1 above and shouldn't compete on call count.
|
|
1745
|
+
// - Only fires under `intent='implementation'`. Asking "what is the
|
|
1746
|
+
// ConfigError type" should not promote a fn just because it's called
|
|
1747
|
+
// a lot.
|
|
1748
|
+
// - Counts `type='calls'` only — not `imports`/`uses`/`extends`. Imports
|
|
1749
|
+
// are noisy (every file imports a few standards) and don't reflect
|
|
1750
|
+
// behavioural invocation.
|
|
1751
|
+
// - Boost is `1 + alpha · ln(1 + count)` capped at REF_BOOST_CAP. With
// alpha=0.025 and cap=1.10, 30 calls yields ~1.086 and the cap is already
// reached at ~54 calls, so a heavily-tested helper can't run away with the ranking.
|
|
1754
|
+
// - Skipped on chunks larger than REF_BOOST_LARGE_LINES (default 80) to
|
|
1755
|
+
// avoid worsening Cluster B (oversized parent chunks like a 700-line
|
|
1756
|
+
// factory function whose graph degree is naturally high).
|
|
1757
|
+
//
|
|
1758
|
+
// Disable with `ablations: 'no-ref-count-boost'` or
|
|
1759
|
+
// SWEET_SEARCH_REF_BOOST_ALPHA=0. Suffix aggregation is homonym-gated in
|
|
1760
|
+
// CodeGraphRepository (`SWEET_SEARCH_REF_SUFFIX_AGG_FANOUT_MAX`, default 12).
|
|
1761
|
+
const REF_BOOSTABLE_TYPES = new Set(['function', 'method', 'impl']);

/**
 * Log-scaled boost for chunks whose primary entity has incoming calls.
 *
 * Returns 1 (no boost) unless every gate passes:
 *   - `refCounts` is non-empty and SWEET_SEARCH_REF_BOOST_ALPHA is not '0';
 *   - the query intent is 'implementation';
 *   - the chunk's type is one of REF_BOOSTABLE_TYPES;
 *   - the chunk, when its line range is known, spans no more than
 *     SWEET_SEARCH_REF_BOOST_LARGE_LINES lines (default 80);
 *   - the entity name is at least 3 characters and has a positive count.
 *
 * Otherwise the result is `min(cap, 1 + alpha * ln(1 + count))`, where alpha
 * comes from SWEET_SEARCH_REF_BOOST_ALPHA (default 0.025) and cap from
 * SWEET_SEARCH_REF_BOOST_CAP (default 1.10, accepted in [1.0, 1.5]).
 *
 * @param {object} result     search result under evaluation
 * @param {Map<string, number>} refCounts  entity name → incoming call count
 * @param {object} [opts]     may carry `intent`, `query`, and lookup caches
 * @returns {number} score multiplier; 1 means "unchanged"
 */
function referenceCountBoost(result, refCounts, opts = {}) {
  if (!refCounts || refCounts.size === 0) return 1;
  if (process.env.SWEET_SEARCH_REF_BOOST_ALPHA === '0') return 1;

  const intent = opts.intent || classifyFileKindIntent(opts.query || '');
  if (intent !== 'implementation') return 1;

  // Prefer the recorded type unless it is missing or a generic placeholder.
  const recordedType = normalizeType(resolveResultType(result));
  const inferredType = recordedType && recordedType !== 'code' && recordedType !== 'chunk'
    ? recordedType
    : normalizeType(resolveEntityKindInfo(result, opts)?.type);
  if (!REF_BOOSTABLE_TYPES.has(inferredType)) return 1;

  const meta = result?.metadata || {};
  const start = result?.startLine ?? meta.startLine;
  const end = result?.endLine ?? meta.endLine;
  if (Number.isFinite(start) && Number.isFinite(end)) {
    const lineCount = Math.max(1, end - start + 1);
    // An unparseable override used to become NaN, and `lineCount > NaN` is
    // always false — silently disabling the large-chunk guard. Fall back to
    // the default instead; every parseable value keeps its previous meaning.
    const parsedThresh = Number(process.env.SWEET_SEARCH_REF_BOOST_LARGE_LINES || 80);
    const largeThresh = Number.isNaN(parsedThresh) ? 80 : parsedThresh;
    if (lineCount > largeThresh) return 1;
  }

  const name = resolveResultName(result) || resolveEntityKindInfo(result, opts)?.name;
  if (!name || name.length < 3) return 1;

  const count = refCounts.get(name) || 0;
  if (count <= 0) return 1;

  const alpha = envFloatRange('SWEET_SEARCH_REF_BOOST_ALPHA', 0.025);
  const cap = (() => {
    const v = process.env.SWEET_SEARCH_REF_BOOST_CAP;
    if (v == null || v === '') return 1.10;
    const n = Number(v);
    // Out-of-range or unparseable caps fall back to the default.
    return Number.isFinite(n) && n >= 1.0 && n <= 1.5 ? n : 1.10;
  })();
  return Math.min(cap, 1 + alpha * Math.log(1 + count));
}
|
|
1801
|
+
|
|
1802
|
+
/**
 * Pre-compute incoming-call counts for ALL candidate names in one repository
 * query. Without this, the multiplier function would do one query per
 * candidate, which adds 100-200 ms in practice.
 *
 * Only names of results whose type is in REF_BOOSTABLE_TYPES and whose name
 * is at least 3 characters long are collected; each distinct name is looked
 * up once. An empty Map is returned when the repository lacks
 * `countIncomingCallsByNames`, the intent is not 'implementation',
 * SWEET_SEARCH_REF_BOOST_ALPHA is '0', no candidate qualifies, the homonym
 * gate trips, or the repository call throws.
 *
 * @param {Array<object>} results  candidate search results
 * @param {object} [opts]          must carry `codeGraphRepo` for any lookup
 * @returns {Map<string, number>}  entity name → incoming call count
 */
function buildRefCountMap(results, opts = {}) {
  const repo = opts.codeGraphRepo;
  if (!repo || typeof repo.countIncomingCallsByNames !== 'function') return new Map();
  if (results == null) return new Map();
  const intent = opts.intent || classifyFileKindIntent(opts.query || '');
  if (intent !== 'implementation') return new Map();
  if (process.env.SWEET_SEARCH_REF_BOOST_ALPHA === '0') return new Map();

  // A Set keeps first-seen order while dropping repeats, so the same symbol
  // appearing in several chunks is fanout-checked and queried only once.
  const uniqueNames = new Set();
  for (const r of results) {
    const recordedType = normalizeType(resolveResultType(r));
    const inferredType = recordedType && recordedType !== 'code' && recordedType !== 'chunk'
      ? recordedType
      : normalizeType(resolveEntityKindInfo(r, opts)?.type);
    if (!REF_BOOSTABLE_TYPES.has(inferredType)) continue;
    const name = resolveResultName(r) || resolveEntityKindInfo(r, opts)?.name;
    if (name && name.length >= 3) uniqueNames.add(name);
  }
  if (uniqueNames.size === 0) return new Map();
  const names = [...uniqueNames];

  try {
    // Default: skip ref-boost for the whole query when any boostable candidate
    // bare name has >12 distinct call-graph targets (dense single-fun corpora).
    // Opt out with SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE=0; tighten for
    // eval with =2..=8 (lifts GCSN, may trim monorepo boosts — see probes).
    const rawTh = process.env.SWEET_SEARCH_REF_BOOST_QUERY_HOMONYM_DISABLE;
    const parsed = Number.parseInt(rawTh != null && rawTh !== '' ? rawTh : '12', 10);
    const homonymCeil = rawTh === '0'
      ? Infinity
      : (Number.isFinite(parsed) && parsed > 0 ? parsed : 12);
    if (typeof repo.relationshipBareFanout === 'function'
      && homonymCeil < Infinity
      && names.some((n) => repo.relationshipBareFanout(n) > homonymCeil)) {
      return new Map();
    }
    const counts = repo.countIncomingCallsByNames(names);
    // The consumer calls `.size` and `.get`; anything that cannot answer
    // `.get` is treated as "no data" instead of failing later per result.
    return counts && typeof counts.get === 'function' ? counts : new Map();
  } catch {
    // Best-effort signal: a repository failure must not break ranking.
    return new Map();
  }
}
|
|
1843
|
+
|
|
1844
|
+
// Removed (2026-05-05): file-header chunk detection became redundant
|
|
1845
|
+
// once cAST sibling-merge was confirmed. With cAST, a chunk starting at
|
|
1846
|
+
// line 1 of a source file naturally merges the package decl + imports
|
|
1847
|
+
// with the first executable declaration(s), so a "lines 1-N: imports
|
|
1848
|
+
// only" chunk shouldn't normally win retrieval. Cases where it still
|
|
1849
|
+
// does are rare enough that the cost of the false-positive demotion
|
|
1850
|
+
// (e.g. a `types.go` consisting purely of type aliases) outweighs the
|
|
1851
|
+
// benefit. The per-doc `tinyAncillaryFactor` in applyFileKindRanking
|
|
1852
|
+
// still catches tiny doc/test/example top-1 results.
|
|
1853
|
+
|
|
1854
|
+
/**
|
|
1855
|
+
* Apply content-aware result demotions/boosts before top-k truncation.
|
|
1856
|
+
* Catches inline test functions and explicit entity-kind queries that
|
|
1857
|
+
* path-only demotion cannot see. Tiny-chunk and file-header rules were
|
|
1858
|
+
* removed once cAST sibling-merge made them structurally redundant.
|
|
1859
|
+
*/
|
|
1860
|
+
export function applyResultDemotions(results, opts = {}) {
|
|
1861
|
+
if (!Array.isArray(results) || results.length === 0) return results;
|
|
1862
|
+
|
|
1863
|
+
// Attach intra-call (and optionally cross-call) memoization for the three
|
|
1864
|
+
// hot lookups inside the demotion sub-rules:
|
|
1865
|
+
// - _entityKindCache : enclosing/contained entity from SQLite
|
|
1866
|
+
// - _entityNameCache : findEntityWithNameInRange (symbol-target adopt)
|
|
1867
|
+
// - _resultTextCache : readFileSync source span — biggest win, since
|
|
1868
|
+
// 5+ rules call resolveResultText per result and
|
|
1869
|
+
// each cache-miss fires a full readFileSync.
|
|
1870
|
+
// Caller may pass pre-allocated Maps via opts to share across both
|
|
1871
|
+
// applyResultDemotions calls in the same search() invocation.
|
|
1872
|
+
opts = {
|
|
1873
|
+
...opts,
|
|
1874
|
+
_entityKindCache: opts._entityKindCache instanceof Map ? opts._entityKindCache : new Map(),
|
|
1875
|
+
_entityNameCache: opts._entityNameCache instanceof Map ? opts._entityNameCache : new Map(),
|
|
1876
|
+
_resultTextCache: opts._resultTextCache instanceof Map ? opts._resultTextCache : new Map(),
|
|
1877
|
+
_fullFileTextCache: opts._fullFileTextCache instanceof Map ? opts._fullFileTextCache : new Map(),
|
|
1878
|
+
_isTestSupportCache: opts._isTestSupportCache instanceof Map ? opts._isTestSupportCache : new Map(),
|
|
1879
|
+
_isTestChunkCache: opts._isTestChunkCache instanceof Map ? opts._isTestChunkCache : new Map(),
|
|
1880
|
+
_fileKindCache: opts._fileKindCache instanceof Map ? opts._fileKindCache : new Map(),
|
|
1881
|
+
};
|
|
1882
|
+
|
|
1883
|
+
const ablations = opts.ablations;
|
|
1884
|
+
if (hasAblation(ablations, 'no-result-demotions')) return results;
|
|
1885
|
+
|
|
1886
|
+
const qTokens = queryTokenSet(opts.query || '', opts.queryTokens);
|
|
1887
|
+
const preferredKind = hasAblation(ablations, 'no-entity-kind-pref')
|
|
1888
|
+
? null
|
|
1889
|
+
: entityKindPreferenceFromQuery(opts.query || '');
|
|
1890
|
+
const nameHints = hasAblation(ablations, 'no-name-precision')
|
|
1891
|
+
? new Set()
|
|
1892
|
+
: extractNameHints(opts.query || '');
|
|
1893
|
+
const nameHintsLower = hasAblation(ablations, 'no-name-precision')
|
|
1894
|
+
? new Set()
|
|
1895
|
+
: new Set([...nameHints].map(s => s.toLowerCase()));
|
|
1896
|
+
|
|
1897
|
+
// Pre-compute incoming-call counts in a single batched query so the
|
|
1898
|
+
// per-result loop doesn't make N round trips to SQLite.
|
|
1899
|
+
const refCounts = !hasAblation(ablations, 'no-ref-count-boost')
|
|
1900
|
+
? buildRefCountMap(results, opts)
|
|
1901
|
+
: new Map();
|
|
1902
|
+
|
|
1903
|
+
// Symbol-exact-match target + path-token targets — extracted ONCE per
|
|
1904
|
+
// query (not per-result). BM25F SOTA pattern (Sourcegraph BM25F blog
|
|
1905
|
+
// April 2025, +20% on code search; Pérez-Iglesias et al. arXiv
|
|
1906
|
+
// 0911.5046; Robertson & Zaragoza 2009).
|
|
1907
|
+
//
|
|
1908
|
+
// CRITICAL — gated on opts.format === 'agent' (or env override) to
|
|
1909
|
+
// avoid −0.07pp regression on GCSN heldout MRR. GCSN-style NL queries
|
|
1910
|
+
// ("Sort an array of integers", "Find the index of an element") trip
|
|
1911
|
+
// the path-token "of X" pattern with non-path tokens like "integers"
|
|
1912
|
+
// / "ascending", and lightly poison ranking. The boosts are designed
|
|
1913
|
+
// for agent queries with explicit identifier/path hints ("show me X
|
|
1914
|
+
// struct", "in globset"), not for benchmark NL traffic. Probes use
|
|
1915
|
+
// format='agent', so their behaviour is preserved; GCSN bench uses
|
|
1916
|
+
// mode='auto' without format, so boosts are skipped — restoring the
|
|
1917
|
+
// 85.99% MRR heldout baseline.
|
|
1918
|
+
//
|
|
1919
|
+
// See docs/SOTA_RESEARCH_2026_FIXES.md for full rationale.
|
|
1920
|
+
const isAgentFormat = opts.format === 'agent'
|
|
1921
|
+
|| opts.format === 'agent_full'
|
|
1922
|
+
|| opts.format === 'agent_full_xl'
|
|
1923
|
+
|| opts.format === 'agent_preview'
|
|
1924
|
+
|| process.env.SWEET_SEARCH_FORCE_BM25F_BOOSTS === '1';
|
|
1925
|
+
const symbolExactTarget = isAgentFormat && !hasAblation(ablations, 'no-symbol-exact-boost')
|
|
1926
|
+
? extractSymbolDefinitionTarget(opts.query || '')
|
|
1927
|
+
: null;
|
|
1928
|
+
const pathTokens = isAgentFormat && !hasAblation(ablations, 'no-path-token-boost')
|
|
1929
|
+
? extractPathTokens(opts.query || '')
|
|
1930
|
+
: [];
|
|
1931
|
+
// Identifier-mention boost (complements verb-anchored symbolExactTarget):
|
|
1932
|
+
// fires on noun-anchored probe phrasings where the gold symbol appears in
|
|
1933
|
+
// the query without a "show me/find/where is" prefix. Format-gated; opts
|
|
1934
|
+
// passes through `_symbolExactTarget` so this boost skips mentions already
|
|
1935
|
+
// boosted by the higher-precision verb path.
|
|
1936
|
+
const identifierMentions = isAgentFormat && !hasAblation(ablations, 'no-identifier-mention-boost')
|
|
1937
|
+
? extractIdentifierMentions(opts.query || '')
|
|
1938
|
+
: null;
|
|
1939
|
+
|
|
1940
|
+
// F9 (2026-05-12): pre-compute query word tokens once for additional_symbols
|
|
1941
|
+
// re-anchoring (see findAdditionalSymbolRelabel docstring). Format-gated;
|
|
1942
|
+
// skipped entirely when isAgentFormat=false so GCSN benchmark traffic is
|
|
1943
|
+
// untouched (same gate as the other BM25F-family signals above).
|
|
1944
|
+
//
|
|
1945
|
+
// F10 (2026-05-12): the extraction regex preserves a leading `$` so
|
|
1946
|
+
// identifiers like $ZodType / $ZodTypeInternals (zod v4/core public-API
|
|
1947
|
+
// convention — the structural interfaces are $-prefixed while the runtime
|
|
1948
|
+
// classes are not) round-trip through tokenization. `\b[A-Za-z0-9_]+\b`
|
|
1949
|
+
// would silently strip the `$` (since `$` is non-word) and make
|
|
1950
|
+
// `$ZodType` indistinguishable from plain `ZodType`, costing TSL-004/8
|
|
1951
|
+
// (chunk relabel picked classic/ZodType over core/$ZodType because the
|
|
1952
|
+
// tier-A literal match was ambiguous).
|
|
1953
|
+
const f9QueryWordsArr = (isAgentFormat && !hasAblation(ablations, 'no-addsym-relabel'))
|
|
1954
|
+
? ((opts.query || '').match(/\$?[A-Za-z_][A-Za-z0-9_]*/g) || []).map(w => w.toLowerCase())
|
|
1955
|
+
: null;
|
|
1956
|
+
const f9QueryWordsSet = f9QueryWordsArr ? new Set(f9QueryWordsArr) : null;
|
|
1957
|
+
|
|
1958
|
+
let changed = false;
|
|
1959
|
+
const window = Math.min(opts.window ?? results.length, results.length);
|
|
1960
|
+
|
|
1961
|
+
// Per-rule timers — accumulator pattern, no object allocation per call.
|
|
1962
|
+
// No-op in production; only fires when profile-search-stages.mjs sets
|
|
1963
|
+
// globalThis.__stageTimings. Adds ~1ms overhead per call when profiling
|
|
1964
|
+
// (12 rules × 100 results × 2 performance.now() calls), acceptable for
|
|
1965
|
+
// the diagnostic.
|
|
1966
|
+
const __profOn = !!globalThis.__stageTimings;
|
|
1967
|
+
const __ruleTime = __profOn ? new Float64Array(12) : null;
|
|
1968
|
+
let __ruleT0 = 0;
|
|
1969
|
+
// Hoist loop-invariant work out of the per-result map():
|
|
1970
|
+
// - ruleOpts: a single spread reused across the 3 ruleOpts callsites
|
|
1971
|
+
// (anomalous, docComment, megaEntity). Original allocated 3 fresh
|
|
1972
|
+
// spreads per result (~15-20 keys each) × 100 results = 300 extra
|
|
1973
|
+
// objects per call.
|
|
1974
|
+
// - skip* flags: hasAblation() called once per result per rule otherwise.
|
|
1975
|
+
// - preferredKindKeywordSet: the kind→keywords list never changes during
|
|
1976
|
+
// the loop, but the original recomputed
|
|
1977
|
+
// `(ENTITY_KIND_KEYWORDS[preferredKind] || []).map(normalizeType)` per
|
|
1978
|
+
// result inside the entity-adoption gate.
|
|
1979
|
+
// Pre-resolve the envelope-cap once for ruleOpts. resolveMaxEnvelopeLines
|
|
1980
|
+
// does an env-var lookup + parseInt + default fallback; without this it
|
|
1981
|
+
// ran per result inside megaEntityPenalty.
|
|
1982
|
+
const ruleOpts = {
|
|
1983
|
+
...opts,
|
|
1984
|
+
ablations,
|
|
1985
|
+
_isAgentFormat: isAgentFormat,
|
|
1986
|
+
_megaEnvelopeMax: resolveMaxEnvelopeLines(opts),
|
|
1987
|
+
};
|
|
1988
|
+
const skipTestName = hasAblation(ablations, 'no-test-name-overlap');
|
|
1989
|
+
const skipBodyDensity = hasAblation(ablations, 'no-body-density');
|
|
1990
|
+
const skipMegaChunk = hasAblation(ablations, 'no-mega-chunk-penalty');
|
|
1991
|
+
const skipRefCount = hasAblation(ablations, 'no-ref-count-boost');
|
|
1992
|
+
const skipNamePrecision = hasAblation(ablations, 'no-name-precision');
|
|
1993
|
+
const skipEntityKindPref = hasAblation(ablations, 'no-entity-kind-pref');
|
|
1994
|
+
const testNameOverlapThreshold = opts.testNameOverlapThreshold ?? 0.5;
|
|
1995
|
+
const testNameOverlapFactor = opts.testNameOverlapFactor ?? 0.40;
|
|
1996
|
+
const preferredKindKeywordSet = preferredKind
|
|
1997
|
+
? new Set((ENTITY_KIND_KEYWORDS[preferredKind] || []).map(normalizeType))
|
|
1998
|
+
: null;
|
|
1999
|
+
|
|
2000
|
+
// For-loop with a pre-allocated array. The hot path here was a `.map()`
|
|
2001
|
+
// callback that always allocated a `details` array per result and a fresh
|
|
2002
|
+
// result spread `{ ...result, _resultDemotionOrigIndex: index }` even when
|
|
2003
|
+
// no rule fired. With ~100 results × 2 demotion sites that's hundreds of
|
|
2004
|
+
// empty arrays + light spreads per query for nothing. Lazy `details`
|
|
2005
|
+
// allocation skips the array when the result has zero rule hits;
|
|
2006
|
+
// unchanged-result spreads keep going through the same shape (the caller
|
|
2007
|
+
// expects new references — cascade scoring writes back r.score).
|
|
2008
|
+
const adjusted = new Array(window);
|
|
2009
|
+
for (let index = 0; index < window; index++) {
|
|
2010
|
+
const result = results[index];
|
|
2011
|
+
let mult = 1;
|
|
2012
|
+
let details = null;
|
|
2013
|
+
|
|
2014
|
+
if (!skipTestName) {
|
|
2015
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2016
|
+
if (isTestChunk(result, opts)) {
|
|
2017
|
+
const overlap = testNameQueryOverlap(result, qTokens);
|
|
2018
|
+
if (overlap >= testNameOverlapThreshold) {
|
|
2019
|
+
mult *= testNameOverlapFactor;
|
|
2020
|
+
(details ||= []).push('test-name:0.40');
|
|
2021
|
+
}
|
|
2022
|
+
}
|
|
2023
|
+
if (__profOn) __ruleTime[0] += performance.now() - __ruleT0;
|
|
2024
|
+
}
|
|
2025
|
+
|
|
2026
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2027
|
+
const kindMult = entityKindMultiplier(result, preferredKind, opts);
|
|
2028
|
+
if (__profOn) __ruleTime[1] += performance.now() - __ruleT0;
|
|
2029
|
+
if (kindMult !== 1) {
|
|
2030
|
+
mult *= kindMult;
|
|
2031
|
+
(details ||= []).push(`kind-pref:${kindMult.toFixed(2)}`);
|
|
2032
|
+
}
|
|
2033
|
+
|
|
2034
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2035
|
+
const nameMult = namePrecisionMultiplier(result, preferredKind, nameHintsLower, opts);
|
|
2036
|
+
if (__profOn) __ruleTime[2] += performance.now() - __ruleT0;
|
|
2037
|
+
if (nameMult !== 1) {
|
|
2038
|
+
mult *= nameMult;
|
|
2039
|
+
(details ||= []).push(`name-precision:${nameMult.toFixed(2)}`);
|
|
2040
|
+
}
|
|
2041
|
+
|
|
2042
|
+
if (!skipBodyDensity) {
|
|
2043
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2044
|
+
const bodyMult = bodyDensityMultiplier(result, opts);
|
|
2045
|
+
if (__profOn) __ruleTime[3] += performance.now() - __ruleT0;
|
|
2046
|
+
if (bodyMult !== 1) {
|
|
2047
|
+
mult *= bodyMult;
|
|
2048
|
+
(details ||= []).push(`body-density:${bodyMult.toFixed(2)}`);
|
|
2049
|
+
}
|
|
2050
|
+
}
|
|
2051
|
+
|
|
2052
|
+
if (!skipMegaChunk) {
|
|
2053
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2054
|
+
const megaMult = megaChunkSizePenalty(result, opts);
|
|
2055
|
+
if (__profOn) __ruleTime[4] += performance.now() - __ruleT0;
|
|
2056
|
+
if (megaMult !== 1) {
|
|
2057
|
+
mult *= megaMult;
|
|
2058
|
+
(details ||= []).push(`mega-chunk:${megaMult.toFixed(2)}`);
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
2061
|
+
|
|
2062
|
+
{
|
|
2063
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2064
|
+
const anomMult = anomalousChunkDemotion(result, ruleOpts);
|
|
2065
|
+
if (__profOn) __ruleTime[5] += performance.now() - __ruleT0;
|
|
2066
|
+
if (anomMult !== 1) {
|
|
2067
|
+
mult *= anomMult;
|
|
2068
|
+
(details ||= []).push(`anomalous-chunk:${anomMult.toFixed(2)}`);
|
|
2069
|
+
}
|
|
2070
|
+
}
|
|
2071
|
+
|
|
2072
|
+
{
|
|
2073
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2074
|
+
const docMult = docCommentOnlyDemotion(result, ruleOpts);
|
|
2075
|
+
if (__profOn) __ruleTime[6] += performance.now() - __ruleT0;
|
|
2076
|
+
if (docMult !== 1) {
|
|
2077
|
+
mult *= docMult;
|
|
2078
|
+
(details ||= []).push(`doc-comment-only:${docMult.toFixed(2)}`);
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
|
|
2082
|
+
{
|
|
2083
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2084
|
+
const entMult = megaEntityPenalty(result, ruleOpts);
|
|
2085
|
+
if (__profOn) __ruleTime[7] += performance.now() - __ruleT0;
|
|
2086
|
+
if (entMult !== 1) {
|
|
2087
|
+
mult *= entMult;
|
|
2088
|
+
(details ||= []).push(`mega-entity:${entMult.toFixed(2)}`);
|
|
2089
|
+
}
|
|
2090
|
+
}
|
|
2091
|
+
|
|
2092
|
+
|
|
2093
|
+
if (symbolExactTarget) {
|
|
2094
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2095
|
+
const symbolMult = symbolExactMatchBoost(result, symbolExactTarget, opts);
|
|
2096
|
+
if (__profOn) __ruleTime[8] += performance.now() - __ruleT0;
|
|
2097
|
+
if (symbolMult !== 1) {
|
|
2098
|
+
mult *= symbolMult;
|
|
2099
|
+
(details ||= []).push(`symbol-exact:${symbolMult.toFixed(2)}`);
|
|
2100
|
+
}
|
|
2101
|
+
}
|
|
2102
|
+
|
|
2103
|
+
if (identifierMentions) {
|
|
2104
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2105
|
+
const mentionMult = identifierMentionBoost(result, identifierMentions, {
|
|
2106
|
+
...opts,
|
|
2107
|
+
_symbolExactTarget: symbolExactTarget,
|
|
2108
|
+
});
|
|
2109
|
+
if (__profOn) __ruleTime[8] += performance.now() - __ruleT0;
|
|
2110
|
+
if (mentionMult !== 1) {
|
|
2111
|
+
mult *= mentionMult;
|
|
2112
|
+
(details ||= []).push(`identifier-mention:${mentionMult.toFixed(2)}`);
|
|
2113
|
+
}
|
|
2114
|
+
}
|
|
2115
|
+
|
|
2116
|
+
if (pathTokens.length > 0) {
|
|
2117
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2118
|
+
const pathMult = pathTokenBoost(result, pathTokens, opts);
|
|
2119
|
+
if (__profOn) __ruleTime[9] += performance.now() - __ruleT0;
|
|
2120
|
+
if (pathMult !== 1) {
|
|
2121
|
+
mult *= pathMult;
|
|
2122
|
+
(details ||= []).push(`path-token:${pathMult.toFixed(2)}`);
|
|
2123
|
+
}
|
|
2124
|
+
}
|
|
2125
|
+
|
|
2126
|
+
if (!skipRefCount) {
|
|
2127
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2128
|
+
const refMult = referenceCountBoost(result, refCounts, opts);
|
|
2129
|
+
if (__profOn) __ruleTime[10] += performance.now() - __ruleT0;
|
|
2130
|
+
if (refMult !== 1) {
|
|
2131
|
+
mult *= refMult;
|
|
2132
|
+
(details ||= []).push(`ref-count:${refMult.toFixed(2)}`);
|
|
2133
|
+
}
|
|
2134
|
+
}
|
|
2135
|
+
|
|
2136
|
+
const baseScore = typeof result.score === 'number' ? result.score : 0;
|
|
2137
|
+
if (__profOn) __ruleT0 = performance.now();
|
|
2138
|
+
// F8 (2026-05-07): when the query has an explicit symbol target (extractSymbolDefinitionTarget)
|
|
2139
|
+
// AND the chunk contains an entity matching that name, prefer THAT entity for labeling
|
|
2140
|
+
// over kind-preference / name-precision heuristics. Targets cases like S3-Q4 (chunk
|
|
2141
|
+
// labeled "Binding" but contains the Default function the user asked for) and parallels
|
|
2142
|
+
// F7's contained-entity boost (which only changes ranking, not symbol attribution).
|
|
2143
|
+
// Format-gated through symbolExactTarget which is set only when isAgentFormat.
|
|
2144
|
+
const exactSymbolTargetEntity = symbolExactTarget && opts.codeGraphRepo
|
|
2145
|
+
&& typeof opts.codeGraphRepo.findEntityWithNameInRange === 'function'
|
|
2146
|
+
? (() => {
|
|
2147
|
+
const fp = resolveFilePath(result);
|
|
2148
|
+
const meta = result?.metadata ?? {};
|
|
2149
|
+
const sl = Number(result?.startLine ?? meta.startLine);
|
|
2150
|
+
const el = Number(result?.endLine ?? meta.endLine);
|
|
2151
|
+
if (!fp || !Number.isFinite(sl) || !Number.isFinite(el)) return null;
|
|
2152
|
+
const cache = opts._entityNameCache;
|
|
2153
|
+
const cacheKey = cache ? `${fp}|${sl}|${el}|${symbolExactTarget}` : null;
|
|
2154
|
+
if (cacheKey && cache.has(cacheKey)) return cache.get(cacheKey);
|
|
2155
|
+
let resolved = null;
|
|
2156
|
+
try {
|
|
2157
|
+
resolved = opts.codeGraphRepo.findEntityWithNameInRange(fp, sl, el, symbolExactTarget);
|
|
2158
|
+
} catch { resolved = null; }
|
|
2159
|
+
if (cacheKey) cache.set(cacheKey, resolved);
|
|
2160
|
+
return resolved;
|
|
2161
|
+
})()
|
|
2162
|
+
: null;
|
|
2163
|
+
// F9 (2026-05-12): when F8 (verb-anchored explicit target) did not fire,
|
|
2164
|
+
// try additional_symbols re-anchoring. JS/TS/TSX/JSX-gated inside the
|
|
2165
|
+
// helper. Pure label adoption — no score change. See helper docstring.
|
|
2166
|
+
const additionalSymbolRelabelEntity = !exactSymbolTargetEntity
|
|
2167
|
+
&& f9QueryWordsArr
|
|
2168
|
+
? findAdditionalSymbolRelabel(result, f9QueryWordsArr, f9QueryWordsSet, opts)
|
|
2169
|
+
: null;
|
|
2170
|
+
const exactEntity = exactSymbolTargetEntity
|
|
2171
|
+
|| additionalSymbolRelabelEntity
|
|
2172
|
+
|| (!skipNamePrecision
|
|
2173
|
+
? exactNamedEntityForResult(result, preferredKind, nameHints, nameHintsLower, opts)
|
|
2174
|
+
: null);
|
|
2175
|
+
const preferredEntity = exactEntity || (preferredKind && !skipEntityKindPref
|
|
2176
|
+
? resolveEntityKindInfo(result, opts)
|
|
2177
|
+
: null);
|
|
2178
|
+
const preferredType = normalizeType(preferredEntity?.type);
|
|
2179
|
+
// F8 (continued): when the chunk contains an entity matching the explicit
|
|
2180
|
+
// symbol target (function name from "show me X function" queries), bypass
|
|
2181
|
+
// the kind-keyword gate. Functions/methods aren't in ENTITY_KIND_KEYWORDS
|
|
2182
|
+
// (which is struct/enum/class/interface/trait/type), so without bypass the
|
|
2183
|
+
// relabel path was gated off for "show me X function" queries — defeating
|
|
2184
|
+
// the purpose of having SYMBOL_DEFN_QUERY_RE recognise "function".
|
|
2185
|
+
const shouldAdoptViaExactTarget = !!(exactSymbolTargetEntity
|
|
2186
|
+
&& exactSymbolTargetEntity.name
|
|
2187
|
+
&& exactSymbolTargetEntity.startLine
|
|
2188
|
+
&& exactSymbolTargetEntity.endLine);
|
|
2189
|
+
// F9: noun-anchored addSym path also bypasses the kind-keyword gate.
|
|
2190
|
+
// Variables/typeAliases captured as additional_symbols (TS-006/8 scope)
|
|
2191
|
+
// would otherwise fail the gate (variable/typeAlias not in keyword set).
|
|
2192
|
+
const shouldAdoptViaAddSym = !!(additionalSymbolRelabelEntity
|
|
2193
|
+
&& additionalSymbolRelabelEntity.name
|
|
2194
|
+
&& additionalSymbolRelabelEntity.startLine
|
|
2195
|
+
&& additionalSymbolRelabelEntity.endLine);
|
|
2196
|
+
const shouldAdoptEntity = shouldAdoptViaExactTarget || shouldAdoptViaAddSym || !!(preferredEntity?.startLine
|
|
2197
|
+
&& preferredEntity?.endLine
|
|
2198
|
+
&& preferredKindKeywordSet && preferredKindKeywordSet.has(preferredType));
|
|
2199
|
+
const containedEntity = !shouldAdoptEntity && opts.codeGraphRepo && typeof opts.codeGraphRepo.findFirstEntityInRange === 'function'
|
|
2200
|
+
? resolveEntityKindInfo(result, opts)
|
|
2201
|
+
: null;
|
|
2202
|
+
const shouldAdoptContained = !!(containedEntity?.name && containedEntity?.startLine && containedEntity?.endLine);
|
|
2203
|
+
const entityToAdopt = shouldAdoptEntity ? preferredEntity : shouldAdoptContained ? containedEntity : null;
|
|
2204
|
+
if (__profOn) __ruleTime[11] += performance.now() - __ruleT0;
|
|
2205
|
+
if (mult === 1 && !entityToAdopt) {
|
|
2206
|
+
// Unchanged: shallow copy preserves the caller-expected new-reference
|
|
2207
|
+
// shape (downstream cascade scoring writes back r.score) without the
|
|
2208
|
+
// redundant _resultDemotionOrigIndex field — V8 Array.sort is stable
|
|
2209
|
+
// since ES2019, so the in-place index-order tie-break is implicit.
|
|
2210
|
+
adjusted[index] = { ...result };
|
|
2211
|
+
continue;
|
|
2212
|
+
}
|
|
2213
|
+
changed = true;
|
|
2214
|
+
// Range-preservation invariant: adopting an entity is a *labeling*
|
|
2215
|
+
// operation (it tells the caller what symbol the chunk is about); it
|
|
2216
|
+
// must not SHRINK a well-formed retrieval chunk to a per-symbol entity
|
|
2217
|
+
// boundary. The cAST/sibling-merged chunk is the right unit for the
|
|
2218
|
+
// agent to read; the entity name + type are added as annotations.
|
|
2219
|
+
//
|
|
2220
|
+
// Concretely: a Go file's bsonBinding has a 1-line typeAlias entity
|
|
2221
|
+
// at line 14, but the LI chunk is lines 1-31 (typeAlias + 3 methods,
|
|
2222
|
+
// all merged by cAST). Adopting the entity's range used to drop 30
|
|
2223
|
+
// lines of content; now we keep the chunk range and just adopt the
|
|
2224
|
+
// name/type as labels. Range adoption only fires when the entity
|
|
2225
|
+
// is at least as large as the chunk (e.g. expanding a partial
|
|
2226
|
+
// chunk to its enclosing symbol — which is the legitimate use case).
|
|
2227
|
+
const chunkStart = result.metadata?.startLine ?? result.startLine ?? null;
|
|
2228
|
+
const chunkEnd = result.metadata?.endLine ?? result.endLine ?? null;
|
|
2229
|
+
const chunkRange = (chunkStart != null && chunkEnd != null)
|
|
2230
|
+
? Math.max(0, chunkEnd - chunkStart + 1) : 0;
|
|
2231
|
+
const entityRange = entityToAdopt
|
|
2232
|
+
? Math.max(0, (entityToAdopt.endLine || 0) - (entityToAdopt.startLine || 0) + 1) : 0;
|
|
2233
|
+
const adoptRange = !!entityToAdopt && entityRange >= chunkRange;
|
|
2234
|
+
const adoptedFile = entityToAdopt
|
|
2235
|
+
? (entityToAdopt.file || entityToAdopt.filePath || resolveFilePath(result))
|
|
2236
|
+
: null;
|
|
2237
|
+
const baseMetadata = result.metadata || {};
|
|
2238
|
+
const nextMetadata = entityToAdopt
|
|
2239
|
+
? {
|
|
2240
|
+
...baseMetadata,
|
|
2241
|
+
...(shouldAdoptEntity
|
|
2242
|
+
? { name: entityToAdopt.name || baseMetadata.name || result.name || null }
|
|
2243
|
+
: { name: entityToAdopt.name }),
|
|
2244
|
+
type: entityToAdopt.type,
|
|
2245
|
+
...(adoptRange ? {
|
|
2246
|
+
file: adoptedFile,
|
|
2247
|
+
startLine: entityToAdopt.startLine,
|
|
2248
|
+
endLine: entityToAdopt.endLine,
|
|
2249
|
+
} : {}),
|
|
2250
|
+
}
|
|
2251
|
+
: baseMetadata;
|
|
2252
|
+
adjusted[index] = {
|
|
2253
|
+
...result,
|
|
2254
|
+
...(entityToAdopt ? {
|
|
2255
|
+
name: shouldAdoptEntity
|
|
2256
|
+
? (entityToAdopt.name || result.name)
|
|
2257
|
+
: entityToAdopt.name,
|
|
2258
|
+
type: entityToAdopt.type,
|
|
2259
|
+
...(adoptRange ? {
|
|
2260
|
+
startLine: entityToAdopt.startLine,
|
|
2261
|
+
endLine: entityToAdopt.endLine,
|
|
2262
|
+
} : {}),
|
|
2263
|
+
} : {}),
|
|
2264
|
+
...(nextMetadata ? { metadata: nextMetadata } : {}),
|
|
2265
|
+
score: baseScore * mult,
|
|
2266
|
+
_resultDemotionOrigScore: baseScore,
|
|
2267
|
+
_resultDemotionMult: mult,
|
|
2268
|
+
_resultDemotionDetails: details ?? [],
|
|
2269
|
+
};
|
|
2270
|
+
}
|
|
2271
|
+
|
|
2272
|
+
// Dump per-rule timings to globalThis.__stageTimings (set by the profiler).
|
|
2273
|
+
// No-op in production. Labels mirror the rule names so the profiler's flat
|
|
2274
|
+
// table reads cleanly.
|
|
2275
|
+
if (__profOn && __ruleTime) {
|
|
2276
|
+
const labels = [
|
|
2277
|
+
'rule:testName', 'rule:entityKind', 'rule:namePrec', 'rule:body',
|
|
2278
|
+
'rule:megaChunk', 'rule:anomalous', 'rule:docComment', 'rule:megaEntity',
|
|
2279
|
+
'rule:symbolExact', 'rule:pathToken', 'rule:refCount', 'rule:adoptEntity',
|
|
2280
|
+
];
|
|
2281
|
+
const buf = globalThis.__stageTimings;
|
|
2282
|
+
for (let i = 0; i < labels.length; i++) {
|
|
2283
|
+
(buf[labels[i]] = buf[labels[i]] || []).push(__ruleTime[i]);
|
|
2284
|
+
}
|
|
2285
|
+
}
|
|
2286
|
+
|
|
2287
|
+
if (!changed) return results;
|
|
2288
|
+
|
|
2289
|
+
// V8 Array.sort is stable (ES2019) — same-score results retain their
|
|
2290
|
+
// original-window order without needing the explicit _origIndex tiebreak
|
|
2291
|
+
// the prior implementation carried.
|
|
2292
|
+
adjusted.sort((a, b) => (b.score || 0) - (a.score || 0));
|
|
2293
|
+
return window === results.length ? adjusted : adjusted.concat(results.slice(window));
|
|
2294
|
+
}
|
|
2295
|
+
|
|
144
2296
|
function envOff() {
|
|
145
2297
|
return process.env.SWEET_SEARCH_FILE_KIND_RANKING === '0'
|
|
146
2298
|
|| process.env.SWEET_SEARCH_FILE_KIND_RANKING === 'false';
|
|
@@ -169,7 +2321,7 @@ const DEFAULT_WINDOW = 30;
|
|
|
169
2321
|
*
|
|
170
2322
|
* Demotion fires only when:
|
|
171
2323
|
* - intent === 'implementation' (confident, NOT 'unknown'), AND
|
|
172
|
-
* - the top-N window contains at least one
|
|
2324
|
+
* - the top-N window contains at least one demotable candidate, AND
|
|
173
2325
|
* - the top-N window contains at least one implementation candidate.
|
|
174
2326
|
*
|
|
175
2327
|
* In every other case the original `results` array is returned unchanged
|
|
@@ -182,11 +2334,15 @@ const DEFAULT_WINDOW = 30;
|
|
|
182
2334
|
* @param {Object} [opts]
|
|
183
2335
|
* @param {string} [opts.query] - raw query (used to infer intent
|
|
184
2336
|
* if opts.intent isn't supplied)
|
|
185
|
-
* @param {'docs'|'tests'|'types'|'implementation'|'unknown'} [opts.intent]
|
|
2337
|
+
* @param {'docs'|'tests'|'types'|'ancillary'|'implementation'|'unknown'} [opts.intent]
|
|
186
2338
|
* - explicit intent override
|
|
187
2339
|
* @param {number} [opts.docFactor] - default from env / 0.85
|
|
2340
|
+
* @param {number} [opts.exampleFactor] - default from docFactor
|
|
188
2341
|
* @param {number} [opts.testFactor] - default from env / 0.85
|
|
189
2342
|
* @param {number} [opts.typeFactor] - default from env / 0.85
|
|
2343
|
+
* @param {number} [opts.ancillaryFactor] - default from env / 0.85
|
|
2344
|
+
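* @param {number} [opts.tinyAncillaryFactor] - default from ancillaryFactor; ancillary results at or below tinyLineThreshold lines use min(ancillaryFactor, tinyAncillaryFactor)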
|
|
2345
|
+
* @param {number} [opts.tinyLineThreshold] - default 3; an ancillary result counts as tiny when its inferred line count is <= this value
|
|
190
2346
|
* @param {number} [opts.window] - top-N window for analysis +
|
|
191
2347
|
* bounded re-sort (default 30)
|
|
192
2348
|
* @returns {Array} either the original `results` (no-op) or a new array
|
|
@@ -209,14 +2365,22 @@ export function applyFileKindRanking(results, opts = {}) {
|
|
|
209
2365
|
: envWindow('SWEET_SEARCH_FILE_KIND_WINDOW', DEFAULT_WINDOW);
|
|
210
2366
|
const windowSize = Math.min(window, results.length);
|
|
211
2367
|
|
|
2368
|
+
// Per-call file-kind cache: detectFileKind is invoked for every result
|
|
2369
|
+
// here AND inside isTestChunk → the same file path can be classified
|
|
2370
|
+
// many times in one applyFileKindRanking + applyResultDemotions pass.
|
|
2371
|
+
// Caller may pass opts._fileKindCache to share with the demotion sites.
|
|
2372
|
+
const fileKindOpts = opts._fileKindCache instanceof Map
|
|
2373
|
+
? opts
|
|
2374
|
+
: { ...opts, _fileKindCache: new Map() };
|
|
2375
|
+
|
|
212
2376
|
// Walk the window once: classify kinds and check for competition.
|
|
213
2377
|
const kinds = new Array(windowSize);
|
|
214
2378
|
let demotableCount = 0;
|
|
215
2379
|
let implCount = 0;
|
|
216
2380
|
for (let i = 0; i < windowSize; i++) {
|
|
217
|
-
const k = detectFileKind(resolveFilePath(results[i]));
|
|
2381
|
+
const k = detectFileKind(resolveFilePath(results[i]), fileKindOpts);
|
|
218
2382
|
kinds[i] = k;
|
|
219
|
-
if (k === 'docs' || k === 'tests' || k === 'types') demotableCount++;
|
|
2383
|
+
if (k === 'docs' || k === 'examples' || k === 'tests' || k === 'types' || k === 'ancillary') demotableCount++;
|
|
220
2384
|
else if (k === 'implementation') implCount++;
|
|
221
2385
|
}
|
|
222
2386
|
|
|
@@ -225,8 +2389,14 @@ export function applyFileKindRanking(results, opts = {}) {
|
|
|
225
2389
|
|
|
226
2390
|
const factor = envFactor('SWEET_SEARCH_FILE_KIND_FACTOR', DEFAULT_FACTOR);
|
|
227
2391
|
const docFactor = opts.docFactor != null ? opts.docFactor : factor;
|
|
2392
|
+
const exampleFactor = opts.exampleFactor != null ? opts.exampleFactor : docFactor;
|
|
228
2393
|
const testFactor = opts.testFactor != null ? opts.testFactor : factor;
|
|
229
2394
|
const typeFactor = opts.typeFactor != null ? opts.typeFactor : factor;
|
|
2395
|
+
const ancillaryFactor = opts.ancillaryFactor != null ? opts.ancillaryFactor : factor;
|
|
2396
|
+
const tinyAncillaryFactor = opts.tinyAncillaryFactor != null
|
|
2397
|
+
? opts.tinyAncillaryFactor
|
|
2398
|
+
: ancillaryFactor;
|
|
2399
|
+
const tinyLineThreshold = opts.tinyLineThreshold != null ? opts.tinyLineThreshold : 3;
|
|
230
2400
|
|
|
231
2401
|
const reranked = new Array(windowSize);
|
|
232
2402
|
for (let i = 0; i < windowSize; i++) {
|
|
@@ -234,8 +2404,15 @@ export function applyFileKindRanking(results, opts = {}) {
|
|
|
234
2404
|
const kind = kinds[i];
|
|
235
2405
|
let mult = 1;
|
|
236
2406
|
if (kind === 'docs') mult = docFactor;
|
|
2407
|
+
else if (kind === 'examples') mult = exampleFactor;
|
|
237
2408
|
else if (kind === 'tests') mult = testFactor;
|
|
238
2409
|
else if (kind === 'types') mult = typeFactor;
|
|
2410
|
+
else if (kind === 'ancillary') {
|
|
2411
|
+
const lineCount = inferLineCount(r);
|
|
2412
|
+
mult = lineCount <= tinyLineThreshold
|
|
2413
|
+
? Math.min(ancillaryFactor, tinyAncillaryFactor)
|
|
2414
|
+
: ancillaryFactor;
|
|
2415
|
+
}
|
|
239
2416
|
const baseScore = (typeof r.score === 'number') ? r.score : 0;
|
|
240
2417
|
reranked[i] = {
|
|
241
2418
|
...r,
|