sweet-search 2.4.2 → 2.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +43 -5
- package/core/embedding/embedding-cache.js +266 -18
- package/core/embedding/embedding-service.js +45 -9
- package/core/graph/graph-expansion.js +52 -12
- package/core/graph/graph-extractor.js +30 -1
- package/core/indexing/ast-chunker.js +331 -16
- package/core/indexing/chunking/chunk-builder.js +34 -1
- package/core/indexing/index-codebase-v21.js +31 -2
- package/core/indexing/index.js +6 -3
- package/core/indexing/indexer-ann.js +45 -6
- package/core/indexing/indexer-build.js +9 -1
- package/core/indexing/indexer-phases.js +6 -4
- package/core/indexing/indexing-file-policy.js +140 -0
- package/core/indexing/li-skip-policy.js +11 -220
- package/core/infrastructure/codebase-repository.js +21 -0
- package/core/infrastructure/config/embedding.js +20 -1
- package/core/infrastructure/config/graph.js +2 -2
- package/core/infrastructure/config/ranking.js +10 -0
- package/core/infrastructure/config/vector-store.js +1 -1
- package/core/infrastructure/coreml-cascade.js +236 -30
- package/core/infrastructure/coreml-cascade.json +25 -0
- package/core/infrastructure/index.js +17 -0
- package/core/infrastructure/init-config.js +216 -0
- package/core/infrastructure/language-patterns/registry-core.js +18 -0
- package/core/infrastructure/model-registry.js +12 -0
- package/core/infrastructure/native-inference.js +143 -51
- package/core/infrastructure/tree-sitter-provider.js +92 -2
- package/core/ranking/cascaded-scorer.js +6 -2
- package/core/ranking/file-kind-ranking.js +264 -0
- package/core/ranking/late-interaction-index.js +10 -4
- package/core/ranking/late-interaction-policy.js +304 -0
- package/core/search/context-expander.js +267 -28
- package/core/search/index.js +4 -0
- package/core/search/search-cli.js +3 -1
- package/core/search/search-pattern.js +4 -3
- package/core/search/search-postprocess.js +189 -8
- package/core/search/search-read-semantic.js +734 -0
- package/core/search/search-read.js +481 -0
- package/core/search/search-server.js +153 -5
- package/core/search/sweet-search.js +133 -16
- package/core/start-server.js +13 -2
- package/mcp/server.js +41 -0
- package/mcp/tool-handlers.js +117 -6
- package/package.json +9 -7
- package/scripts/init.js +386 -5
- package/scripts/uninstall.js +152 -6
|
@@ -69,6 +69,28 @@ const BOUNDARY_TYPES = new Set([
|
|
|
69
69
|
'class_specifier', 'namespace_definition',
|
|
70
70
|
]);
|
|
71
71
|
|
|
72
|
+
// AST node types that represent function/class bodies. Used by
// _extractSignature() to find where the declaration's body starts so
// the signature span is everything before it (decorators + name +
// parameters + return type, excluding body).
//
// Membership is checked both for the node's `body` field and, as a
// fallback, for each direct child of a boundary node.
const BODY_TYPES = new Set([
  // JS/TS, Java, Go, Rust, Kotlin, Swift, C#, Ruby (sometimes).
  // Python bodies are also `block`; there the `:` sits before the body,
  // so it presumably ends up inside the extracted signature — TODO confirm.
  'block', 'statement_block', 'class_body', 'function_body',
  // C / C++ — `compound_statement` is the function body.
  // NOTE(review): `field_declaration_list` looks like the struct/class
  // member list rather than a function body — confirm against the grammar.
  'compound_statement', 'field_declaration_list',
  // PHP — function/method body.
  // NOTE(review): verify this node type name exists in the PHP grammar in use.
  'compound_statement_php',
  // Swift / Kotlin — sometimes labelled differently
  'enum_class_body', 'enum_body', 'interface_body',
  // Rust impl/trait bodies
  'declaration_list',
]);

// Maximum signature length (chars) after whitespace normalization.
// Signatures longer than this are cut to MAX_SIGNATURE_LENGTH - 1 chars
// and suffixed with `…`, so the result never exceeds this length.
const MAX_SIGNATURE_LENGTH = 200;
|
|
93
|
+
|
|
72
94
|
// Map tree-sitter node type -> our chunk type label
|
|
73
95
|
const NODE_TYPE_MAP = {
|
|
74
96
|
'function_declaration': 'function',
|
|
@@ -410,12 +432,23 @@ export class TreeSitterProvider {
|
|
|
410
432
|
/**
|
|
411
433
|
* Parse file content into semantic chunks using the cAST recursive algorithm.
|
|
412
434
|
* Returns array of chunk objects or null if tree-sitter can't handle it.
|
|
435
|
+
*
|
|
436
|
+
* Header-aware budget (research-only ablation, May 2026): set
|
|
437
|
+
* SWEET_SEARCH_CHUNK_HEADER_OVERHEAD=N to subtract N chars from the
|
|
438
|
+
* cAST max chunk size, leaving room for the embedding-text headers
|
|
439
|
+
* (path / parent / symbol / language ≈ 50–100 chars) without spilling
|
|
440
|
+
* past the embedding cap. Default 0 = byte-identical to shipped. The
|
|
441
|
+
* audit motivating this lever lives in eval/results/chunk-overflow-audit.md.
|
|
413
442
|
*/
|
|
414
443
|
async parseFileToChunks(content, languageId, options = {}) {
|
|
415
444
|
const tree = await this.parse(content, languageId);
|
|
416
445
|
if (!tree) return null;
|
|
417
446
|
|
|
418
|
-
const
|
|
447
|
+
const headerOverhead = (() => {
|
|
448
|
+
const v = parseInt(process.env.SWEET_SEARCH_CHUNK_HEADER_OVERHEAD || '', 10);
|
|
449
|
+
return Number.isFinite(v) && v >= 0 ? v : 0;
|
|
450
|
+
})();
|
|
451
|
+
const maxChunkSize = (options.maxChunkSize || 2000) - headerOverhead;
|
|
419
452
|
this._chunkCounter = 0;
|
|
420
453
|
|
|
421
454
|
const children = this._getChildren(tree.rootNode);
|
|
@@ -467,6 +500,7 @@ export class TreeSitterProvider {
|
|
|
467
500
|
const firstBoundary = buffer.find(n => BOUNDARY_TYPES.has(n.type));
|
|
468
501
|
const name = firstBoundary ? this._extractNodeName(firstBoundary) : null;
|
|
469
502
|
const type = firstBoundary ? (NODE_TYPE_MAP[firstBoundary.type] || 'code') : 'code';
|
|
503
|
+
const signature = firstBoundary ? this._extractSignature(firstBoundary, content) : null;
|
|
470
504
|
|
|
471
505
|
chunks.push({
|
|
472
506
|
chunkId: this._nextChunkId(),
|
|
@@ -478,6 +512,7 @@ export class TreeSitterProvider {
|
|
|
478
512
|
endLine: buffer[buffer.length - 1].endPosition.row,
|
|
479
513
|
type,
|
|
480
514
|
name: name || (buffer.length === 1 ? null : null),
|
|
515
|
+
signature,
|
|
481
516
|
});
|
|
482
517
|
}
|
|
483
518
|
buffer = [];
|
|
@@ -536,6 +571,7 @@ export class TreeSitterProvider {
|
|
|
536
571
|
endLine: node.endPosition.row,
|
|
537
572
|
type: NODE_TYPE_MAP[node.type] || 'code',
|
|
538
573
|
name: this._extractNodeName(node),
|
|
574
|
+
signature: this._extractSignature(node, content),
|
|
539
575
|
});
|
|
540
576
|
}
|
|
541
577
|
}
|
|
@@ -546,6 +582,60 @@ export class TreeSitterProvider {
|
|
|
546
582
|
return chunks;
|
|
547
583
|
}
|
|
548
584
|
|
|
585
|
+
/**
|
|
586
|
+
* Extract a compact, single-line signature for a boundary AST node.
|
|
587
|
+
*
|
|
588
|
+
* Strategy: find the first body-like child (block / statement_block /
|
|
589
|
+
* compound_statement / class_body / declaration_list / …), and return
|
|
590
|
+
* the source span [node.startIndex, body.startIndex) with whitespace
|
|
591
|
+
* normalized to single spaces. If no body child is found (e.g.
|
|
592
|
+
* declarations without a body, abstract methods, interface members),
|
|
593
|
+
* return the full first line of the node.
|
|
594
|
+
*
|
|
595
|
+
* Returns null when the node has no children to inspect.
|
|
596
|
+
*
|
|
597
|
+
* Used by the `signature` R1 embedding-text variant. Intentionally
|
|
598
|
+
* does NOT alter `text`, `li_text`, or `li_greedy_text` — signature
|
|
599
|
+
* surface is research-only on `embedding_text`.
|
|
600
|
+
*/
|
|
601
|
+
_extractSignature(node, content) {
|
|
602
|
+
if (!node || !content) return null;
|
|
603
|
+
if (!BOUNDARY_TYPES.has(node.type)) return null;
|
|
604
|
+
|
|
605
|
+
let bodyStart = null;
|
|
606
|
+
// Try field-name lookup first (works for most modern grammars).
|
|
607
|
+
const bodyField = node.childForFieldName?.('body');
|
|
608
|
+
if (bodyField && BODY_TYPES.has(bodyField.type)) {
|
|
609
|
+
bodyStart = bodyField.startIndex;
|
|
610
|
+
} else {
|
|
611
|
+
// Fall back to scanning children for a body-shaped child.
|
|
612
|
+
for (let i = 0; i < node.childCount; i++) {
|
|
613
|
+
const child = node.child(i);
|
|
614
|
+
if (BODY_TYPES.has(child.type)) {
|
|
615
|
+
bodyStart = child.startIndex;
|
|
616
|
+
break;
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
let raw;
|
|
622
|
+
if (bodyStart != null && bodyStart > node.startIndex) {
|
|
623
|
+
raw = content.substring(node.startIndex, bodyStart);
|
|
624
|
+
} else {
|
|
625
|
+
// No body found — declaration only (e.g. abstract method, type
|
|
626
|
+
// alias). Take the whole node text.
|
|
627
|
+
raw = content.substring(node.startIndex, node.endIndex);
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
// Normalize: collapse runs of whitespace (including newlines) to a
|
|
631
|
+
// single space, drop leading/trailing whitespace.
|
|
632
|
+
const normalized = raw.replace(/\s+/g, ' ').trim();
|
|
633
|
+
if (!normalized) return null;
|
|
634
|
+
|
|
635
|
+
if (normalized.length <= MAX_SIGNATURE_LENGTH) return normalized;
|
|
636
|
+
return normalized.slice(0, MAX_SIGNATURE_LENGTH - 1) + '…';
|
|
637
|
+
}
|
|
638
|
+
|
|
549
639
|
/** Extract symbol name from an AST node */
|
|
550
640
|
_extractNodeName(node) {
|
|
551
641
|
// Try field name first (most reliable)
|
|
@@ -662,4 +752,4 @@ export function resetTreeSitterProvider() {
|
|
|
662
752
|
}
|
|
663
753
|
|
|
664
754
|
// Re-export constants for testing
|
|
665
|
-
export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
|
|
755
|
+
export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, BODY_TYPES, MAX_SIGNATURE_LENGTH, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
|
|
@@ -121,11 +121,15 @@ function partitionByTokenAvailability(candidates, liIndex) {
|
|
|
121
121
|
if (!liIndex) {
|
|
122
122
|
return { withTokens: [], withoutTokens: [...candidates] };
|
|
123
123
|
}
|
|
124
|
-
|
|
124
|
+
// Graph-expanded candidates have entity_id-based public ids that don't
|
|
125
|
+
// match LI-indexed chunk ids; they carry the resolved chunk id under
|
|
126
|
+
// _liChunkId. Honour it so expanded candidates can participate in MaxSim.
|
|
127
|
+
const lookupId = (c) => c._liChunkId || c.id || c.entity_id;
|
|
128
|
+
const available = liIndex.hasTokens(candidates.map(lookupId));
|
|
125
129
|
const withTokens = [];
|
|
126
130
|
const withoutTokens = [];
|
|
127
131
|
for (const c of candidates) {
|
|
128
|
-
(available.has(c
|
|
132
|
+
(available.has(lookupId(c)) ? withTokens : withoutTokens).push(c);
|
|
129
133
|
}
|
|
130
134
|
return { withTokens, withoutTokens };
|
|
131
135
|
}
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Intent-aware file-kind ranking (conservative variant).
|
|
3
|
+
*
|
|
4
|
+
* Background: real-codebase miss analysis found that documentation, test, and
|
|
5
|
+
* TypeScript-declaration files often outrank the implementation file users
|
|
6
|
+
* were actually looking for on multi-file codebases. The first version of
|
|
7
|
+
* this rule (commit f6fcfd1) lifted graph-2hop R@1 from 47.46 % → 64.41 %
|
|
8
|
+
* but catastrophically regressed GenCodeSearchNet under the dense profile
|
|
9
|
+
* (full-6 000 dense run: MRR@10 84.4 % → 47.4 %, Recall@5 92.0 % → 48.4 %).
|
|
10
|
+
* Root cause: the legacy LI-rerank pipeline assembles
|
|
11
|
+
* `results = [...liScored, ...tail]`, where `liScored` carries MaxSim
|
|
12
|
+
* scores that are sometimes *lower* (in absolute value) than the int8
|
|
13
|
+
* cosine scores already on the un-reranked tail. The concatenated list is
|
|
14
|
+
* therefore not globally score-monotonic. The old helper unconditionally
|
|
15
|
+
* spread and re-sorted *all* results by `score`, which floated the
|
|
16
|
+
* int8-only tail above the LI-reranked head and undid the rerank — even
|
|
17
|
+
* when every multiplier was 1 (GenCodeSearchNet is a single-source
|
|
18
|
+
* corpus, so no docs/tests/types kind ever matches there).
|
|
19
|
+
*
|
|
20
|
+
* Conservative variant fixes both regressions with three guards:
|
|
21
|
+
*
|
|
22
|
+
* 1. Confident-intent gating. `classifyFileKindIntent` now returns
|
|
23
|
+
* `'unknown'` for queries with no implementation-seeking signal. Only
|
|
24
|
+
* explicit `'implementation'` intent triggers demotion. `'unknown'`,
|
|
25
|
+
* `'docs'`, `'tests'`, `'types'` are no-ops.
|
|
26
|
+
*
|
|
27
|
+
* 2. Structural skip. The rule looks at the top-N candidates (default 30).
|
|
28
|
+
* If the window has zero docs/tests/types files (single-source corpus
|
|
29
|
+
* like GCSN) or zero implementation files (nothing to promote), the
|
|
30
|
+
* input is returned untouched. No re-sort, no new objects.
|
|
31
|
+
*
|
|
32
|
+
* 3. Window-bounded re-sort. When the rule does fire, only the top-N
|
|
33
|
+
* window is re-ranked. The tail — where the rerank/non-rerank score-
|
|
34
|
+
* scale boundary usually lives — is concatenated unchanged. This
|
|
35
|
+
* keeps mixed-scale damage contained.
|
|
36
|
+
*
|
|
37
|
+
* Disable at runtime with `SWEET_SEARCH_FILE_KIND_RANKING=0`. Tune the soft
|
|
38
|
+
* factor with `SWEET_SEARCH_FILE_KIND_FACTOR` (default 0.85; range (0, 1]).
|
|
39
|
+
* Tune the window with `SWEET_SEARCH_FILE_KIND_WINDOW` (default 30).
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
// Path classifiers. Directory rules are anchored on `/`; extension rules on
// the end of the path. All are case-insensitive and stateless (no `g` flag),
// so they are safe to share across calls.
const DOCS_RE = /\.md$|\.mdx$|\.rst$|(?:^|\/)docs?\//i;
const TESTS_RE = /(?:^|\/)tests?\/|(?:^|\/)spec\/|\.test\.[a-z0-9]+$|_test\.[a-z0-9]+$|\.spec\.[a-z0-9]+$|_spec\.[a-z0-9]+$/i;
const TYPES_RE = /\.d\.ts$|(?:^|\/)types\//i;

// Strong implementation-seeking signals. A query that fires one of these is
// confidently asking for source code; anything else is treated as `'unknown'`.
// Curated to cover the validated guard-set queries plus common phrasings,
// without matching pure descriptive corpus prose like "Convert XML to URL List".
// Each entry is a regex fragment; entries are unique (a duplicated term adds
// nothing to an alternation and only hides the real size of the list).
const IMPL_INTENT_TERMS = [
  // English wh-questions about location/behaviour
  'where', 'how does', 'how do',
  // Definition / implementation phrasing
  'implements?', 'implementation', 'defines?', 'definition', 'declared?',
  // Code-structure nouns
  'function', 'functions', 'method', 'methods', 'class', 'classes',
  'constructor', 'module', 'library', 'crate', 'package',
  // Verbs that strongly signal a code unit
  'dispatch(?:es|er)?', 'handles?', 'handler', 'handlers',
  'parses?', 'parser', 'parsers',
  'router?', 'routes?', 'routing',
  'register(?:s|ed|ing)?',
  'builds?', 'builder', 'builders',
  'generat(?:es?|or|ors|ed|ing)',
  'creat(?:es?|or|ed|ion|ing)',
  'loads?', 'loader',
  'writes?', 'writer',
  'reads?', 'reader',
  'sends?', 'receives?',
  'computes?', 'computed',
  'encodes?', 'encoder', 'decodes?', 'decoder',
  'transforms?', 'transformer',
  'invokes?', 'calls?', 'returns?',
  'valid(?:ate|ates|ator|ation)',
  'serial(?:ize|izes|izer)', 'deserial(?:ize|izes|izer)',
  'wrap(?:s|per|ped|ping)?',
  'matchers?', 'matches?',
  'printers?', 'prints?',
  'searchers?', 'searches?',
  // Specific terms common in real-repo guard queries
  'callback', 'callbacks',
  'factory', 'factories',
  'controller', 'controllers',
  'middleware',
  'fallback', 'fallbacks',
  'entrypoint', 'entry-point', 'main',
  'init', 'initialise', 'initialize', 'initialiser', 'initializer',
  'kernel', 'engine',
  'wrapper', 'wrappers',
  'singleton',
  'decorator', 'decorators',
  'closure', 'closures',
];

// Whole-word match on any of the terms above.
const IMPL_INTENT_RE = new RegExp(
  `\\b(${IMPL_INTENT_TERMS.join('|')})\\b`,
  'i',
);

// Query-side signals for the three non-implementation intents.
const DOCS_INTENT_RE = /\b(doc|docs|documentation|readme|guide|tutorial|reference|example)\b/i;
const TESTS_INTENT_RE = /\b(test|tests|spec|specs|fixture|fixtures|mock|mocks)\b/i;
const TYPES_INTENT_RE = /\b(type|types|interface|declaration|signature|typings|typedef)\b/i;
|
|
102
|
+
|
|
103
|
+
/**
 * Detect the file kind from a result path.
 *
 * Rules are tried in order — docs, then tests, then types — and the first
 * match wins; anything else (including a missing or non-string path) is
 * reported as `'implementation'`.
 *
 * Backslash separators are normalized to `/` before matching, because the
 * directory rules are anchored on `/` and would otherwise never fire for
 * Windows-style paths such as `docs\guide.txt`.
 *
 * @param {string} filePath - path of the search result.
 * @returns {'docs'|'tests'|'types'|'implementation'}
 */
export function detectFileKind(filePath) {
  if (!filePath || typeof filePath !== 'string') return 'implementation';

  const normalizedPath = filePath.includes('\\')
    ? filePath.replace(/\\/g, '/')
    : filePath;

  if (DOCS_RE.test(normalizedPath)) return 'docs';
  if (TESTS_RE.test(normalizedPath)) return 'tests';
  if (TYPES_RE.test(normalizedPath)) return 'types';
  return 'implementation';
}
|
|
114
|
+
|
|
115
|
+
/**
 * Detect file-kind intent of a query along the docs/tests/types/implementation
 * axis. Conservative: a query with no implementation-seeking signal returns
 * `'unknown'`, and the helper treats `'unknown'` as a no-op (just like the
 * docs/tests/types intents).
 *
 * Precedence when several signals fire: types, then docs, then tests, then
 * implementation. A missing, empty, or non-string query is `'unknown'`
 * rather than an error, so callers can pass user input straight through.
 *
 * @param {string} query - raw search query.
 * @returns {'docs'|'tests'|'types'|'implementation'|'unknown'}
 */
export function classifyFileKindIntent(query) {
  // Only strings can carry an intent signal; anything else is "no signal".
  if (typeof query !== 'string' || query.length === 0) return 'unknown';

  const q = query.toLowerCase();
  // Type-seeking trumps test-seeking when both fire (existing convention).
  if (TYPES_INTENT_RE.test(q)) return 'types';
  if (DOCS_INTENT_RE.test(q)) return 'docs';
  if (TESTS_INTENT_RE.test(q)) return 'tests';
  if (IMPL_INTENT_RE.test(q)) return 'implementation';
  return 'unknown';
}
|
|
133
|
+
|
|
134
|
+
/**
 * Pick the file path off a search result.
 *
 * Looks at `file`, `file_path`, `path` on the result itself, then at the
 * same three keys under `metadata`, and returns the first truthy value.
 * Returns `''` when none is set or the result itself is nullish.
 */
function resolveFilePath(r) {
  const pathKeys = ['file', 'file_path', 'path'];

  for (const key of pathKeys) {
    const direct = r?.[key];
    if (direct) return direct;
  }

  const meta = r?.metadata;
  for (const key of pathKeys) {
    const nested = meta?.[key];
    if (nested) return nested;
  }

  return '';
}
|
|
143
|
+
|
|
144
|
+
/**
 * Report whether file-kind ranking is disabled via
 * `SWEET_SEARCH_FILE_KIND_RANKING`.
 *
 * The value is trimmed and compared case-insensitively, so `0`, `false`,
 * `FALSE`, ` off ` and `no` all disable the feature. An unset, empty, or
 * any other value leaves it enabled.
 *
 * @returns {boolean} true when the feature is switched off.
 */
function envOff() {
  const raw = process.env.SWEET_SEARCH_FILE_KIND_RANKING;
  if (typeof raw !== 'string') return false;
  return ['0', 'false', 'off', 'no'].includes(raw.trim().toLowerCase());
}
|
|
148
|
+
|
|
149
|
+
/**
 * Read a score multiplier from the environment.
 *
 * @param {string} name - environment variable to read.
 * @param {number} fallback - value used when the variable is unset, empty,
 *   unparseable, or outside the half-open range (0, 1].
 * @returns {number} the parsed factor or `fallback`.
 */
function envFactor(name, fallback) {
  const raw = process.env[name];
  if (!raw) return fallback;

  const parsed = Number.parseFloat(raw);
  const usable = Number.isFinite(parsed) && parsed > 0 && parsed <= 1;
  if (!usable) return fallback;
  return parsed;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Read a positive integer window size from the environment.
 *
 * @param {string} name - environment variable to read.
 * @param {number} fallback - value used when the variable is unset, empty,
 *   unparseable as a base-10 integer, or not greater than zero.
 * @returns {number} the parsed window or `fallback`.
 */
function envWindow(name, fallback) {
  const raw = process.env[name];
  const parsed = raw ? Number.parseInt(raw, 10) : Number.NaN;
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}
|
|
162
|
+
|
|
163
|
+
// Score multiplier applied to docs/tests/types results when neither a
// per-kind option nor a valid SWEET_SEARCH_FILE_KIND_FACTOR is supplied.
const DEFAULT_FACTOR = 0.85;
// Number of leading results inspected and re-sorted when neither
// `opts.window` nor a valid SWEET_SEARCH_FILE_KIND_WINDOW is supplied.
const DEFAULT_WINDOW = 30;
|
|
165
|
+
|
|
166
|
+
/**
 * Apply intent-aware file-kind score multipliers, then re-sort the top-N
 * window descending. The original array is not mutated.
 *
 * Demotion fires only when:
 *   - intent === 'implementation' (confident, NOT 'unknown'), AND
 *   - the top-N window contains at least one docs/tests/types candidate, AND
 *   - the top-N window contains at least one implementation candidate.
 *
 * In every other case the original `results` array is returned unchanged
 * (same reference, no copy, no re-sort) — this is critical so the helper is
 * a structural no-op on single-source corpora (GCSN) and on cascades whose
 * top-N has no demotable competition.
 *
 * Window handling: `opts.window` is coerced to a number, floored, and
 * clamped to [0, results.length]. A value that is not a number falls back
 * to the env/default window; zero or a negative value means "empty window"
 * and is a no-op. This keeps a bad option from turning into an
 * invalid-array-length RangeError.
 *
 * @param {Array} results - search results carrying .score and a file-path
 *                          field (.file / .file_path / .path / .metadata.*).
 * @param {Object} [opts]
 * @param {string} [opts.query]           - raw query (used to infer intent
 *                                          if opts.intent isn't supplied)
 * @param {'docs'|'tests'|'types'|'implementation'|'unknown'} [opts.intent]
 *                                        - explicit intent override
 * @param {number} [opts.docFactor]       - default from env / 0.85
 * @param {number} [opts.testFactor]      - default from env / 0.85
 * @param {number} [opts.typeFactor]      - default from env / 0.85
 * @param {number} [opts.window]          - top-N window for analysis +
 *                                          bounded re-sort (default 30)
 * @returns {Array} either the original `results` (no-op) or a new array
 *                  whose head is sorted by adjusted score and whose tail is
 *                  the unchanged input tail. Stable on ties. Head entries
 *                  are copies carrying `_fileKindOrigScore`,
 *                  `_fileKindMult` and `_fileKindKind`.
 */
export function applyFileKindRanking(results, opts = {}) {
  if (envOff()) return results;
  if (!Array.isArray(results) || results.length === 0) return results;

  const intent = opts.intent ?? classifyFileKindIntent(opts.query || '');

  // Conservative gate: only confident 'implementation' intent fires.
  if (intent !== 'implementation') return results;

  // Resolve the window defensively: NaN, fractions and negatives must not
  // reach `new Array(n)`.
  let requestedWindow = opts.window != null ? Number(opts.window) : Number.NaN;
  if (Number.isNaN(requestedWindow)) {
    requestedWindow = envWindow('SWEET_SEARCH_FILE_KIND_WINDOW', DEFAULT_WINDOW);
  }
  const windowSize = Math.max(
    0,
    Math.min(Math.floor(requestedWindow), results.length),
  );
  if (windowSize === 0) return results;

  // Walk the window once: classify kinds and check for competition.
  const kinds = new Array(windowSize);
  let demotableCount = 0;
  let implCount = 0;
  for (let i = 0; i < windowSize; i++) {
    const kind = detectFileKind(resolveFilePath(results[i]));
    kinds[i] = kind;
    if (kind === 'docs' || kind === 'tests' || kind === 'types') demotableCount++;
    else if (kind === 'implementation') implCount++;
  }

  // Structural skip: nothing to demote, or nothing to promote.
  if (demotableCount === 0 || implCount === 0) return results;

  const factor = envFactor('SWEET_SEARCH_FILE_KIND_FACTOR', DEFAULT_FACTOR);
  const multipliers = {
    docs: opts.docFactor ?? factor,
    tests: opts.testFactor ?? factor,
    types: opts.typeFactor ?? factor,
    implementation: 1,
  };

  // Copy each windowed result with its adjusted score; inputs stay untouched.
  const adjusted = kinds.map((kind, i) => {
    const original = results[i];
    const baseScore = typeof original.score === 'number' ? original.score : 0;
    const mult = multipliers[kind];
    return {
      ...original,
      _fileKindOrigScore: baseScore,
      _fileKindMult: mult,
      _fileKindKind: kind,
      score: baseScore * mult,
    };
  });

  // Stable sort: descending score, tie-break on original index. Sorting
  // indices avoids stamping a temporary field on the output objects.
  const order = adjusted.map((_, i) => i);
  order.sort((a, b) => {
    const diff = (adjusted[b].score || 0) - (adjusted[a].score || 0);
    return diff !== 0 ? diff : a - b;
  });
  const head = order.map((i) => adjusted[i]);

  // Concatenate unchanged tail. The cascade's CE/MaxSim score-scale
  // boundary typically lives near rank `ceTopK`, so leaving rank
  // `windowSize`+ untouched contains the damage from any cross-scale
  // re-sort that might happen inside the window.
  if (windowSize === results.length) return head;
  return [...head, ...results.slice(windowSize)];
}
|
|
@@ -1414,10 +1414,16 @@ export class LateInteractionIndex {
|
|
|
1414
1414
|
// don't support importance weighting, so we must use the JS-tier weighted path.
|
|
1415
1415
|
const nativeScored = new Set();
|
|
1416
1416
|
|
|
1417
|
+
// Resolve a doc-lookup ID for each candidate. Graph-expanded candidates
|
|
1418
|
+
// carry `_liChunkId` (a chunk id pointing into the LI index) while their
|
|
1419
|
+
// public `id` is the entity id from the code graph. Honouring _liChunkId
|
|
1420
|
+
// lets expanded candidates participate in MaxSim rerank.
|
|
1421
|
+
const docIdOf = (c) => c._liChunkId || c.id;
|
|
1422
|
+
|
|
1417
1423
|
if (useFlatPath && !this.useTokenWeights) {
|
|
1418
1424
|
const groups = { bit4: [], perToken: [], perDoc: [] };
|
|
1419
1425
|
for (const candidate of toScore) {
|
|
1420
|
-
const doc = this.documents.get(candidate
|
|
1426
|
+
const doc = this.documents.get(docIdOf(candidate));
|
|
1421
1427
|
if (!doc) continue;
|
|
1422
1428
|
if (doc.quantBits === 4 && doc.minArray && doc.tokenNorms) {
|
|
1423
1429
|
groups.bit4.push({ candidate, doc });
|
|
@@ -1453,7 +1459,7 @@ export class LateInteractionIndex {
|
|
|
1453
1459
|
// Try WASM fused kernels first (avoids JS-side dequant), fall back to JS dequant + wasmMaxSimF32.
|
|
1454
1460
|
for (const candidate of toScore) {
|
|
1455
1461
|
if (nativeScored.has(candidate.id)) continue;
|
|
1456
|
-
const doc = this.documents.get(candidate
|
|
1462
|
+
const doc = this.documents.get(docIdOf(candidate));
|
|
1457
1463
|
if (!doc) { pushFallback(candidate); continue; }
|
|
1458
1464
|
|
|
1459
1465
|
if (useFlatPath) {
|
|
@@ -1488,7 +1494,7 @@ export class LateInteractionIndex {
|
|
|
1488
1494
|
}
|
|
1489
1495
|
|
|
1490
1496
|
// JS dequant → WASM f32 or JS fallback
|
|
1491
|
-
const flatData = this.getTokensFlat(candidate
|
|
1497
|
+
const flatData = this.getTokensFlat(docIdOf(candidate));
|
|
1492
1498
|
if (flatData) {
|
|
1493
1499
|
pushScored(candidate, this.maxSimScoreFlat(
|
|
1494
1500
|
effectiveQueryTokens, flatData.flat, flatData.numTokens, flatData.dim,
|
|
@@ -1498,7 +1504,7 @@ export class LateInteractionIndex {
|
|
|
1498
1504
|
pushFallback(candidate);
|
|
1499
1505
|
}
|
|
1500
1506
|
} else {
|
|
1501
|
-
const docTokens = this.getTokens(candidate
|
|
1507
|
+
const docTokens = this.getTokens(docIdOf(candidate));
|
|
1502
1508
|
if (docTokens) {
|
|
1503
1509
|
pushScored(candidate, this.maxSimScore(effectiveQueryTokens, docTokens, pruneOpts));
|
|
1504
1510
|
} else {
|