@renseiai/agentfactory-code-intelligence 0.8.8 → 0.8.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/embedding/__tests__/embedding.test.d.ts +2 -0
- package/dist/src/embedding/__tests__/embedding.test.d.ts.map +1 -0
- package/dist/src/embedding/__tests__/embedding.test.js +339 -0
- package/dist/src/embedding/chunker.d.ts +40 -0
- package/dist/src/embedding/chunker.d.ts.map +1 -0
- package/dist/src/embedding/chunker.js +135 -0
- package/dist/src/embedding/embedding-provider.d.ts +15 -0
- package/dist/src/embedding/embedding-provider.d.ts.map +1 -0
- package/dist/src/embedding/embedding-provider.js +1 -0
- package/dist/src/embedding/voyage-provider.d.ts +39 -0
- package/dist/src/embedding/voyage-provider.d.ts.map +1 -0
- package/dist/src/embedding/voyage-provider.js +146 -0
- package/dist/src/index.d.ts +14 -2
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +10 -1
- package/dist/src/indexing/__tests__/vector-indexing.test.d.ts +2 -0
- package/dist/src/indexing/__tests__/vector-indexing.test.d.ts.map +1 -0
- package/dist/src/indexing/__tests__/vector-indexing.test.js +291 -0
- package/dist/src/indexing/incremental-indexer.d.ts +4 -0
- package/dist/src/indexing/incremental-indexer.d.ts.map +1 -1
- package/dist/src/indexing/incremental-indexer.js +45 -0
- package/dist/src/indexing/vector-indexer.d.ts +63 -0
- package/dist/src/indexing/vector-indexer.d.ts.map +1 -0
- package/dist/src/indexing/vector-indexer.js +197 -0
- package/dist/src/plugin/code-intelligence-plugin.d.ts.map +1 -1
- package/dist/src/plugin/code-intelligence-plugin.js +4 -2
- package/dist/src/reranking/__tests__/reranker.test.d.ts +2 -0
- package/dist/src/reranking/__tests__/reranker.test.d.ts.map +1 -0
- package/dist/src/reranking/__tests__/reranker.test.js +503 -0
- package/dist/src/reranking/cohere-reranker.d.ts +26 -0
- package/dist/src/reranking/cohere-reranker.d.ts.map +1 -0
- package/dist/src/reranking/cohere-reranker.js +110 -0
- package/dist/src/reranking/reranker-provider.d.ts +40 -0
- package/dist/src/reranking/reranker-provider.d.ts.map +1 -0
- package/dist/src/reranking/reranker-provider.js +6 -0
- package/dist/src/reranking/voyage-reranker.d.ts +27 -0
- package/dist/src/reranking/voyage-reranker.d.ts.map +1 -0
- package/dist/src/reranking/voyage-reranker.js +111 -0
- package/dist/src/search/__tests__/hybrid-search.test.d.ts +2 -0
- package/dist/src/search/__tests__/hybrid-search.test.d.ts.map +1 -0
- package/dist/src/search/__tests__/hybrid-search.test.js +437 -0
- package/dist/src/search/__tests__/query-classifier.test.d.ts +2 -0
- package/dist/src/search/__tests__/query-classifier.test.d.ts.map +1 -0
- package/dist/src/search/__tests__/query-classifier.test.js +136 -0
- package/dist/src/search/hybrid-search.d.ts +56 -0
- package/dist/src/search/hybrid-search.d.ts.map +1 -0
- package/dist/src/search/hybrid-search.js +299 -0
- package/dist/src/search/query-classifier.d.ts +20 -0
- package/dist/src/search/query-classifier.d.ts.map +1 -0
- package/dist/src/search/query-classifier.js +58 -0
- package/dist/src/search/score-normalizer.d.ts +16 -0
- package/dist/src/search/score-normalizer.d.ts.map +1 -0
- package/dist/src/search/score-normalizer.js +26 -0
- package/dist/src/types.d.ts +83 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/types.js +36 -2
- package/dist/src/vector/__tests__/vector-store.test.d.ts +2 -0
- package/dist/src/vector/__tests__/vector-store.test.d.ts.map +1 -0
- package/dist/src/vector/__tests__/vector-store.test.js +278 -0
- package/dist/src/vector/hnsw-store.d.ts +48 -0
- package/dist/src/vector/hnsw-store.d.ts.map +1 -0
- package/dist/src/vector/hnsw-store.js +437 -0
- package/dist/src/vector/vector-store.d.ts +15 -0
- package/dist/src/vector/vector-store.d.ts.map +1 -0
- package/dist/src/vector/vector-store.js +1 -0
- package/package.json +1 -1
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { classifyQuery } from '../query-classifier.js';
|
|
3
|
+
/**
 * Unit tests for classifyQuery.
 *
 * Every case checks both the returned `type` and its paired `alpha`
 * (identifier → 0.25, natural → 0.75, mixed → 0.55).
 */
describe('classifyQuery', () => {
    /** Assert that `query` classifies as `type` with the given `alpha`. */
    const expectClassification = (query, type, alpha) => {
        const result = classifyQuery(query);
        expect(result.type).toBe(type);
        expect(result.alpha).toBe(alpha);
    };
    const expectIdentifier = (query) => expectClassification(query, 'identifier', 0.25);
    const expectNatural = (query) => expectClassification(query, 'natural', 0.75);
    const expectMixed = (query) => expectClassification(query, 'mixed', 0.55);

    // ── Identifier detection ──────────────────────────────────────────
    describe('camelCase detection', () => {
        it('classifies camelCase as identifier', () => {
            expectIdentifier('handleRequest');
        });
        it('classifies multi-word camelCase as identifier', () => {
            expectIdentifier('getUserById');
        });
    });
    describe('snake_case detection', () => {
        it('classifies snake_case as identifier', () => {
            expectIdentifier('get_user_by_id');
        });
        it('classifies double snake_case as identifier', () => {
            expectIdentifier('handle_http_request');
        });
    });
    describe('PascalCase detection', () => {
        it('classifies PascalCase as identifier', () => {
            expectIdentifier('UserService');
        });
        it('classifies multi-word PascalCase as identifier', () => {
            expectIdentifier('HttpRequestHandler');
        });
    });
    describe('CONSTANT_CASE detection', () => {
        it('classifies CONSTANT_CASE as identifier', () => {
            expectIdentifier('MAX_RETRIES');
        });
        // Previously titled "single CONSTANT" although the input has two words.
        it('classifies acronym-prefixed CONSTANT_CASE as identifier', () => {
            expectIdentifier('HTTP_TIMEOUT');
        });
        it('classifies single-word CONSTANT as identifier', () => {
            expectIdentifier('TIMEOUT');
        });
    });
    describe('dot.notation detection', () => {
        it('classifies dot notation as identifier', () => {
            expectIdentifier('req.body');
        });
        it('classifies chained dot notation as identifier', () => {
            expectIdentifier('this.service.getUser');
        });
    });
    describe('operator tokens', () => {
        it('classifies :: operator token as identifier', () => {
            expectIdentifier('std::vector');
        });
        it('classifies -> operator token as identifier', () => {
            expectIdentifier('node->next');
        });
    });

    // ── Natural language detection ────────────────────────────────────
    describe('natural language queries', () => {
        it('classifies plain English as natural', () => {
            expectNatural('how to handle errors');
        });
        // Previously titled "question-style" although the input is a keyword phrase.
        it('classifies keyword-style query as natural', () => {
            expectNatural('authentication middleware for express');
        });
        it('classifies question-style query as natural', () => {
            expectNatural('where are errors logged?');
        });
        it('classifies descriptive query as natural', () => {
            expectNatural('database connection pooling strategy');
        });
        it('classifies short natural query as natural', () => {
            expectNatural('error handling');
        });
    });

    // ── Mixed queries ─────────────────────────────────────────────────
    describe('mixed queries', () => {
        it('classifies query with one identifier in natural context as mixed', () => {
            expectMixed('fix handleRequest error');
        });
        it('classifies query with identifier and natural words as mixed', () => {
            expectMixed('where is UserService defined');
        });
        it('classifies query mixing snake_case with natural as mixed', () => {
            expectMixed('update get_user_by_id function');
        });
        // Boundary: 1 of 2 tokens is an identifier (ratio 0.5, not > 0.5).
        it('classifies an identifier ratio of exactly 0.5 as mixed', () => {
            expectMixed('handleRequest error');
        });
        // Boundary: 1 of 5 tokens is an identifier (ratio 0.2, not < 0.2).
        it('classifies an identifier ratio of exactly 0.2 as mixed', () => {
            expectMixed('fix CORS error in Express');
        });
    });

    // ── Edge cases ────────────────────────────────────────────────────
    describe('edge cases', () => {
        it('handles empty query', () => {
            expectNatural('');
        });
        it('handles whitespace-only query', () => {
            expectNatural('   ');
        });
        it('handles single word that is not an identifier', () => {
            expectNatural('search');
        });
        it('handles multiple identifiers', () => {
            expectIdentifier('handleRequest processData getUserById');
        });
    });
});
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hybrid search engine combining BM25 lexical search with dense vector semantic search.
|
|
3
|
+
* Uses Convex Combination Score (CCS) fusion with query-adaptive alpha weighting
|
|
4
|
+
* by default, or Reciprocal Rank Fusion (RRF) when selected via `fusionMethod`.
|
|
5
|
+
*/
|
|
6
|
+
import type { SearchQuery, SearchResult } from '../types.js';
|
|
7
|
+
import type { VectorStore } from '../vector/vector-store.js';
|
|
8
|
+
import type { EmbeddingProvider } from '../embedding/embedding-provider.js';
|
|
9
|
+
import type { SearchEngine } from './search-engine.js';
|
|
10
|
+
import type { RerankerConfig } from '../reranking/reranker-provider.js';
|
|
11
|
+
/**
 * Tuning options for HybridSearchEngine.
 * The constructor accepts a partial object; omitted fields take the documented defaults.
 */
export interface HybridSearchConfig {
    /**
     * Weight for vector scores in CCS (0 = BM25-only, 1 = vector-only). Default: 0.45
     * Only consulted when `adaptiveAlpha` is false; not used by RRF fusion.
     */
    alpha: number;
    /**
     * Enable query-adaptive alpha selection. Default: true
     * When true, the alpha recommended by the query classifier replaces `alpha`.
     */
    adaptiveAlpha: boolean;
    /** Number of BM25 candidates to retrieve. Default: 100 */
    bm25TopK: number;
    /** Number of vector candidates to retrieve. Default: 100 */
    vectorTopK: number;
    /** Fusion method. Default: 'ccs' */
    fusionMethod: 'ccs' | 'rrf';
    /** RRF constant k. Default: 60. Only used when `fusionMethod` is 'rrf'. */
    rrfK: number;
}
|
|
25
|
+
/**
 * Search engine that fuses BM25 results with vector-store results and
 * optionally reranks the fused list.
 *
 * `vectorStore` and `embeddingProvider` may be null, in which case only the
 * BM25 engine is queried. `rerankerConfig` may be omitted or null to skip reranking.
 */
export declare class HybridSearchEngine {
    private bm25Engine;
    private vectorStore;
    private embeddingProvider;
    private config;
    private rerankerConfig;
    constructor(bm25Engine: SearchEngine, vectorStore: VectorStore | null, embeddingProvider: EmbeddingProvider | null, config?: Partial<HybridSearchConfig>, rerankerConfig?: RerankerConfig | null);
    /** Run hybrid search combining BM25 and vector retrieval. */
    search(query: SearchQuery): Promise<SearchResult[]>;
    /** Embed the query, merge BM25 and vector candidates, fuse, filter, sort and truncate. */
    private hybridFusion;
    /**
     * Convex Combination Score fusion.
     * score(d) = alpha * normalized_vector(d) + (1 - alpha) * normalized_bm25(d)
     */
    private ccsFusion;
    /**
     * Reciprocal Rank Fusion.
     * rrf_score(d) = sum(1 / (k + rank_i(d))) for each ranking
     */
    private rrfFusion;
    /**
     * Apply cross-encoder reranking to search results.
     * Returns results unchanged if reranker is not configured, disabled, or errors.
     */
    private applyReranking;
    /** Build text for reranking from a search result's symbol metadata. */
    private buildRerankText;
    /** Create a document key for matching BM25 results to vector results. */
    private makeDocKey;
    /** Test a file path against the query's file pattern. */
    private matchPattern;
}
|
|
56
|
+
//# sourceMappingURL=hybrid-search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hybrid-search.d.ts","sourceRoot":"","sources":["../../../src/search/hybrid-search.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,aAAa,CAAA;AAC5D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAA;AAC5D,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,oCAAoC,CAAA;AAC3E,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AACtD,OAAO,KAAK,EAAE,cAAc,EAAkB,MAAM,mCAAmC,CAAA;AAMvF,MAAM,WAAW,kBAAkB;IACjC,sFAAsF;IACtF,KAAK,EAAE,MAAM,CAAA;IACb,2DAA2D;IAC3D,aAAa,EAAE,OAAO,CAAA;IACtB,0DAA0D;IAC1D,QAAQ,EAAE,MAAM,CAAA;IAChB,4DAA4D;IAC5D,UAAU,EAAE,MAAM,CAAA;IAClB,oCAAoC;IACpC,YAAY,EAAE,KAAK,GAAG,KAAK,CAAA;IAC3B,kCAAkC;IAClC,IAAI,EAAE,MAAM,CAAA;CACb;AA0BD,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,UAAU,CAAc;IAChC,OAAO,CAAC,WAAW,CAAoB;IACvC,OAAO,CAAC,iBAAiB,CAA0B;IACnD,OAAO,CAAC,MAAM,CAAoB;IAClC,OAAO,CAAC,cAAc,CAAuB;gBAG3C,UAAU,EAAE,YAAY,EACxB,WAAW,EAAE,WAAW,GAAG,IAAI,EAC/B,iBAAiB,EAAE,iBAAiB,GAAG,IAAI,EAC3C,MAAM,CAAC,EAAE,OAAO,CAAC,kBAAkB,CAAC,EACpC,cAAc,CAAC,EAAE,cAAc,GAAG,IAAI;IASxC,6DAA6D;IACvD,MAAM,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;YA2B3C,YAAY;IA6F1B;;;OAGG;IACH,OAAO,CAAC,SAAS;IAuEjB;;;OAGG;IACH,OAAO,CAAC,SAAS;IA2BjB;;;OAGG;YACW,cAAc;IAkD5B,uEAAuE;IACvE,OAAO,CAAC,eAAe;IAmBvB,yEAAyE;IACzE,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,YAAY;CAarB"}
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hybrid search engine combining BM25 lexical search with dense vector semantic search.
|
|
3
|
+
* Uses Convex Combination Score (CCS) fusion with query-adaptive alpha weighting
|
|
4
|
+
* by default, or Reciprocal Rank Fusion (RRF) when selected via `fusionMethod`.
|
|
5
|
+
*/
|
|
6
|
+
import { minMaxNormalize } from './score-normalizer.js';
|
|
7
|
+
import { classifyQuery } from './query-classifier.js';
|
|
8
|
+
const DEFAULT_CONFIG = {
|
|
9
|
+
alpha: 0.45,
|
|
10
|
+
adaptiveAlpha: true,
|
|
11
|
+
bm25TopK: 100,
|
|
12
|
+
vectorTopK: 100,
|
|
13
|
+
fusionMethod: 'ccs',
|
|
14
|
+
rrfK: 60,
|
|
15
|
+
};
|
|
16
|
+
// ── Hybrid Search Engine ─────────────────────────────────────────────
|
|
17
|
+
export class HybridSearchEngine {
|
|
18
|
+
bm25Engine;
|
|
19
|
+
vectorStore;
|
|
20
|
+
embeddingProvider;
|
|
21
|
+
config;
|
|
22
|
+
rerankerConfig;
|
|
23
|
+
constructor(bm25Engine, vectorStore, embeddingProvider, config, rerankerConfig) {
|
|
24
|
+
this.bm25Engine = bm25Engine;
|
|
25
|
+
this.vectorStore = vectorStore;
|
|
26
|
+
this.embeddingProvider = embeddingProvider;
|
|
27
|
+
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
28
|
+
this.rerankerConfig = rerankerConfig ?? null;
|
|
29
|
+
}
|
|
30
|
+
/** Run hybrid search combining BM25 and vector retrieval. */
|
|
31
|
+
async search(query) {
|
|
32
|
+
// Step 1: Get BM25 results (always available)
|
|
33
|
+
const bm25Query = {
|
|
34
|
+
...query,
|
|
35
|
+
maxResults: this.config.bm25TopK,
|
|
36
|
+
};
|
|
37
|
+
const bm25Results = this.bm25Engine.search(bm25Query);
|
|
38
|
+
let results;
|
|
39
|
+
// Step 2: If vector store and embedding provider are available, do hybrid fusion
|
|
40
|
+
if (this.vectorStore && this.embeddingProvider && this.vectorStore.size() > 0) {
|
|
41
|
+
results = await this.hybridFusion(query, bm25Results);
|
|
42
|
+
}
|
|
43
|
+
else {
|
|
44
|
+
// Step 3: Fallback to BM25-only
|
|
45
|
+
results = bm25Results;
|
|
46
|
+
if (query.maxResults) {
|
|
47
|
+
results = results.slice(0, query.maxResults);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
// Step 4: Apply reranking if configured and enabled
|
|
51
|
+
results = await this.applyReranking(query.query, results);
|
|
52
|
+
return results;
|
|
53
|
+
}
|
|
54
|
+
async hybridFusion(query, bm25Results) {
|
|
55
|
+
// Embed the query
|
|
56
|
+
const queryVector = await this.embeddingProvider.embedQuery(query.query);
|
|
57
|
+
// Search vector store
|
|
58
|
+
const vectorResults = await this.vectorStore.search(queryVector, this.config.vectorTopK);
|
|
59
|
+
// Build candidate map keyed by docKey
|
|
60
|
+
const candidates = new Map();
|
|
61
|
+
// Add BM25 results
|
|
62
|
+
for (let i = 0; i < bm25Results.length; i++) {
|
|
63
|
+
const r = bm25Results[i];
|
|
64
|
+
const key = this.makeDocKey(r.symbol.filePath, r.symbol.name, r.symbol.line);
|
|
65
|
+
candidates.set(key, {
|
|
66
|
+
bm25Score: r.score,
|
|
67
|
+
vectorScore: undefined,
|
|
68
|
+
bm25Rank: i + 1,
|
|
69
|
+
vectorRank: undefined,
|
|
70
|
+
result: r,
|
|
71
|
+
});
|
|
72
|
+
}
|
|
73
|
+
// Add/merge vector results
|
|
74
|
+
for (let i = 0; i < vectorResults.length; i++) {
|
|
75
|
+
const vr = vectorResults[i];
|
|
76
|
+
const meta = vr.chunk.metadata;
|
|
77
|
+
const key = this.makeDocKey(meta.filePath, meta.symbolName ?? '', meta.startLine);
|
|
78
|
+
const existing = candidates.get(key);
|
|
79
|
+
if (existing) {
|
|
80
|
+
existing.vectorScore = vr.score;
|
|
81
|
+
existing.vectorRank = i + 1;
|
|
82
|
+
}
|
|
83
|
+
else {
|
|
84
|
+
// Vector-only result — create a SearchResult from the chunk metadata
|
|
85
|
+
candidates.set(key, {
|
|
86
|
+
bm25Score: undefined,
|
|
87
|
+
vectorScore: vr.score,
|
|
88
|
+
bm25Rank: undefined,
|
|
89
|
+
vectorRank: i + 1,
|
|
90
|
+
result: {
|
|
91
|
+
symbol: {
|
|
92
|
+
name: meta.symbolName ?? '',
|
|
93
|
+
kind: meta.symbolKind ?? 'function',
|
|
94
|
+
filePath: meta.filePath,
|
|
95
|
+
line: meta.startLine,
|
|
96
|
+
endLine: meta.endLine,
|
|
97
|
+
language: meta.language,
|
|
98
|
+
exported: false,
|
|
99
|
+
},
|
|
100
|
+
score: 0, // Will be set by fusion
|
|
101
|
+
matchType: 'semantic',
|
|
102
|
+
},
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
// Determine alpha
|
|
107
|
+
const alpha = this.config.adaptiveAlpha
|
|
108
|
+
? classifyQuery(query.query).alpha
|
|
109
|
+
: this.config.alpha;
|
|
110
|
+
// Fuse scores
|
|
111
|
+
const fused = this.config.fusionMethod === 'ccs'
|
|
112
|
+
? this.ccsFusion(candidates, alpha)
|
|
113
|
+
: this.rrfFusion(candidates);
|
|
114
|
+
// Apply filters
|
|
115
|
+
let results = fused.filter(r => {
|
|
116
|
+
if (query.symbolKinds && !query.symbolKinds.includes(r.symbol.kind))
|
|
117
|
+
return false;
|
|
118
|
+
if (query.language && r.symbol.language !== query.language)
|
|
119
|
+
return false;
|
|
120
|
+
if (query.filePattern && !this.matchPattern(r.symbol.filePath, query.filePattern))
|
|
121
|
+
return false;
|
|
122
|
+
return true;
|
|
123
|
+
});
|
|
124
|
+
// Sort by fused score descending
|
|
125
|
+
results.sort((a, b) => b.score - a.score);
|
|
126
|
+
// Limit results
|
|
127
|
+
if (query.maxResults) {
|
|
128
|
+
results = results.slice(0, query.maxResults);
|
|
129
|
+
}
|
|
130
|
+
return results;
|
|
131
|
+
}
|
|
132
|
+
/**
|
|
133
|
+
* Convex Combination Score fusion.
|
|
134
|
+
* score(d) = alpha * normalized_vector(d) + (1 - alpha) * normalized_bm25(d)
|
|
135
|
+
*/
|
|
136
|
+
ccsFusion(candidates, alpha) {
|
|
137
|
+
// Collect raw scores for normalization
|
|
138
|
+
const bm25Scores = [];
|
|
139
|
+
const vectorScores = [];
|
|
140
|
+
for (const c of candidates.values()) {
|
|
141
|
+
if (c.bm25Score !== undefined)
|
|
142
|
+
bm25Scores.push(c.bm25Score);
|
|
143
|
+
if (c.vectorScore !== undefined)
|
|
144
|
+
vectorScores.push(c.vectorScore);
|
|
145
|
+
}
|
|
146
|
+
// Normalize
|
|
147
|
+
const bm25Normalized = minMaxNormalize(bm25Scores);
|
|
148
|
+
const vectorNormalized = minMaxNormalize(vectorScores);
|
|
149
|
+
// Build a map from raw score to normalized score
|
|
150
|
+
let bm25Idx = 0;
|
|
151
|
+
let vectorIdx = 0;
|
|
152
|
+
const bm25NormMap = new Map();
|
|
153
|
+
const vectorNormMap = new Map();
|
|
154
|
+
// Since multiple candidates can have the same raw score, use arrays
|
|
155
|
+
for (const c of candidates.values()) {
|
|
156
|
+
if (c.bm25Score !== undefined) {
|
|
157
|
+
if (!bm25NormMap.has(c.bm25Score))
|
|
158
|
+
bm25NormMap.set(c.bm25Score, []);
|
|
159
|
+
bm25NormMap.get(c.bm25Score).push(bm25Normalized[bm25Idx++]);
|
|
160
|
+
}
|
|
161
|
+
if (c.vectorScore !== undefined) {
|
|
162
|
+
if (!vectorNormMap.has(c.vectorScore))
|
|
163
|
+
vectorNormMap.set(c.vectorScore, []);
|
|
164
|
+
vectorNormMap.get(c.vectorScore).push(vectorNormalized[vectorIdx++]);
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
// Reset counters for consumption
|
|
168
|
+
const bm25NormCounters = new Map();
|
|
169
|
+
const vectorNormCounters = new Map();
|
|
170
|
+
const results = [];
|
|
171
|
+
// Re-iterate to assign normalized scores in order
|
|
172
|
+
bm25Idx = 0;
|
|
173
|
+
vectorIdx = 0;
|
|
174
|
+
for (const c of candidates.values()) {
|
|
175
|
+
let normBm25 = 0;
|
|
176
|
+
let normVector = 0;
|
|
177
|
+
if (c.bm25Score !== undefined) {
|
|
178
|
+
normBm25 = bm25Normalized[bm25Idx++];
|
|
179
|
+
}
|
|
180
|
+
if (c.vectorScore !== undefined) {
|
|
181
|
+
normVector = vectorNormalized[vectorIdx++];
|
|
182
|
+
}
|
|
183
|
+
const fusedScore = alpha * normVector + (1 - alpha) * normBm25;
|
|
184
|
+
const hasBoth = c.bm25Score !== undefined && c.vectorScore !== undefined;
|
|
185
|
+
results.push({
|
|
186
|
+
...c.result,
|
|
187
|
+
score: fusedScore,
|
|
188
|
+
matchType: hasBoth ? 'hybrid' : (c.vectorScore !== undefined ? 'semantic' : c.result.matchType),
|
|
189
|
+
bm25Score: c.bm25Score,
|
|
190
|
+
vectorScore: c.vectorScore,
|
|
191
|
+
});
|
|
192
|
+
}
|
|
193
|
+
return results;
|
|
194
|
+
}
|
|
195
|
+
/**
|
|
196
|
+
* Reciprocal Rank Fusion.
|
|
197
|
+
* rrf_score(d) = sum(1 / (k + rank_i(d))) for each ranking
|
|
198
|
+
*/
|
|
199
|
+
rrfFusion(candidates) {
|
|
200
|
+
const k = this.config.rrfK;
|
|
201
|
+
const results = [];
|
|
202
|
+
for (const c of candidates.values()) {
|
|
203
|
+
let rrfScore = 0;
|
|
204
|
+
if (c.bm25Rank !== undefined) {
|
|
205
|
+
rrfScore += 1 / (k + c.bm25Rank);
|
|
206
|
+
}
|
|
207
|
+
if (c.vectorRank !== undefined) {
|
|
208
|
+
rrfScore += 1 / (k + c.vectorRank);
|
|
209
|
+
}
|
|
210
|
+
const hasBoth = c.bm25Rank !== undefined && c.vectorRank !== undefined;
|
|
211
|
+
results.push({
|
|
212
|
+
...c.result,
|
|
213
|
+
score: rrfScore,
|
|
214
|
+
matchType: hasBoth ? 'hybrid' : (c.vectorRank !== undefined ? 'semantic' : c.result.matchType),
|
|
215
|
+
bm25Score: c.bm25Score,
|
|
216
|
+
vectorScore: c.vectorScore,
|
|
217
|
+
});
|
|
218
|
+
}
|
|
219
|
+
return results;
|
|
220
|
+
}
|
|
221
|
+
/**
|
|
222
|
+
* Apply cross-encoder reranking to search results.
|
|
223
|
+
* Returns results unchanged if reranker is not configured, disabled, or errors.
|
|
224
|
+
*/
|
|
225
|
+
async applyReranking(query, results) {
|
|
226
|
+
if (!this.rerankerConfig || !this.rerankerConfig.enabled) {
|
|
227
|
+
return results;
|
|
228
|
+
}
|
|
229
|
+
const { provider, topN = 10, candidatePool = 50 } = this.rerankerConfig;
|
|
230
|
+
// Take the top candidatePool results for reranking
|
|
231
|
+
const candidates = results.slice(0, candidatePool);
|
|
232
|
+
if (candidates.length === 0)
|
|
233
|
+
return results;
|
|
234
|
+
// Build rerank documents from search results
|
|
235
|
+
const documents = candidates.map((r, i) => ({
|
|
236
|
+
id: `${i}`,
|
|
237
|
+
text: this.buildRerankText(r),
|
|
238
|
+
}));
|
|
239
|
+
try {
|
|
240
|
+
const rerankResults = await provider.rerank(query, documents);
|
|
241
|
+
// Build a map from index to rerank score
|
|
242
|
+
const scoreMap = new Map();
|
|
243
|
+
for (const rr of rerankResults) {
|
|
244
|
+
scoreMap.set(rr.index, rr.score);
|
|
245
|
+
}
|
|
246
|
+
// Update candidate results with rerank scores
|
|
247
|
+
const reranked = [];
|
|
248
|
+
for (let i = 0; i < candidates.length; i++) {
|
|
249
|
+
const rerankScore = scoreMap.get(i);
|
|
250
|
+
if (rerankScore !== undefined) {
|
|
251
|
+
reranked.push({
|
|
252
|
+
...candidates[i],
|
|
253
|
+
score: rerankScore,
|
|
254
|
+
rerankScore,
|
|
255
|
+
});
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
// Sort by reranker score descending
|
|
259
|
+
reranked.sort((a, b) => b.score - a.score);
|
|
260
|
+
// Return top N
|
|
261
|
+
return reranked.slice(0, topN);
|
|
262
|
+
}
|
|
263
|
+
catch {
|
|
264
|
+
// Graceful fallback: return original results if reranker errors
|
|
265
|
+
return results;
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
/** Build text for reranking from a search result's symbol metadata. */
|
|
269
|
+
buildRerankText(result) {
|
|
270
|
+
const parts = [];
|
|
271
|
+
const { symbol } = result;
|
|
272
|
+
if (symbol.signature) {
|
|
273
|
+
parts.push(symbol.signature);
|
|
274
|
+
}
|
|
275
|
+
if (symbol.documentation) {
|
|
276
|
+
parts.push(symbol.documentation);
|
|
277
|
+
}
|
|
278
|
+
// Always include name and kind for context
|
|
279
|
+
parts.push(`${symbol.kind} ${symbol.name}`);
|
|
280
|
+
return parts.join('\n');
|
|
281
|
+
}
|
|
282
|
+
/** Create a document key for matching BM25 results to vector results. */
|
|
283
|
+
makeDocKey(filePath, symbolName, startLine) {
|
|
284
|
+
return `${filePath}:${symbolName}:${startLine}`;
|
|
285
|
+
}
|
|
286
|
+
matchPattern(filePath, pattern) {
|
|
287
|
+
if (pattern.startsWith('*')) {
|
|
288
|
+
return filePath.endsWith(pattern.slice(1));
|
|
289
|
+
}
|
|
290
|
+
if (pattern.endsWith('/**')) {
|
|
291
|
+
return filePath.startsWith(pattern.slice(0, -3));
|
|
292
|
+
}
|
|
293
|
+
if (pattern.endsWith('/*')) {
|
|
294
|
+
const dir = pattern.slice(0, -2);
|
|
295
|
+
return filePath.startsWith(dir) && !filePath.slice(dir.length + 1).includes('/');
|
|
296
|
+
}
|
|
297
|
+
return filePath.includes(pattern);
|
|
298
|
+
}
|
|
299
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Query classifier for adaptive alpha weighting in hybrid search.
|
|
3
|
+
* Detects whether a query is identifier-heavy, natural language, or mixed
|
|
4
|
+
* and returns a recommended alpha value.
|
|
5
|
+
*/
|
|
6
|
+
/** Category assigned to a query based on the share of its tokens that look like code identifiers. */
export type QueryType = 'identifier' | 'natural' | 'mixed';
|
|
7
|
+
/** Result of classifying a query. */
export interface QueryClassification {
    /** Detected query category. */
    type: QueryType;
    /** Recommended CCS weight for vector scores: 0.25 (identifier), 0.55 (mixed) or 0.75 (natural). */
    alpha: number;
}
|
|
11
|
+
/**
 * Classify a search query and return the recommended alpha value
 * for CCS fusion.
 *
 * The query is split on whitespace and the fraction of tokens that look like
 * code identifiers decides the category:
 *
 * - identifier-heavy queries (camelCase, snake_case, etc.), ratio > 0.5: alpha = 0.25 (favor BM25)
 * - natural language queries ("authentication middleware"), ratio < 0.2: alpha = 0.75 (favor vectors)
 * - mixed queries ("fix CORS error in Express"), everything in between: alpha = 0.55 (balanced)
 *
 * An empty or whitespace-only query is classified as natural.
 *
 * @param query Raw query text.
 * @returns The detected category and its recommended alpha.
 */
export declare function classifyQuery(query: string): QueryClassification;
|
|
20
|
+
//# sourceMappingURL=query-classifier.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"query-classifier.d.ts","sourceRoot":"","sources":["../../../src/search/query-classifier.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,MAAM,MAAM,SAAS,GAAG,YAAY,GAAG,SAAS,GAAG,OAAO,CAAA;AAE1D,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,SAAS,CAAA;IACf,KAAK,EAAE,MAAM,CAAA;CACd;AAuBD;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,MAAM,GAAG,mBAAmB,CAuBhE"}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Query classifier for adaptive alpha weighting in hybrid search.
|
|
3
|
+
* Detects whether a query is identifier-heavy, natural language, or mixed
|
|
4
|
+
* and returns a recommended alpha value.
|
|
5
|
+
*/
|
|
6
|
+
// Patterns that indicate code identifiers
|
|
7
|
+
const CAMEL_CASE = /[a-z][a-zA-Z]*[A-Z]/;
|
|
8
|
+
const PASCAL_CASE = /^[A-Z][a-zA-Z]+[A-Z]/;
|
|
9
|
+
const SNAKE_CASE = /\w+_\w+/;
|
|
10
|
+
const CONSTANT_CASE = /^[A-Z][A-Z0-9_]+$/;
|
|
11
|
+
const DOT_NOTATION = /\w+\.\w+/;
|
|
12
|
+
const OPERATOR_TOKENS = /(::|->|=>|#)/;
|
|
13
|
+
/**
|
|
14
|
+
* Check if a token looks like a code identifier.
|
|
15
|
+
*/
|
|
16
|
+
function isIdentifierToken(token) {
|
|
17
|
+
if (CAMEL_CASE.test(token))
|
|
18
|
+
return true;
|
|
19
|
+
if (PASCAL_CASE.test(token))
|
|
20
|
+
return true;
|
|
21
|
+
if (SNAKE_CASE.test(token))
|
|
22
|
+
return true;
|
|
23
|
+
if (CONSTANT_CASE.test(token))
|
|
24
|
+
return true;
|
|
25
|
+
if (DOT_NOTATION.test(token))
|
|
26
|
+
return true;
|
|
27
|
+
if (OPERATOR_TOKENS.test(token))
|
|
28
|
+
return true;
|
|
29
|
+
return false;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Classify a search query and return the recommended alpha value
|
|
33
|
+
* for CCS fusion.
|
|
34
|
+
*
|
|
35
|
+
* - identifier-heavy queries (camelCase, snake_case, etc.): alpha = 0.25 (favor BM25)
|
|
36
|
+
* - natural language queries ("authentication middleware"): alpha = 0.75 (favor vectors)
|
|
37
|
+
* - mixed queries ("fix CORS error in Express"): alpha = 0.55 (balanced)
|
|
38
|
+
*/
|
|
39
|
+
export function classifyQuery(query) {
|
|
40
|
+
const tokens = query.split(/\s+/).filter(t => t.length > 0);
|
|
41
|
+
if (tokens.length === 0) {
|
|
42
|
+
return { type: 'natural', alpha: 0.75 };
|
|
43
|
+
}
|
|
44
|
+
let identifierCount = 0;
|
|
45
|
+
for (const token of tokens) {
|
|
46
|
+
if (isIdentifierToken(token)) {
|
|
47
|
+
identifierCount++;
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
const ratio = identifierCount / tokens.length;
|
|
51
|
+
if (ratio > 0.5) {
|
|
52
|
+
return { type: 'identifier', alpha: 0.25 };
|
|
53
|
+
}
|
|
54
|
+
if (ratio < 0.2) {
|
|
55
|
+
return { type: 'natural', alpha: 0.75 };
|
|
56
|
+
}
|
|
57
|
+
return { type: 'mixed', alpha: 0.55 };
|
|
58
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Min-max score normalization for hybrid search fusion.
|
|
3
|
+
* Normalizes scores to [0, 1] range for CCS combination.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Apply min-max normalization to an array of scores.
 *
 * normalized = (score - min) / (max - min)
 *
 * Edge cases:
 * - Empty array → []
 * - Single result → [1.0]
 * - All same scores → all 1.0
 *
 * @param scores Raw scores.
 * @returns A new array of the same length and order as `scores`.
 */
export declare function minMaxNormalize(scores: number[]): number[];
|
|
16
|
+
//# sourceMappingURL=score-normalizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"score-normalizer.d.ts","sourceRoot":"","sources":["../../../src/search/score-normalizer.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;;;;;;;;GASG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAW1D"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Min-max score normalization for hybrid search fusion.
|
|
3
|
+
* Normalizes scores to [0, 1] range for CCS combination.
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Apply min-max normalization to an array of scores.
|
|
7
|
+
*
|
|
8
|
+
* normalized = (score - min) / (max - min)
|
|
9
|
+
*
|
|
10
|
+
* Edge cases:
|
|
11
|
+
* - Empty array → []
|
|
12
|
+
* - Single result → [1.0]
|
|
13
|
+
* - All same scores → all 1.0
|
|
14
|
+
*/
|
|
15
|
+
export function minMaxNormalize(scores) {
|
|
16
|
+
if (scores.length === 0)
|
|
17
|
+
return [];
|
|
18
|
+
if (scores.length === 1)
|
|
19
|
+
return [1.0];
|
|
20
|
+
const min = Math.min(...scores);
|
|
21
|
+
const max = Math.max(...scores);
|
|
22
|
+
const range = max - min;
|
|
23
|
+
if (range === 0)
|
|
24
|
+
return scores.map(() => 1.0);
|
|
25
|
+
return scores.map(s => (s - min) / range);
|
|
26
|
+
}
|