bluera-knowledge 0.9.25 → 0.9.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/commit.md +4 -7
- package/.claude/hooks/post-edit-check.sh +21 -24
- package/.claude/skills/atomic-commits/SKILL.md +6 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.env.example +4 -0
- package/.husky/pre-push +12 -2
- package/.versionrc.json +0 -4
- package/CHANGELOG.md +76 -0
- package/README.md +55 -20
- package/bun.lock +35 -1
- package/commands/crawl.md +2 -0
- package/dist/{chunk-BICFAWMN.js → chunk-DNOIM7BO.js} +73 -8
- package/dist/chunk-DNOIM7BO.js.map +1 -0
- package/dist/{chunk-5QMHZUC4.js → chunk-NJUMU4X2.js} +462 -105
- package/dist/chunk-NJUMU4X2.js.map +1 -0
- package/dist/{chunk-J7J6LXOJ.js → chunk-SZNTYLYT.js} +106 -41
- package/dist/chunk-SZNTYLYT.js.map +1 -0
- package/dist/index.js +65 -25
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +2 -2
- package/eslint.config.js +1 -1
- package/package.json +3 -1
- package/src/analysis/ast-parser.test.ts +46 -0
- package/src/cli/commands/crawl.test.ts +99 -12
- package/src/cli/commands/crawl.ts +76 -24
- package/src/crawl/article-converter.ts +36 -1
- package/src/crawl/bridge.ts +18 -7
- package/src/crawl/intelligent-crawler.ts +45 -4
- package/src/db/embeddings.test.ts +16 -0
- package/src/logging/index.ts +29 -0
- package/src/logging/logger.test.ts +75 -0
- package/src/logging/logger.ts +147 -0
- package/src/logging/payload.test.ts +152 -0
- package/src/logging/payload.ts +121 -0
- package/src/mcp/handlers/search.handler.test.ts +28 -9
- package/src/mcp/handlers/search.handler.ts +69 -29
- package/src/mcp/handlers/store.handler.test.ts +1 -0
- package/src/mcp/server.ts +44 -16
- package/src/services/chunking.service.ts +23 -0
- package/src/services/index.service.test.ts +921 -1
- package/src/services/index.service.ts +76 -1
- package/src/services/index.ts +10 -1
- package/src/services/search.service.test.ts +573 -21
- package/src/services/search.service.ts +257 -105
- package/src/services/snippet.service.ts +28 -3
- package/src/services/token.service.test.ts +45 -0
- package/src/services/token.service.ts +33 -0
- package/src/types/result.test.ts +10 -0
- package/src/workers/spawn-worker.test.ts +19 -21
- package/tests/integration/cli-consistency.test.ts +1 -4
- package/vitest.config.ts +4 -0
- package/dist/chunk-5QMHZUC4.js.map +0 -1
- package/dist/chunk-BICFAWMN.js.map +0 -1
- package/dist/chunk-J7J6LXOJ.js.map +0 -1
- package/scripts/readme-version-updater.cjs +0 -18
|
@@ -6,13 +6,24 @@ import { CodeUnitService } from './code-unit.service.js';
|
|
|
6
6
|
import type { CodeUnit } from '../types/search.js';
|
|
7
7
|
import type { CodeGraphService } from './code-graph.service.js';
|
|
8
8
|
import type { CodeGraph } from '../analysis/code-graph.js';
|
|
9
|
+
import { createLogger } from '../logging/index.js';
|
|
10
|
+
|
|
11
|
+
const logger = createLogger('search-service');
|
|
9
12
|
|
|
10
13
|
/**
|
|
11
14
|
* Query intent classification for context-aware ranking.
|
|
12
|
-
*
|
|
15
|
+
* Different intents prioritize different content types.
|
|
13
16
|
*/
|
|
14
17
|
export type QueryIntent = 'how-to' | 'implementation' | 'conceptual' | 'comparison' | 'debugging';
|
|
15
18
|
|
|
19
|
+
/**
|
|
20
|
+
* Classified intent with confidence score for multi-intent queries.
|
|
21
|
+
*/
|
|
22
|
+
export interface ClassifiedIntent {
|
|
23
|
+
intent: QueryIntent;
|
|
24
|
+
confidence: number;
|
|
25
|
+
}
|
|
26
|
+
|
|
16
27
|
/**
|
|
17
28
|
* Intent-based file type multipliers - CONSERVATIVE version.
|
|
18
29
|
* Applied on top of base file-type boosts.
|
|
@@ -84,101 +95,120 @@ const FRAMEWORK_PATTERNS: Array<{ pattern: RegExp; terms: string[] }> = [
|
|
|
84
95
|
{ pattern: /\bjwt\b/i, terms: ['jwt', 'jsonwebtoken', 'json-web-token'] },
|
|
85
96
|
];
|
|
86
97
|
|
|
98
|
+
// Pattern definitions for intent classification
|
|
99
|
+
const HOW_TO_PATTERNS = [
|
|
100
|
+
/how (do|can|should|would) (i|you|we)/i,
|
|
101
|
+
/how to\b/i,
|
|
102
|
+
/what('s| is) the (best |right |correct )?(way|approach) to/i,
|
|
103
|
+
/i (need|want|have) to/i,
|
|
104
|
+
/show me how/i,
|
|
105
|
+
/\bwhat's the syntax\b/i,
|
|
106
|
+
/\bhow do i (use|create|make|set up|configure|implement|add|get)\b/i,
|
|
107
|
+
/\bi'm (trying|building|creating|making)\b/i,
|
|
108
|
+
];
|
|
109
|
+
|
|
110
|
+
const IMPLEMENTATION_PATTERNS = [
|
|
111
|
+
/how (does|is) .* (implemented|work internally)/i,
|
|
112
|
+
/\binternal(ly)?\b/i,
|
|
113
|
+
/\bsource code\b/i,
|
|
114
|
+
/\bunder the hood\b/i,
|
|
115
|
+
/\bimplementation (of|details?)\b/i,
|
|
116
|
+
];
|
|
117
|
+
|
|
118
|
+
const COMPARISON_PATTERNS = [
|
|
119
|
+
/\b(vs\.?|versus)\b/i,
|
|
120
|
+
/\bdifference(s)? between\b/i,
|
|
121
|
+
/\bcompare\b/i,
|
|
122
|
+
/\bshould (i|we) use .* or\b/i,
|
|
123
|
+
/\bwhat's the difference\b/i,
|
|
124
|
+
/\bwhich (one|is better)\b/i,
|
|
125
|
+
/\bwhen (should|to) use\b/i,
|
|
126
|
+
];
|
|
127
|
+
|
|
128
|
+
const DEBUGGING_PATTERNS = [
|
|
129
|
+
/\b(error|bug|issue|problem|crash|fail|broken|wrong)\b/i,
|
|
130
|
+
/\bdoesn't (work|compile|run)\b/i,
|
|
131
|
+
/\bisn't (working|updating|rendering)\b/i,
|
|
132
|
+
/\bwhy (is|does|doesn't|isn't)\b/i,
|
|
133
|
+
/\bwhat('s| is) (wrong|happening|going on)\b/i,
|
|
134
|
+
/\bwhat am i doing wrong\b/i,
|
|
135
|
+
/\bnot (working|updating|showing)\b/i,
|
|
136
|
+
/\bhow do i (fix|debug|solve|resolve)\b/i,
|
|
137
|
+
];
|
|
138
|
+
|
|
139
|
+
const CONCEPTUAL_PATTERNS = [
|
|
140
|
+
/\bwhat (is|are)\b/i,
|
|
141
|
+
/\bexplain\b/i,
|
|
142
|
+
/\bwhat does .* (mean|do)\b/i,
|
|
143
|
+
/\bhow does .* work\b/i,
|
|
144
|
+
/\bwhat('s| is) the (purpose|point|idea)\b/i,
|
|
145
|
+
];
|
|
146
|
+
|
|
87
147
|
/**
|
|
88
|
-
* Classify
|
|
89
|
-
*
|
|
148
|
+
* Classify query intents with confidence scores.
|
|
149
|
+
* Returns all matching intents, allowing queries to have multiple intents.
|
|
90
150
|
*/
|
|
91
|
-
function
|
|
151
|
+
function classifyQueryIntents(query: string): ClassifiedIntent[] {
|
|
92
152
|
const q = query.toLowerCase();
|
|
153
|
+
const intents: ClassifiedIntent[] = [];
|
|
154
|
+
|
|
155
|
+
// Check all pattern groups and add matching intents with confidence
|
|
156
|
+
if (IMPLEMENTATION_PATTERNS.some(p => p.test(q))) {
|
|
157
|
+
intents.push({ intent: 'implementation', confidence: 0.9 });
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
if (DEBUGGING_PATTERNS.some(p => p.test(q))) {
|
|
161
|
+
intents.push({ intent: 'debugging', confidence: 0.85 });
|
|
162
|
+
}
|
|
93
163
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
//
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
/\bshould (i|we) use .* or\b/i,
|
|
121
|
-
/\bwhat's the difference\b/i,
|
|
122
|
-
/\bwhich (one|is better)\b/i,
|
|
123
|
-
/\bwhen (should|to) use\b/i,
|
|
124
|
-
];
|
|
125
|
-
|
|
126
|
-
// Debugging patterns: user is troubleshooting a problem
|
|
127
|
-
const debuggingPatterns = [
|
|
128
|
-
/\b(error|bug|issue|problem|crash|fail|broken|wrong)\b/i,
|
|
129
|
-
/\bdoesn't (work|compile|run)\b/i,
|
|
130
|
-
/\bisn't (working|updating|rendering)\b/i,
|
|
131
|
-
/\bwhy (is|does|doesn't|isn't)\b/i,
|
|
132
|
-
/\bwhat('s| is) (wrong|happening|going on)\b/i,
|
|
133
|
-
/\bwhat am i doing wrong\b/i,
|
|
134
|
-
/\bnot (working|updating|showing)\b/i,
|
|
135
|
-
/\bhow do i (fix|debug|solve|resolve)\b/i,
|
|
136
|
-
];
|
|
137
|
-
|
|
138
|
-
// Conceptual patterns: user wants to understand a concept
|
|
139
|
-
const conceptualPatterns = [
|
|
140
|
-
/\bwhat (is|are)\b/i,
|
|
141
|
-
/\bexplain\b/i,
|
|
142
|
-
/\bwhat does .* (mean|do)\b/i,
|
|
143
|
-
/\bhow does .* work\b/i,
|
|
144
|
-
/\bwhat('s| is) the (purpose|point|idea)\b/i,
|
|
145
|
-
];
|
|
146
|
-
|
|
147
|
-
// Check patterns in order of specificity
|
|
148
|
-
if (implementationPatterns.some(p => p.test(q))) {
|
|
149
|
-
return 'implementation';
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
if (debuggingPatterns.some(p => p.test(q))) {
|
|
153
|
-
return 'debugging';
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
if (comparisonPatterns.some(p => p.test(q))) {
|
|
157
|
-
return 'comparison';
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
if (howToPatterns.some(p => p.test(q))) {
|
|
161
|
-
return 'how-to';
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
if (conceptualPatterns.some(p => p.test(q))) {
|
|
165
|
-
return 'conceptual';
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
// Default to how-to as most queries are seeking practical usage
|
|
169
|
-
return 'how-to';
|
|
164
|
+
if (COMPARISON_PATTERNS.some(p => p.test(q))) {
|
|
165
|
+
intents.push({ intent: 'comparison', confidence: 0.8 });
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
if (HOW_TO_PATTERNS.some(p => p.test(q))) {
|
|
169
|
+
intents.push({ intent: 'how-to', confidence: 0.75 });
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
if (CONCEPTUAL_PATTERNS.some(p => p.test(q))) {
|
|
173
|
+
intents.push({ intent: 'conceptual', confidence: 0.7 });
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// If no patterns match, use how-to as the baseline intent
|
|
177
|
+
if (intents.length === 0) {
|
|
178
|
+
intents.push({ intent: 'how-to', confidence: 0.5 });
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// Sort by confidence descending
|
|
182
|
+
return intents.sort((a, b) => b.confidence - a.confidence);
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
/**
|
|
186
|
+
* Get primary intent for logging/display purposes.
|
|
187
|
+
*/
|
|
188
|
+
function getPrimaryIntent(intents: ClassifiedIntent[]): QueryIntent {
|
|
189
|
+
return intents[0]?.intent ?? 'how-to';
|
|
170
190
|
}
|
|
171
191
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
192
|
+
/**
|
|
193
|
+
* RRF presets for different content types.
|
|
194
|
+
* Web/docs content uses higher k to reduce noise from repetitive structure.
|
|
195
|
+
*/
|
|
196
|
+
const RRF_PRESETS = {
|
|
197
|
+
code: { k: 20, vectorWeight: 0.6, ftsWeight: 0.4 },
|
|
198
|
+
web: { k: 30, vectorWeight: 0.55, ftsWeight: 0.45 },
|
|
199
|
+
} as const;
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Detect if results are primarily web content (have urls vs file paths).
|
|
203
|
+
*/
|
|
204
|
+
function detectContentType(results: SearchResult[]): 'web' | 'code' {
|
|
205
|
+
const webCount = results.filter(r => 'url' in r.metadata).length;
|
|
206
|
+
return webCount > results.length / 2 ? 'web' : 'code';
|
|
176
207
|
}
|
|
177
208
|
|
|
178
209
|
export class SearchService {
|
|
179
210
|
private readonly lanceStore: LanceStore;
|
|
180
211
|
private readonly embeddingEngine: EmbeddingEngine;
|
|
181
|
-
private readonly rrfConfig: RRFConfig;
|
|
182
212
|
private readonly codeUnitService: CodeUnitService;
|
|
183
213
|
private readonly codeGraphService: CodeGraphService | undefined;
|
|
184
214
|
private readonly graphCache: Map<string, CodeGraph | null>;
|
|
@@ -186,13 +216,10 @@ export class SearchService {
|
|
|
186
216
|
constructor(
|
|
187
217
|
lanceStore: LanceStore,
|
|
188
218
|
embeddingEngine: EmbeddingEngine,
|
|
189
|
-
// Lower k value (20 vs 60) produces more differentiated scores for top results
|
|
190
|
-
rrfConfig: RRFConfig = { k: 20, vectorWeight: 0.6, ftsWeight: 0.4 },
|
|
191
219
|
codeGraphService?: CodeGraphService
|
|
192
220
|
) {
|
|
193
221
|
this.lanceStore = lanceStore;
|
|
194
222
|
this.embeddingEngine = embeddingEngine;
|
|
195
|
-
this.rrfConfig = rrfConfig;
|
|
196
223
|
this.codeUnitService = new CodeUnitService();
|
|
197
224
|
this.codeGraphService = codeGraphService;
|
|
198
225
|
this.graphCache = new Map();
|
|
@@ -220,6 +247,18 @@ export class SearchService {
|
|
|
220
247
|
const limit = query.limit ?? 10;
|
|
221
248
|
const stores = query.stores ?? [];
|
|
222
249
|
const detail = query.detail ?? 'minimal';
|
|
250
|
+
const intents = classifyQueryIntents(query.query);
|
|
251
|
+
const primaryIntent = getPrimaryIntent(intents);
|
|
252
|
+
|
|
253
|
+
logger.debug({
|
|
254
|
+
query: query.query,
|
|
255
|
+
mode,
|
|
256
|
+
limit,
|
|
257
|
+
stores,
|
|
258
|
+
detail,
|
|
259
|
+
intent: primaryIntent,
|
|
260
|
+
intents,
|
|
261
|
+
}, 'Search query received');
|
|
223
262
|
|
|
224
263
|
let allResults: SearchResult[] = [];
|
|
225
264
|
|
|
@@ -254,13 +293,24 @@ export class SearchService {
|
|
|
254
293
|
return this.addProgressiveContext(r, query.query, detail, graph);
|
|
255
294
|
});
|
|
256
295
|
|
|
296
|
+
const timeMs = Date.now() - startTime;
|
|
297
|
+
|
|
298
|
+
logger.info({
|
|
299
|
+
query: query.query,
|
|
300
|
+
mode,
|
|
301
|
+
resultCount: enhancedResults.length,
|
|
302
|
+
dedupedFrom: allResults.length,
|
|
303
|
+
intents: intents.map(i => `${i.intent}(${i.confidence.toFixed(2)})`),
|
|
304
|
+
timeMs,
|
|
305
|
+
}, 'Search complete');
|
|
306
|
+
|
|
257
307
|
return {
|
|
258
308
|
query: query.query,
|
|
259
309
|
mode,
|
|
260
310
|
stores,
|
|
261
311
|
results: enhancedResults,
|
|
262
312
|
totalResults: enhancedResults.length,
|
|
263
|
-
timeMs
|
|
313
|
+
timeMs,
|
|
264
314
|
};
|
|
265
315
|
}
|
|
266
316
|
|
|
@@ -273,20 +323,22 @@ export class SearchService {
|
|
|
273
323
|
const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
|
|
274
324
|
|
|
275
325
|
for (const result of results) {
|
|
276
|
-
// Use file path as the source key,
|
|
326
|
+
// Use file path as the source key (or url for web content, or id as last resort)
|
|
277
327
|
const sourceKey = result.metadata.path ?? result.metadata.url ?? result.id;
|
|
278
328
|
|
|
279
329
|
const existing = bySource.get(sourceKey);
|
|
280
330
|
if (!existing) {
|
|
281
331
|
bySource.set(sourceKey, result);
|
|
282
332
|
} else {
|
|
283
|
-
//
|
|
333
|
+
// Score-weighted relevance: accounts for fileType/framework boosts
|
|
284
334
|
const existingTermCount = this.countQueryTerms(existing.content, queryTerms);
|
|
285
335
|
const newTermCount = this.countQueryTerms(result.content, queryTerms);
|
|
286
336
|
|
|
287
|
-
//
|
|
288
|
-
|
|
289
|
-
|
|
337
|
+
// Weight term count by score to account for ranking boosts
|
|
338
|
+
const existingRelevance = existingTermCount * existing.score;
|
|
339
|
+
const newRelevance = newTermCount * result.score;
|
|
340
|
+
|
|
341
|
+
if (newRelevance > existingRelevance) {
|
|
290
342
|
bySource.set(sourceKey, result);
|
|
291
343
|
}
|
|
292
344
|
}
|
|
@@ -352,8 +404,8 @@ export class SearchService {
|
|
|
352
404
|
limit: number,
|
|
353
405
|
threshold?: number
|
|
354
406
|
): Promise<SearchResult[]> {
|
|
355
|
-
//
|
|
356
|
-
const
|
|
407
|
+
// Classify query intents for context-aware ranking (supports multiple intents)
|
|
408
|
+
const intents = classifyQueryIntents(query);
|
|
357
409
|
|
|
358
410
|
// Get both result sets
|
|
359
411
|
const [vectorResults, ftsResults] = await Promise.all([
|
|
@@ -390,9 +442,14 @@ export class SearchService {
|
|
|
390
442
|
ftsRRF: number;
|
|
391
443
|
fileTypeBoost: number;
|
|
392
444
|
frameworkBoost: number;
|
|
445
|
+
urlKeywordBoost: number;
|
|
446
|
+
pathKeywordBoost: number;
|
|
393
447
|
};
|
|
394
448
|
}> = [];
|
|
395
|
-
|
|
449
|
+
|
|
450
|
+
// Select RRF config based on content type (web vs code)
|
|
451
|
+
const contentType = detectContentType([...allDocs.values()]);
|
|
452
|
+
const { k, vectorWeight, ftsWeight } = RRF_PRESETS[contentType];
|
|
396
453
|
|
|
397
454
|
for (const [id, result] of allDocs) {
|
|
398
455
|
const vectorRank = vectorRanks.get(id) ?? Infinity;
|
|
@@ -401,16 +458,22 @@ export class SearchService {
|
|
|
401
458
|
const vectorRRF = vectorRank !== Infinity ? vectorWeight / (k + vectorRank) : 0;
|
|
402
459
|
const ftsRRF = ftsRank !== Infinity ? ftsWeight / (k + ftsRank) : 0;
|
|
403
460
|
|
|
404
|
-
// Apply file-type boost (base + intent-adjusted)
|
|
461
|
+
// Apply file-type boost (base + multi-intent-adjusted)
|
|
405
462
|
const fileTypeBoost = this.getFileTypeBoost(
|
|
406
463
|
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
|
|
407
464
|
result.metadata['fileType'] as string | undefined,
|
|
408
|
-
|
|
465
|
+
intents
|
|
409
466
|
);
|
|
410
467
|
|
|
411
468
|
// Apply framework context boost
|
|
412
469
|
const frameworkBoost = this.getFrameworkContextBoost(query, result);
|
|
413
470
|
|
|
471
|
+
// Apply URL keyword boost (helps "troubleshooting" find /troubleshooting pages)
|
|
472
|
+
const urlKeywordBoost = this.getUrlKeywordBoost(query, result);
|
|
473
|
+
|
|
474
|
+
// Apply path keyword boost (helps "dispatcher" find async_dispatcher.py)
|
|
475
|
+
const pathKeywordBoost = this.getPathKeywordBoost(query, result);
|
|
476
|
+
|
|
414
477
|
const metadata: {
|
|
415
478
|
vectorRank?: number;
|
|
416
479
|
ftsRank?: number;
|
|
@@ -418,11 +481,15 @@ export class SearchService {
|
|
|
418
481
|
ftsRRF: number;
|
|
419
482
|
fileTypeBoost: number;
|
|
420
483
|
frameworkBoost: number;
|
|
484
|
+
urlKeywordBoost: number;
|
|
485
|
+
pathKeywordBoost: number;
|
|
421
486
|
} = {
|
|
422
487
|
vectorRRF,
|
|
423
488
|
ftsRRF,
|
|
424
489
|
fileTypeBoost,
|
|
425
490
|
frameworkBoost,
|
|
491
|
+
urlKeywordBoost,
|
|
492
|
+
pathKeywordBoost,
|
|
426
493
|
};
|
|
427
494
|
|
|
428
495
|
if (vectorRank !== Infinity) {
|
|
@@ -434,7 +501,7 @@ export class SearchService {
|
|
|
434
501
|
|
|
435
502
|
rrfScores.push({
|
|
436
503
|
id,
|
|
437
|
-
score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost,
|
|
504
|
+
score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost * urlKeywordBoost * pathKeywordBoost,
|
|
438
505
|
result,
|
|
439
506
|
metadata,
|
|
440
507
|
});
|
|
@@ -490,7 +557,7 @@ export class SearchService {
|
|
|
490
557
|
* Phase 4: Strengthened boosts for better documentation ranking.
|
|
491
558
|
* Phase 1: Intent-based adjustments for context-aware ranking.
|
|
492
559
|
*/
|
|
493
|
-
private getFileTypeBoost(fileType: string | undefined,
|
|
560
|
+
private getFileTypeBoost(fileType: string | undefined, intents: ClassifiedIntent[]): number {
|
|
494
561
|
// Base file-type boosts
|
|
495
562
|
let baseBoost: number;
|
|
496
563
|
switch (fileType) {
|
|
@@ -519,11 +586,96 @@ export class SearchService {
|
|
|
519
586
|
baseBoost = 1.0;
|
|
520
587
|
}
|
|
521
588
|
|
|
522
|
-
//
|
|
523
|
-
|
|
524
|
-
|
|
589
|
+
// Blend intent-based multipliers weighted by confidence
|
|
590
|
+
let weightedMultiplier = 0;
|
|
591
|
+
let totalConfidence = 0;
|
|
592
|
+
|
|
593
|
+
for (const { intent, confidence } of intents) {
|
|
594
|
+
const intentBoosts = INTENT_FILE_BOOSTS[intent];
|
|
595
|
+
const multiplier = intentBoosts[fileType ?? 'other'] ?? 1.0;
|
|
596
|
+
weightedMultiplier += multiplier * confidence;
|
|
597
|
+
totalConfidence += confidence;
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
const blendedMultiplier = totalConfidence > 0
|
|
601
|
+
? weightedMultiplier / totalConfidence
|
|
602
|
+
: 1.0;
|
|
603
|
+
|
|
604
|
+
return baseBoost * blendedMultiplier;
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
/**
|
|
608
|
+
* Get a score multiplier based on URL keyword matching.
|
|
609
|
+
* Boosts results where URL path contains significant query keywords.
|
|
610
|
+
* This helps queries like "troubleshooting" rank /troubleshooting pages first.
|
|
611
|
+
*/
|
|
612
|
+
private getUrlKeywordBoost(query: string, result: SearchResult): number {
|
|
613
|
+
const url = result.metadata.url;
|
|
614
|
+
if (url === undefined || url === '') return 1.0;
|
|
615
|
+
|
|
616
|
+
// Extract path segments from URL and normalize
|
|
617
|
+
const urlPath = url.toLowerCase().replace(/[^a-z0-9]+/g, ' ');
|
|
618
|
+
|
|
619
|
+
// Common stop words to filter from queries
|
|
620
|
+
const stopWords = new Set([
|
|
621
|
+
'how', 'to', 'the', 'a', 'an', 'is', 'are', 'what', 'why', 'when',
|
|
622
|
+
'where', 'can', 'do', 'does', 'i', 'my', 'your', 'it', 'in', 'on',
|
|
623
|
+
'for', 'with', 'this', 'that', 'get', 'use', 'using'
|
|
624
|
+
]);
|
|
625
|
+
|
|
626
|
+
// Extract meaningful query terms
|
|
627
|
+
const queryTerms = query.toLowerCase()
|
|
628
|
+
.split(/\s+/)
|
|
629
|
+
.filter(t => t.length > 2 && !stopWords.has(t));
|
|
630
|
+
|
|
631
|
+
if (queryTerms.length === 0) return 1.0;
|
|
632
|
+
|
|
633
|
+
// Count matching terms in URL path
|
|
634
|
+
const matchingTerms = queryTerms.filter(term => urlPath.includes(term));
|
|
635
|
+
|
|
636
|
+
if (matchingTerms.length === 0) return 1.0;
|
|
637
|
+
|
|
638
|
+
// Boost based on proportion of matching terms
|
|
639
|
+
// Single match: ~1.5, all terms match: ~2.0
|
|
640
|
+
const matchRatio = matchingTerms.length / queryTerms.length;
|
|
641
|
+
return 1.0 + (1.0 * matchRatio);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
/**
|
|
645
|
+
* Get a score multiplier based on file path keyword matching.
|
|
646
|
+
* Boosts results where file path contains significant query keywords.
|
|
647
|
+
* This helps queries like "dispatcher" rank async_dispatcher.py higher.
|
|
648
|
+
*/
|
|
649
|
+
private getPathKeywordBoost(query: string, result: SearchResult): number {
|
|
650
|
+
const path = result.metadata.path;
|
|
651
|
+
if (path === undefined || path === '') return 1.0;
|
|
652
|
+
|
|
653
|
+
// Extract path segments and normalize (split on slashes, dots, underscores, etc.)
|
|
654
|
+
const pathSegments = path.toLowerCase().replace(/[^a-z0-9]+/g, ' ');
|
|
655
|
+
|
|
656
|
+
// Common stop words to filter from queries
|
|
657
|
+
const stopWords = new Set([
|
|
658
|
+
'how', 'to', 'the', 'a', 'an', 'is', 'are', 'what', 'why', 'when',
|
|
659
|
+
'where', 'can', 'do', 'does', 'i', 'my', 'your', 'it', 'in', 'on',
|
|
660
|
+
'for', 'with', 'this', 'that', 'get', 'use', 'using'
|
|
661
|
+
]);
|
|
662
|
+
|
|
663
|
+
// Extract meaningful query terms
|
|
664
|
+
const queryTerms = query.toLowerCase()
|
|
665
|
+
.split(/\s+/)
|
|
666
|
+
.filter(t => t.length > 2 && !stopWords.has(t));
|
|
667
|
+
|
|
668
|
+
if (queryTerms.length === 0) return 1.0;
|
|
669
|
+
|
|
670
|
+
// Count matching terms in file path
|
|
671
|
+
const matchingTerms = queryTerms.filter(term => pathSegments.includes(term));
|
|
672
|
+
|
|
673
|
+
if (matchingTerms.length === 0) return 1.0;
|
|
525
674
|
|
|
526
|
-
|
|
675
|
+
// Boost based on proportion of matching terms
|
|
676
|
+
// Single match: ~1.5, all terms match: ~2.0
|
|
677
|
+
const matchRatio = matchingTerms.length / queryTerms.length;
|
|
678
|
+
return 1.0 + (1.0 * matchRatio);
|
|
527
679
|
}
|
|
528
680
|
|
|
529
681
|
/**
|
|
@@ -54,6 +54,11 @@ export function extractSnippet(
|
|
|
54
54
|
|
|
55
55
|
/**
|
|
56
56
|
* Find the position in content where the most query terms cluster together.
|
|
57
|
+
* Uses multi-factor scoring:
|
|
58
|
+
* - Query term density (base score)
|
|
59
|
+
* - Sentence completeness bonus
|
|
60
|
+
* - Code example presence bonus
|
|
61
|
+
* - Section header proximity bonus
|
|
57
62
|
*/
|
|
58
63
|
function findBestMatchPosition(content: string, queryTerms: string[]): number {
|
|
59
64
|
const lowerContent = content.toLowerCase();
|
|
@@ -73,7 +78,6 @@ function findBestMatchPosition(content: string, queryTerms: string[]): number {
|
|
|
73
78
|
return -1;
|
|
74
79
|
}
|
|
75
80
|
|
|
76
|
-
// Score each position by how many other terms are nearby (within 200 chars)
|
|
77
81
|
const PROXIMITY_WINDOW = 200;
|
|
78
82
|
const firstTerm = termPositions[0];
|
|
79
83
|
if (firstTerm === undefined) {
|
|
@@ -83,15 +87,36 @@ function findBestMatchPosition(content: string, queryTerms: string[]): number {
|
|
|
83
87
|
let bestScore = 0;
|
|
84
88
|
|
|
85
89
|
for (const { position } of termPositions) {
|
|
86
|
-
//
|
|
90
|
+
// Base score: count unique terms within proximity window
|
|
87
91
|
const nearbyTerms = new Set<string>();
|
|
88
92
|
for (const { term, position: otherPos } of termPositions) {
|
|
89
93
|
if (Math.abs(position - otherPos) <= PROXIMITY_WINDOW) {
|
|
90
94
|
nearbyTerms.add(term);
|
|
91
95
|
}
|
|
92
96
|
}
|
|
97
|
+
let score = nearbyTerms.size * 10; // Base: 10 points per unique term
|
|
98
|
+
|
|
99
|
+
// Extract window around position for bonus scoring
|
|
100
|
+
const windowStart = Math.max(0, position - PROXIMITY_WINDOW / 2);
|
|
101
|
+
const windowEnd = Math.min(content.length, position + PROXIMITY_WINDOW / 2);
|
|
102
|
+
const window = content.slice(windowStart, windowEnd);
|
|
103
|
+
|
|
104
|
+
// Bonus: Sentence completeness (contains sentence-ending punctuation)
|
|
105
|
+
if (/[.!?]/.test(window)) {
|
|
106
|
+
score += 5;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// Bonus: Code example presence (backticks, brackets, common code patterns)
|
|
110
|
+
if (/[`{}()[\]]|=>|function|const |let |var /.test(window)) {
|
|
111
|
+
score += 3;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Bonus: Near markdown section header
|
|
115
|
+
const headerMatch = content.slice(Math.max(0, position - 100), position).match(/^#{1,3}\s+.+$/m);
|
|
116
|
+
if (headerMatch) {
|
|
117
|
+
score += 4;
|
|
118
|
+
}
|
|
93
119
|
|
|
94
|
-
const score = nearbyTerms.size;
|
|
95
120
|
if (score > bestScore) {
|
|
96
121
|
bestScore = score;
|
|
97
122
|
bestPosition = position;
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { estimateTokens, formatTokenCount } from './token.service.js';
|
|
3
|
+
|
|
4
|
+
describe('token.service', () => {
|
|
5
|
+
describe('estimateTokens', () => {
|
|
6
|
+
it('returns 0 for empty string', () => {
|
|
7
|
+
expect(estimateTokens('')).toBe(0);
|
|
8
|
+
});
|
|
9
|
+
|
|
10
|
+
it('estimates tokens for short text', () => {
|
|
11
|
+
// "hello" = 5 chars, 5/3.5 = 1.43, ceil = 2
|
|
12
|
+
expect(estimateTokens('hello')).toBe(2);
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
it('estimates tokens for longer text', () => {
|
|
16
|
+
// 35 chars / 3.5 = 10 tokens
|
|
17
|
+
const text = 'a'.repeat(35);
|
|
18
|
+
expect(estimateTokens(text)).toBe(10);
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
it('rounds up token count', () => {
|
|
22
|
+
// 7 chars / 3.5 = 2 tokens exactly
|
|
23
|
+
expect(estimateTokens('abcdefg')).toBe(2);
|
|
24
|
+
// 8 chars / 3.5 = 2.29, ceil = 3
|
|
25
|
+
expect(estimateTokens('abcdefgh')).toBe(3);
|
|
26
|
+
});
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
describe('formatTokenCount', () => {
|
|
30
|
+
it('formats small counts without suffix', () => {
|
|
31
|
+
expect(formatTokenCount(100)).toBe('~100');
|
|
32
|
+
expect(formatTokenCount(999)).toBe('~999');
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
it('formats counts >= 1000 with k suffix', () => {
|
|
36
|
+
expect(formatTokenCount(1000)).toBe('~1.0k');
|
|
37
|
+
expect(formatTokenCount(1500)).toBe('~1.5k');
|
|
38
|
+
expect(formatTokenCount(10000)).toBe('~10.0k');
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
it('formats zero', () => {
|
|
42
|
+
expect(formatTokenCount(0)).toBe('~0');
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token estimation service using Anthropic's recommended heuristic.
|
|
3
|
+
* For Claude 3+ models, Anthropic recommends ~3.5 characters per token
|
|
4
|
+
* for English text. This varies by language.
|
|
5
|
+
*
|
|
6
|
+
* Note: The official @anthropic-ai/tokenizer package only works for
|
|
7
|
+
* pre-Claude 3 models. For accurate counts on Claude 3+, use the
|
|
8
|
+
* Token Count API. This heuristic is suitable for display purposes.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
const CHARS_PER_TOKEN = 3.5;
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Estimate token count for a string using character-based heuristic.
|
|
15
|
+
* @param text - The text to estimate tokens for
|
|
16
|
+
* @returns Estimated token count (rounded up)
|
|
17
|
+
*/
|
|
18
|
+
export function estimateTokens(text: string): number {
|
|
19
|
+
if (!text) return 0;
|
|
20
|
+
return Math.ceil(text.length / CHARS_PER_TOKEN);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Format token count for display with appropriate suffix.
|
|
25
|
+
* @param tokens - Token count
|
|
26
|
+
* @returns Formatted string like "~1.2k" or "~847"
|
|
27
|
+
*/
|
|
28
|
+
export function formatTokenCount(tokens: number): string {
|
|
29
|
+
if (tokens >= 1000) {
|
|
30
|
+
return `~${(tokens / 1000).toFixed(1)}k`;
|
|
31
|
+
}
|
|
32
|
+
return `~${String(tokens)}`;
|
|
33
|
+
}
|
package/src/types/result.test.ts
CHANGED
|
@@ -28,6 +28,16 @@ describe('Result type', () => {
|
|
|
28
28
|
const result = err(new Error('failed'));
|
|
29
29
|
expect(() => unwrap(result)).toThrow('failed');
|
|
30
30
|
});
|
|
31
|
+
|
|
32
|
+
it('throws wrapped error for non-Error error value', () => {
|
|
33
|
+
const result = err('string error message');
|
|
34
|
+
expect(() => unwrap(result)).toThrow('string error message');
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
it('converts non-string error to string', () => {
|
|
38
|
+
const result = err(404);
|
|
39
|
+
expect(() => unwrap(result)).toThrow('404');
|
|
40
|
+
});
|
|
31
41
|
});
|
|
32
42
|
|
|
33
43
|
describe('unwrapOr', () => {
|