bluera-knowledge 0.9.26 → 0.9.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/.claude/commands/commit.md +4 -7
  2. package/.claude/hooks/post-edit-check.sh +21 -24
  3. package/.claude/skills/atomic-commits/SKILL.md +6 -0
  4. package/.claude-plugin/plugin.json +1 -1
  5. package/.env.example +4 -0
  6. package/.husky/pre-push +12 -2
  7. package/.versionrc.json +0 -4
  8. package/CHANGELOG.md +69 -0
  9. package/README.md +55 -20
  10. package/bun.lock +35 -1
  11. package/commands/crawl.md +2 -0
  12. package/dist/{chunk-BICFAWMN.js → chunk-DNOIM7BO.js} +73 -8
  13. package/dist/chunk-DNOIM7BO.js.map +1 -0
  14. package/dist/{chunk-5QMHZUC4.js → chunk-NJUMU4X2.js} +462 -105
  15. package/dist/chunk-NJUMU4X2.js.map +1 -0
  16. package/dist/{chunk-J7J6LXOJ.js → chunk-SZNTYLYT.js} +106 -41
  17. package/dist/chunk-SZNTYLYT.js.map +1 -0
  18. package/dist/index.js +65 -25
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp/server.js +2 -2
  21. package/dist/workers/background-worker-cli.js +2 -2
  22. package/eslint.config.js +1 -1
  23. package/package.json +3 -1
  24. package/src/analysis/ast-parser.test.ts +46 -0
  25. package/src/cli/commands/crawl.test.ts +99 -12
  26. package/src/cli/commands/crawl.ts +76 -24
  27. package/src/crawl/article-converter.ts +36 -1
  28. package/src/crawl/bridge.ts +18 -7
  29. package/src/crawl/intelligent-crawler.ts +45 -4
  30. package/src/db/embeddings.test.ts +16 -0
  31. package/src/logging/index.ts +29 -0
  32. package/src/logging/logger.test.ts +75 -0
  33. package/src/logging/logger.ts +147 -0
  34. package/src/logging/payload.test.ts +152 -0
  35. package/src/logging/payload.ts +121 -0
  36. package/src/mcp/handlers/search.handler.test.ts +28 -9
  37. package/src/mcp/handlers/search.handler.ts +69 -29
  38. package/src/mcp/handlers/store.handler.test.ts +1 -0
  39. package/src/mcp/server.ts +44 -16
  40. package/src/services/chunking.service.ts +23 -0
  41. package/src/services/index.service.test.ts +921 -1
  42. package/src/services/index.service.ts +76 -1
  43. package/src/services/index.ts +10 -1
  44. package/src/services/search.service.test.ts +573 -21
  45. package/src/services/search.service.ts +257 -105
  46. package/src/services/snippet.service.ts +28 -3
  47. package/src/services/token.service.test.ts +45 -0
  48. package/src/services/token.service.ts +33 -0
  49. package/src/types/result.test.ts +10 -0
  50. package/tests/integration/cli-consistency.test.ts +1 -4
  51. package/vitest.config.ts +4 -0
  52. package/dist/chunk-5QMHZUC4.js.map +0 -1
  53. package/dist/chunk-BICFAWMN.js.map +0 -1
  54. package/dist/chunk-J7J6LXOJ.js.map +0 -1
  55. package/scripts/readme-version-updater.cjs +0 -18
@@ -6,13 +6,24 @@ import { CodeUnitService } from './code-unit.service.js';
6
6
  import type { CodeUnit } from '../types/search.js';
7
7
  import type { CodeGraphService } from './code-graph.service.js';
8
8
  import type { CodeGraph } from '../analysis/code-graph.js';
9
+ import { createLogger } from '../logging/index.js';
10
+
11
+ const logger = createLogger('search-service');
9
12
 
10
13
  /**
11
14
  * Query intent classification for context-aware ranking.
12
- * Phase 1: Different intents prioritize different content types.
15
+ * Different intents prioritize different content types.
13
16
  */
14
17
  export type QueryIntent = 'how-to' | 'implementation' | 'conceptual' | 'comparison' | 'debugging';
15
18
 
19
+ /**
20
+ * Classified intent with confidence score for multi-intent queries.
21
+ */
22
+ export interface ClassifiedIntent {
23
+ intent: QueryIntent;
24
+ confidence: number;
25
+ }
26
+
16
27
  /**
17
28
  * Intent-based file type multipliers - CONSERVATIVE version.
18
29
  * Applied on top of base file-type boosts.
@@ -84,101 +95,120 @@ const FRAMEWORK_PATTERNS: Array<{ pattern: RegExp; terms: string[] }> = [
84
95
  { pattern: /\bjwt\b/i, terms: ['jwt', 'jsonwebtoken', 'json-web-token'] },
85
96
  ];
86
97
 
98
+ // Pattern definitions for intent classification
99
+ const HOW_TO_PATTERNS = [
100
+ /how (do|can|should|would) (i|you|we)/i,
101
+ /how to\b/i,
102
+ /what('s| is) the (best |right |correct )?(way|approach) to/i,
103
+ /i (need|want|have) to/i,
104
+ /show me how/i,
105
+ /\bwhat's the syntax\b/i,
106
+ /\bhow do i (use|create|make|set up|configure|implement|add|get)\b/i,
107
+ /\bi'm (trying|building|creating|making)\b/i,
108
+ ];
109
+
110
+ const IMPLEMENTATION_PATTERNS = [
111
+ /how (does|is) .* (implemented|work internally)/i,
112
+ /\binternal(ly)?\b/i,
113
+ /\bsource code\b/i,
114
+ /\bunder the hood\b/i,
115
+ /\bimplementation (of|details?)\b/i,
116
+ ];
117
+
118
+ const COMPARISON_PATTERNS = [
119
+ /\b(vs\.?|versus)\b/i,
120
+ /\bdifference(s)? between\b/i,
121
+ /\bcompare\b/i,
122
+ /\bshould (i|we) use .* or\b/i,
123
+ /\bwhat's the difference\b/i,
124
+ /\bwhich (one|is better)\b/i,
125
+ /\bwhen (should|to) use\b/i,
126
+ ];
127
+
128
+ const DEBUGGING_PATTERNS = [
129
+ /\b(error|bug|issue|problem|crash|fail|broken|wrong)\b/i,
130
+ /\bdoesn't (work|compile|run)\b/i,
131
+ /\bisn't (working|updating|rendering)\b/i,
132
+ /\bwhy (is|does|doesn't|isn't)\b/i,
133
+ /\bwhat('s| is) (wrong|happening|going on)\b/i,
134
+ /\bwhat am i doing wrong\b/i,
135
+ /\bnot (working|updating|showing)\b/i,
136
+ /\bhow do i (fix|debug|solve|resolve)\b/i,
137
+ ];
138
+
139
+ const CONCEPTUAL_PATTERNS = [
140
+ /\bwhat (is|are)\b/i,
141
+ /\bexplain\b/i,
142
+ /\bwhat does .* (mean|do)\b/i,
143
+ /\bhow does .* work\b/i,
144
+ /\bwhat('s| is) the (purpose|point|idea)\b/i,
145
+ ];
146
+
87
147
  /**
88
- * Classify the intent of a search query.
89
- * This helps adjust ranking based on what kind of answer the user wants.
148
+ * Classify query intents with confidence scores.
149
+ * Returns all matching intents, allowing queries to have multiple intents.
90
150
  */
91
- function classifyQueryIntent(query: string): QueryIntent {
151
+ function classifyQueryIntents(query: string): ClassifiedIntent[] {
92
152
  const q = query.toLowerCase();
153
+ const intents: ClassifiedIntent[] = [];
154
+
155
+ // Check all pattern groups and add matching intents with confidence
156
+ if (IMPLEMENTATION_PATTERNS.some(p => p.test(q))) {
157
+ intents.push({ intent: 'implementation', confidence: 0.9 });
158
+ }
159
+
160
+ if (DEBUGGING_PATTERNS.some(p => p.test(q))) {
161
+ intents.push({ intent: 'debugging', confidence: 0.85 });
162
+ }
93
163
 
94
- // How-to patterns: user wants to learn how to use/do something
95
- const howToPatterns = [
96
- /how (do|can|should|would) (i|you|we)/i,
97
- /how to\b/i,
98
- /what('s| is) the (best |right |correct )?(way|approach) to/i,
99
- /i (need|want|have) to/i,
100
- /show me how/i,
101
- /\bwhat's the syntax\b/i,
102
- /\bhow do i (use|create|make|set up|configure|implement|add|get)\b/i,
103
- /\bi'm (trying|building|creating|making)\b/i,
104
- ];
105
-
106
- // Implementation patterns: user wants to understand internals
107
- const implementationPatterns = [
108
- /how (does|is) .* (implemented|work internally)/i,
109
- /\binternal(ly)?\b/i,
110
- /\bsource code\b/i,
111
- /\bunder the hood\b/i,
112
- /\bimplementation (of|details?)\b/i,
113
- ];
114
-
115
- // Comparison patterns: user is deciding between options
116
- const comparisonPatterns = [
117
- /\b(vs\.?|versus)\b/i,
118
- /\bdifference(s)? between\b/i,
119
- /\bcompare\b/i,
120
- /\bshould (i|we) use .* or\b/i,
121
- /\bwhat's the difference\b/i,
122
- /\bwhich (one|is better)\b/i,
123
- /\bwhen (should|to) use\b/i,
124
- ];
125
-
126
- // Debugging patterns: user is troubleshooting a problem
127
- const debuggingPatterns = [
128
- /\b(error|bug|issue|problem|crash|fail|broken|wrong)\b/i,
129
- /\bdoesn't (work|compile|run)\b/i,
130
- /\bisn't (working|updating|rendering)\b/i,
131
- /\bwhy (is|does|doesn't|isn't)\b/i,
132
- /\bwhat('s| is) (wrong|happening|going on)\b/i,
133
- /\bwhat am i doing wrong\b/i,
134
- /\bnot (working|updating|showing)\b/i,
135
- /\bhow do i (fix|debug|solve|resolve)\b/i,
136
- ];
137
-
138
- // Conceptual patterns: user wants to understand a concept
139
- const conceptualPatterns = [
140
- /\bwhat (is|are)\b/i,
141
- /\bexplain\b/i,
142
- /\bwhat does .* (mean|do)\b/i,
143
- /\bhow does .* work\b/i,
144
- /\bwhat('s| is) the (purpose|point|idea)\b/i,
145
- ];
146
-
147
- // Check patterns in order of specificity
148
- if (implementationPatterns.some(p => p.test(q))) {
149
- return 'implementation';
150
- }
151
-
152
- if (debuggingPatterns.some(p => p.test(q))) {
153
- return 'debugging';
154
- }
155
-
156
- if (comparisonPatterns.some(p => p.test(q))) {
157
- return 'comparison';
158
- }
159
-
160
- if (howToPatterns.some(p => p.test(q))) {
161
- return 'how-to';
162
- }
163
-
164
- if (conceptualPatterns.some(p => p.test(q))) {
165
- return 'conceptual';
166
- }
167
-
168
- // Default to how-to as most queries are seeking practical usage
169
- return 'how-to';
164
+ if (COMPARISON_PATTERNS.some(p => p.test(q))) {
165
+ intents.push({ intent: 'comparison', confidence: 0.8 });
166
+ }
167
+
168
+ if (HOW_TO_PATTERNS.some(p => p.test(q))) {
169
+ intents.push({ intent: 'how-to', confidence: 0.75 });
170
+ }
171
+
172
+ if (CONCEPTUAL_PATTERNS.some(p => p.test(q))) {
173
+ intents.push({ intent: 'conceptual', confidence: 0.7 });
174
+ }
175
+
176
+ // If no patterns match, use how-to as the baseline intent
177
+ if (intents.length === 0) {
178
+ intents.push({ intent: 'how-to', confidence: 0.5 });
179
+ }
180
+
181
+ // Sort by confidence descending
182
+ return intents.sort((a, b) => b.confidence - a.confidence);
183
+ }
184
+
185
+ /**
186
+ * Get primary intent for logging/display purposes.
187
+ */
188
+ function getPrimaryIntent(intents: ClassifiedIntent[]): QueryIntent {
189
+ return intents[0]?.intent ?? 'how-to';
170
190
  }
171
191
 
172
- interface RRFConfig {
173
- k: number;
174
- vectorWeight: number;
175
- ftsWeight: number;
192
+ /**
193
+ * RRF presets for different content types.
194
+ * Web/docs content uses higher k to reduce noise from repetitive structure.
195
+ */
196
+ const RRF_PRESETS = {
197
+ code: { k: 20, vectorWeight: 0.6, ftsWeight: 0.4 },
198
+ web: { k: 30, vectorWeight: 0.55, ftsWeight: 0.45 },
199
+ } as const;
200
+
201
+ /**
202
+ * Detect if results are primarily web content (have urls vs file paths).
203
+ */
204
+ function detectContentType(results: SearchResult[]): 'web' | 'code' {
205
+ const webCount = results.filter(r => 'url' in r.metadata).length;
206
+ return webCount > results.length / 2 ? 'web' : 'code';
176
207
  }
177
208
 
178
209
  export class SearchService {
179
210
  private readonly lanceStore: LanceStore;
180
211
  private readonly embeddingEngine: EmbeddingEngine;
181
- private readonly rrfConfig: RRFConfig;
182
212
  private readonly codeUnitService: CodeUnitService;
183
213
  private readonly codeGraphService: CodeGraphService | undefined;
184
214
  private readonly graphCache: Map<string, CodeGraph | null>;
@@ -186,13 +216,10 @@ export class SearchService {
186
216
  constructor(
187
217
  lanceStore: LanceStore,
188
218
  embeddingEngine: EmbeddingEngine,
189
- // Lower k value (20 vs 60) produces more differentiated scores for top results
190
- rrfConfig: RRFConfig = { k: 20, vectorWeight: 0.6, ftsWeight: 0.4 },
191
219
  codeGraphService?: CodeGraphService
192
220
  ) {
193
221
  this.lanceStore = lanceStore;
194
222
  this.embeddingEngine = embeddingEngine;
195
- this.rrfConfig = rrfConfig;
196
223
  this.codeUnitService = new CodeUnitService();
197
224
  this.codeGraphService = codeGraphService;
198
225
  this.graphCache = new Map();
@@ -220,6 +247,18 @@ export class SearchService {
220
247
  const limit = query.limit ?? 10;
221
248
  const stores = query.stores ?? [];
222
249
  const detail = query.detail ?? 'minimal';
250
+ const intents = classifyQueryIntents(query.query);
251
+ const primaryIntent = getPrimaryIntent(intents);
252
+
253
+ logger.debug({
254
+ query: query.query,
255
+ mode,
256
+ limit,
257
+ stores,
258
+ detail,
259
+ intent: primaryIntent,
260
+ intents,
261
+ }, 'Search query received');
223
262
 
224
263
  let allResults: SearchResult[] = [];
225
264
 
@@ -254,13 +293,24 @@ export class SearchService {
254
293
  return this.addProgressiveContext(r, query.query, detail, graph);
255
294
  });
256
295
 
296
+ const timeMs = Date.now() - startTime;
297
+
298
+ logger.info({
299
+ query: query.query,
300
+ mode,
301
+ resultCount: enhancedResults.length,
302
+ dedupedFrom: allResults.length,
303
+ intents: intents.map(i => `${i.intent}(${i.confidence.toFixed(2)})`),
304
+ timeMs,
305
+ }, 'Search complete');
306
+
257
307
  return {
258
308
  query: query.query,
259
309
  mode,
260
310
  stores,
261
311
  results: enhancedResults,
262
312
  totalResults: enhancedResults.length,
263
- timeMs: Date.now() - startTime,
313
+ timeMs,
264
314
  };
265
315
  }
266
316
 
@@ -273,20 +323,22 @@ export class SearchService {
273
323
  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
274
324
 
275
325
  for (const result of results) {
276
- // Use file path as the source key, fallback to document ID
326
+ // Use file path as the source key (or url for web content, or id as last resort)
277
327
  const sourceKey = result.metadata.path ?? result.metadata.url ?? result.id;
278
328
 
279
329
  const existing = bySource.get(sourceKey);
280
330
  if (!existing) {
281
331
  bySource.set(sourceKey, result);
282
332
  } else {
283
- // Compare: prefer chunk with more query terms in content
333
+ // Score-weighted relevance: accounts for fileType/framework boosts
284
334
  const existingTermCount = this.countQueryTerms(existing.content, queryTerms);
285
335
  const newTermCount = this.countQueryTerms(result.content, queryTerms);
286
336
 
287
- // Prefer chunk with more query terms, or higher score if same
288
- if (newTermCount > existingTermCount ||
289
- (newTermCount === existingTermCount && result.score > existing.score)) {
337
+ // Weight term count by score to account for ranking boosts
338
+ const existingRelevance = existingTermCount * existing.score;
339
+ const newRelevance = newTermCount * result.score;
340
+
341
+ if (newRelevance > existingRelevance) {
290
342
  bySource.set(sourceKey, result);
291
343
  }
292
344
  }
@@ -352,8 +404,8 @@ export class SearchService {
352
404
  limit: number,
353
405
  threshold?: number
354
406
  ): Promise<SearchResult[]> {
355
- // Phase 1: Classify query intent for context-aware ranking
356
- const intent = classifyQueryIntent(query);
407
+ // Classify query intents for context-aware ranking (supports multiple intents)
408
+ const intents = classifyQueryIntents(query);
357
409
 
358
410
  // Get both result sets
359
411
  const [vectorResults, ftsResults] = await Promise.all([
@@ -390,9 +442,14 @@ export class SearchService {
390
442
  ftsRRF: number;
391
443
  fileTypeBoost: number;
392
444
  frameworkBoost: number;
445
+ urlKeywordBoost: number;
446
+ pathKeywordBoost: number;
393
447
  };
394
448
  }> = [];
395
- const { k, vectorWeight, ftsWeight } = this.rrfConfig;
449
+
450
+ // Select RRF config based on content type (web vs code)
451
+ const contentType = detectContentType([...allDocs.values()]);
452
+ const { k, vectorWeight, ftsWeight } = RRF_PRESETS[contentType];
396
453
 
397
454
  for (const [id, result] of allDocs) {
398
455
  const vectorRank = vectorRanks.get(id) ?? Infinity;
@@ -401,16 +458,22 @@ export class SearchService {
401
458
  const vectorRRF = vectorRank !== Infinity ? vectorWeight / (k + vectorRank) : 0;
402
459
  const ftsRRF = ftsRank !== Infinity ? ftsWeight / (k + ftsRank) : 0;
403
460
 
404
- // Apply file-type boost (base + intent-adjusted)
461
+ // Apply file-type boost (base + multi-intent-adjusted)
405
462
  const fileTypeBoost = this.getFileTypeBoost(
406
463
  // eslint-disable-next-line @typescript-eslint/consistent-type-assertions
407
464
  result.metadata['fileType'] as string | undefined,
408
- intent
465
+ intents
409
466
  );
410
467
 
411
468
  // Apply framework context boost
412
469
  const frameworkBoost = this.getFrameworkContextBoost(query, result);
413
470
 
471
+ // Apply URL keyword boost (helps "troubleshooting" find /troubleshooting pages)
472
+ const urlKeywordBoost = this.getUrlKeywordBoost(query, result);
473
+
474
+ // Apply path keyword boost (helps "dispatcher" find async_dispatcher.py)
475
+ const pathKeywordBoost = this.getPathKeywordBoost(query, result);
476
+
414
477
  const metadata: {
415
478
  vectorRank?: number;
416
479
  ftsRank?: number;
@@ -418,11 +481,15 @@ export class SearchService {
418
481
  ftsRRF: number;
419
482
  fileTypeBoost: number;
420
483
  frameworkBoost: number;
484
+ urlKeywordBoost: number;
485
+ pathKeywordBoost: number;
421
486
  } = {
422
487
  vectorRRF,
423
488
  ftsRRF,
424
489
  fileTypeBoost,
425
490
  frameworkBoost,
491
+ urlKeywordBoost,
492
+ pathKeywordBoost,
426
493
  };
427
494
 
428
495
  if (vectorRank !== Infinity) {
@@ -434,7 +501,7 @@ export class SearchService {
434
501
 
435
502
  rrfScores.push({
436
503
  id,
437
- score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost,
504
+ score: (vectorRRF + ftsRRF) * fileTypeBoost * frameworkBoost * urlKeywordBoost * pathKeywordBoost,
438
505
  result,
439
506
  metadata,
440
507
  });
@@ -490,7 +557,7 @@ export class SearchService {
490
557
  * Phase 4: Strengthened boosts for better documentation ranking.
491
558
  * Phase 1: Intent-based adjustments for context-aware ranking.
492
559
  */
493
- private getFileTypeBoost(fileType: string | undefined, intent: QueryIntent): number {
560
+ private getFileTypeBoost(fileType: string | undefined, intents: ClassifiedIntent[]): number {
494
561
  // Base file-type boosts
495
562
  let baseBoost: number;
496
563
  switch (fileType) {
@@ -519,11 +586,96 @@ export class SearchService {
519
586
  baseBoost = 1.0;
520
587
  }
521
588
 
522
- // Apply intent-based multiplier
523
- const intentBoosts = INTENT_FILE_BOOSTS[intent];
524
- const intentMultiplier = intentBoosts[fileType ?? 'other'] ?? 1.0;
589
+ // Blend intent-based multipliers weighted by confidence
590
+ let weightedMultiplier = 0;
591
+ let totalConfidence = 0;
592
+
593
+ for (const { intent, confidence } of intents) {
594
+ const intentBoosts = INTENT_FILE_BOOSTS[intent];
595
+ const multiplier = intentBoosts[fileType ?? 'other'] ?? 1.0;
596
+ weightedMultiplier += multiplier * confidence;
597
+ totalConfidence += confidence;
598
+ }
599
+
600
+ const blendedMultiplier = totalConfidence > 0
601
+ ? weightedMultiplier / totalConfidence
602
+ : 1.0;
603
+
604
+ return baseBoost * blendedMultiplier;
605
+ }
606
+
607
+ /**
608
+ * Get a score multiplier based on URL keyword matching.
609
+ * Boosts results where URL path contains significant query keywords.
610
+ * This helps queries like "troubleshooting" rank /troubleshooting pages first.
611
+ */
612
+ private getUrlKeywordBoost(query: string, result: SearchResult): number {
613
+ const url = result.metadata.url;
614
+ if (url === undefined || url === '') return 1.0;
615
+
616
+ // Extract path segments from URL and normalize
617
+ const urlPath = url.toLowerCase().replace(/[^a-z0-9]+/g, ' ');
618
+
619
+ // Common stop words to filter from queries
620
+ const stopWords = new Set([
621
+ 'how', 'to', 'the', 'a', 'an', 'is', 'are', 'what', 'why', 'when',
622
+ 'where', 'can', 'do', 'does', 'i', 'my', 'your', 'it', 'in', 'on',
623
+ 'for', 'with', 'this', 'that', 'get', 'use', 'using'
624
+ ]);
625
+
626
+ // Extract meaningful query terms
627
+ const queryTerms = query.toLowerCase()
628
+ .split(/\s+/)
629
+ .filter(t => t.length > 2 && !stopWords.has(t));
630
+
631
+ if (queryTerms.length === 0) return 1.0;
632
+
633
+ // Count matching terms in URL path
634
+ const matchingTerms = queryTerms.filter(term => urlPath.includes(term));
635
+
636
+ if (matchingTerms.length === 0) return 1.0;
637
+
638
+ // Boost based on proportion of matching terms
639
+ // Single match: ~1.5, all terms match: ~2.0
640
+ const matchRatio = matchingTerms.length / queryTerms.length;
641
+ return 1.0 + (1.0 * matchRatio);
642
+ }
643
+
644
+ /**
645
+ * Get a score multiplier based on file path keyword matching.
646
+ * Boosts results where file path contains significant query keywords.
647
+ * This helps queries like "dispatcher" rank async_dispatcher.py higher.
648
+ */
649
+ private getPathKeywordBoost(query: string, result: SearchResult): number {
650
+ const path = result.metadata.path;
651
+ if (path === undefined || path === '') return 1.0;
652
+
653
+ // Extract path segments and normalize (split on slashes, dots, underscores, etc.)
654
+ const pathSegments = path.toLowerCase().replace(/[^a-z0-9]+/g, ' ');
655
+
656
+ // Common stop words to filter from queries
657
+ const stopWords = new Set([
658
+ 'how', 'to', 'the', 'a', 'an', 'is', 'are', 'what', 'why', 'when',
659
+ 'where', 'can', 'do', 'does', 'i', 'my', 'your', 'it', 'in', 'on',
660
+ 'for', 'with', 'this', 'that', 'get', 'use', 'using'
661
+ ]);
662
+
663
+ // Extract meaningful query terms
664
+ const queryTerms = query.toLowerCase()
665
+ .split(/\s+/)
666
+ .filter(t => t.length > 2 && !stopWords.has(t));
667
+
668
+ if (queryTerms.length === 0) return 1.0;
669
+
670
+ // Count matching terms in file path
671
+ const matchingTerms = queryTerms.filter(term => pathSegments.includes(term));
672
+
673
+ if (matchingTerms.length === 0) return 1.0;
525
674
 
526
- return baseBoost * intentMultiplier;
675
+ // Boost based on proportion of matching terms
676
+ // Single match: ~1.5, all terms match: ~2.0
677
+ const matchRatio = matchingTerms.length / queryTerms.length;
678
+ return 1.0 + (1.0 * matchRatio);
527
679
  }
528
680
 
529
681
  /**
@@ -54,6 +54,11 @@ export function extractSnippet(
54
54
 
55
55
  /**
56
56
  * Find the position in content where the most query terms cluster together.
57
+ * Uses multi-factor scoring:
58
+ * - Query term density (base score)
59
+ * - Sentence completeness bonus
60
+ * - Code example presence bonus
61
+ * - Section header proximity bonus
57
62
  */
58
63
  function findBestMatchPosition(content: string, queryTerms: string[]): number {
59
64
  const lowerContent = content.toLowerCase();
@@ -73,7 +78,6 @@ function findBestMatchPosition(content: string, queryTerms: string[]): number {
73
78
  return -1;
74
79
  }
75
80
 
76
- // Score each position by how many other terms are nearby (within 200 chars)
77
81
  const PROXIMITY_WINDOW = 200;
78
82
  const firstTerm = termPositions[0];
79
83
  if (firstTerm === undefined) {
@@ -83,15 +87,36 @@ function findBestMatchPosition(content: string, queryTerms: string[]): number {
83
87
  let bestScore = 0;
84
88
 
85
89
  for (const { position } of termPositions) {
86
- // Count unique terms within proximity window
90
+ // Base score: count unique terms within proximity window
87
91
  const nearbyTerms = new Set<string>();
88
92
  for (const { term, position: otherPos } of termPositions) {
89
93
  if (Math.abs(position - otherPos) <= PROXIMITY_WINDOW) {
90
94
  nearbyTerms.add(term);
91
95
  }
92
96
  }
97
+ let score = nearbyTerms.size * 10; // Base: 10 points per unique term
98
+
99
+ // Extract window around position for bonus scoring
100
+ const windowStart = Math.max(0, position - PROXIMITY_WINDOW / 2);
101
+ const windowEnd = Math.min(content.length, position + PROXIMITY_WINDOW / 2);
102
+ const window = content.slice(windowStart, windowEnd);
103
+
104
+ // Bonus: Sentence completeness (contains sentence-ending punctuation)
105
+ if (/[.!?]/.test(window)) {
106
+ score += 5;
107
+ }
108
+
109
+ // Bonus: Code example presence (backticks, brackets, common code patterns)
110
+ if (/[`{}()[\]]|=>|function|const |let |var /.test(window)) {
111
+ score += 3;
112
+ }
113
+
114
+ // Bonus: Near markdown section header
115
+ const headerMatch = content.slice(Math.max(0, position - 100), position).match(/^#{1,3}\s+.+$/m);
116
+ if (headerMatch) {
117
+ score += 4;
118
+ }
93
119
 
94
- const score = nearbyTerms.size;
95
120
  if (score > bestScore) {
96
121
  bestScore = score;
97
122
  bestPosition = position;
@@ -0,0 +1,45 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { estimateTokens, formatTokenCount } from './token.service.js';
3
+
4
+ describe('token.service', () => {
5
+ describe('estimateTokens', () => {
6
+ it('returns 0 for empty string', () => {
7
+ expect(estimateTokens('')).toBe(0);
8
+ });
9
+
10
+ it('estimates tokens for short text', () => {
11
+ // "hello" = 5 chars, 5/3.5 = 1.43, ceil = 2
12
+ expect(estimateTokens('hello')).toBe(2);
13
+ });
14
+
15
+ it('estimates tokens for longer text', () => {
16
+ // 35 chars / 3.5 = 10 tokens
17
+ const text = 'a'.repeat(35);
18
+ expect(estimateTokens(text)).toBe(10);
19
+ });
20
+
21
+ it('rounds up token count', () => {
22
+ // 7 chars / 3.5 = 2 tokens exactly
23
+ expect(estimateTokens('abcdefg')).toBe(2);
24
+ // 8 chars / 3.5 = 2.29, ceil = 3
25
+ expect(estimateTokens('abcdefgh')).toBe(3);
26
+ });
27
+ });
28
+
29
+ describe('formatTokenCount', () => {
30
+ it('formats small counts without suffix', () => {
31
+ expect(formatTokenCount(100)).toBe('~100');
32
+ expect(formatTokenCount(999)).toBe('~999');
33
+ });
34
+
35
+ it('formats counts >= 1000 with k suffix', () => {
36
+ expect(formatTokenCount(1000)).toBe('~1.0k');
37
+ expect(formatTokenCount(1500)).toBe('~1.5k');
38
+ expect(formatTokenCount(10000)).toBe('~10.0k');
39
+ });
40
+
41
+ it('formats zero', () => {
42
+ expect(formatTokenCount(0)).toBe('~0');
43
+ });
44
+ });
45
+ });
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Token estimation service using Anthropic's recommended heuristic.
3
+ * For Claude 3+ models, Anthropic recommends ~3.5 characters per token
4
+ * for English text. This varies by language.
5
+ *
6
+ * Note: The official @anthropic-ai/tokenizer package only works for
7
+ * pre-Claude 3 models. For accurate counts on Claude 3+, use the
8
+ * Token Count API. This heuristic is suitable for display purposes.
9
+ */
10
+
11
+ const CHARS_PER_TOKEN = 3.5;
12
+
13
+ /**
14
+ * Estimate token count for a string using character-based heuristic.
15
+ * @param text - The text to estimate tokens for
16
+ * @returns Estimated token count (rounded up)
17
+ */
18
+ export function estimateTokens(text: string): number {
19
+ if (!text) return 0;
20
+ return Math.ceil(text.length / CHARS_PER_TOKEN);
21
+ }
22
+
23
+ /**
24
+ * Format token count for display with appropriate suffix.
25
+ * @param tokens - Token count
26
+ * @returns Formatted string like "~1.2k" or "~847"
27
+ */
28
+ export function formatTokenCount(tokens: number): string {
29
+ if (tokens >= 1000) {
30
+ return `~${(tokens / 1000).toFixed(1)}k`;
31
+ }
32
+ return `~${String(tokens)}`;
33
+ }
@@ -28,6 +28,16 @@ describe('Result type', () => {
28
28
  const result = err(new Error('failed'));
29
29
  expect(() => unwrap(result)).toThrow('failed');
30
30
  });
31
+
32
+ it('throws wrapped error for non-Error error value', () => {
33
+ const result = err('string error message');
34
+ expect(() => unwrap(result)).toThrow('string error message');
35
+ });
36
+
37
+ it('converts non-string error to string', () => {
38
+ const result = err(404);
39
+ expect(() => unwrap(result)).toThrow('404');
40
+ });
31
41
  });
32
42
 
33
43
  describe('unwrapOr', () => {
@@ -231,10 +231,7 @@ describe('CLI Consistency', () => {
231
231
  expect(result.stderr).toMatch(/^Error: Store not found: nonexistent/m);
232
232
  });
233
233
 
234
- it('uses consistent "Error:" prefix for crawl store not found', () => {
235
- const result = runCli('crawl https://example.com nonexistent');
236
- expect(result.stderr).toMatch(/^Error: /m);
237
- });
234
+ // Note: crawl auto-creates stores when not found, so no error test needed
238
235
  });
239
236
 
240
237
  describe('store delete Confirmation', () => {