raggrep 0.13.2 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -8,12 +8,13 @@ RAGgrep indexes your code and lets you search it using natural language. Everyth
8
8
 
9
9
  - **Zero-config search** — Just run `raggrep query` and it works. Index is created and updated automatically.
10
10
  - **Multi-language support** — Deep understanding of TypeScript, JavaScript, Python, Go, and Rust with AST-aware parsing.
11
- - **Vocabulary-based search** — Search `user` to find `getUserById`, `fetchUserData`, `UserService`, etc. Understands code naming conventions.
11
+ - **Vocabulary-based search** — Search `user` to find `getUserById`, `fetchUserData`, `UserService`, etc. Natural language queries like "where is user session validated" find `validateUserSession()`.
12
12
  - **Local-first** — All indexing and search happens on your machine. No cloud dependencies.
13
13
  - **Incremental** — Only re-indexes files that have changed. Instant search when nothing changed.
14
14
  - **Watch mode** — Keep the index fresh in real-time as you code.
15
15
  - **Hybrid search** — Combines semantic similarity with keyword matching for best results.
16
16
  - **Literal boosting** — Exact identifier matches get priority. Use backticks for precise matching: `` `AuthService` ``.
17
+ - **Phrase matching** — Exact phrases in documentation are found even when semantic similarity is low.
17
18
  - **Semantic expansion** — Domain-specific synonyms improve recall (function ↔ method, auth ↔ authentication).
18
19
 
19
20
  ## Installation
package/dist/cli/main.js CHANGED
@@ -3598,6 +3598,30 @@ function extractVocabulary(literal) {
3598
3598
  const filtered = words.filter((w) => w.length > 1);
3599
3599
  return [...new Set(filtered)];
3600
3600
  }
3601
/**
 * Extract vocabulary words from a free-form search query.
 *
 * Tokens that look like code identifiers (camelCase, snake_case, or
 * kebab-case) are decomposed into their component words via
 * extractVocabulary (e.g. "getUserById" -> get/user/by/id); plain
 * natural-language tokens are kept whole. Stop words and one-character
 * tokens are dropped and the result is deduplicated.
 *
 * Fix: the previous version lowercased the query BEFORE tokenizing and
 * stripped "-" during tokenization, so the /[A-Z]/ and includes("-")
 * identifier checks could never fire; camelCase/kebab-case query terms
 * were never split. We now tokenize the raw query (keeping "-") and only
 * lowercase for stop-word checks and plain-word output.
 *
 * @param {string} query - Raw user query (may be empty or whitespace).
 * @returns {string[]} Deduplicated vocabulary words.
 */
function extractQueryVocabulary(query) {
  if (!query || query.trim() === "") {
    return [];
  }
  const vocabularySet = new Set();
  // Keep "-" and "_" through tokenization so kebab/snake case survives;
  // every other non-word character becomes a separator. Do NOT lowercase
  // yet — case is the camelCase signal.
  const tokens = query
    .replace(/[^\w\s-]/g, " ")
    .split(/\s+/)
    .filter((t) => t.length > 1);
  for (const token of tokens) {
    const lower = token.toLowerCase();
    if (QUERY_STOP_WORDS.has(lower)) {
      continue;
    }
    const looksLikeIdentifier = /[A-Z]/.test(token) || token.includes("_") || token.includes("-");
    if (looksLikeIdentifier) {
      // Split the identifier into component words; extractVocabulary is the
      // same splitter used at index time, so query and index vocab agree.
      for (const word of extractVocabulary(token)) {
        if (!QUERY_STOP_WORDS.has(word)) {
          vocabularySet.add(word);
        }
      }
    } else {
      vocabularySet.add(lower);
    }
  }
  return Array.from(vocabularySet);
}
3601
3625
  function extractLiterals(chunk) {
3602
3626
  const literals = [];
3603
3627
  if (chunk.name) {
@@ -3612,7 +3636,7 @@ function extractLiterals(chunk) {
3612
3636
  }
3613
3637
  return literals;
3614
3638
  }
3615
- var COMMON_ABBREVIATIONS, STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
3639
+ var COMMON_ABBREVIATIONS, STOP_WORDS, QUERY_STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
3616
3640
  var init_literalExtractor = __esm(() => {
3617
3641
  COMMON_ABBREVIATIONS = new Set([
3618
3642
  "id",
@@ -3661,6 +3685,37 @@ var init_literalExtractor = __esm(() => {
3661
3685
  "as",
3662
3686
  "if"
3663
3687
  ]);
3688
// Stop words for natural-language QUERIES: a superset of STOP_WORDS (which
// targets identifier fragments). Adds interrogatives ("what", "where", ...),
// search command verbs ("find", "show", ...), common function words, and
// generic code nouns ("code", "function", ...) that carry no search signal.
// Any overlap with STOP_WORDS is harmless — Set construction dedupes.
QUERY_STOP_WORDS = new Set([
  ...STOP_WORDS,
  "what",
  "where",
  "when",
  "how",
  "why",
  "which",
  "who",
  "find",
  "show",
  "get",
  "list",
  "search",
  "and",
  "but",
  "with",
  "from",
  "that",
  "this",
  "these",
  "those",
  "it",
  "its",
  "code",
  "file",
  "function",
  "class",
  "method",
  "variable"
]);
3664
3719
  CHUNK_TYPE_TO_LITERAL_TYPE = {
3665
3720
  class: "className",
3666
3721
  function: "functionName",
@@ -4374,6 +4429,113 @@ function extractJsonKeywords(obj) {
4374
4429
  // src/domain/services/configValidator.ts
4375
4430
  var init_configValidator = () => {};
4376
4431
 
4432
+ // src/domain/services/phraseMatch.ts
4433
+ function tokenizeForMatching(text, filterStopWords = true) {
4434
+ if (!text || text.trim() === "") {
4435
+ return [];
4436
+ }
4437
+ const tokens = text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
4438
+ if (filterStopWords) {
4439
+ return tokens.filter((t) => !PHRASE_STOP_WORDS.has(t));
4440
+ }
4441
+ return tokens;
4442
+ }
4443
/**
 * Score how well a chunk's content matches the query as a phrase.
 *
 * Computes an exact-substring flag, the coverage (fraction of non-stop
 * query tokens found anywhere in the content), an additive score boost
 * derived from those, and a significance flag used to force-include
 * strong matches in results.
 *
 * @param {string} content - Chunk text to search within.
 * @param {string} query - User query phrase.
 * @returns {{exactMatch: boolean, coverage: number, matchedTokenCount: number,
 *           totalTokenCount: number, boost: number, isSignificant: boolean}}
 */
function calculatePhraseMatch(content, query) {
  const tooShort = !content || !query || query.trim().length < PHRASE_MATCH_CONSTANTS.MIN_QUERY_LENGTH;
  if (tooShort) {
    // Degenerate input: report a zeroed, non-significant match.
    return {
      exactMatch: false,
      coverage: 0,
      matchedTokenCount: 0,
      totalTokenCount: 0,
      boost: 0,
      isSignificant: false
    };
  }
  const haystack = content.toLowerCase();
  const needle = query.toLowerCase().trim();
  const exactMatch = haystack.includes(needle);
  // Token coverage: substring containment per token (so "user" also hits
  // "getUserById" in lowered form), stop words excluded.
  const queryTokens = tokenizeForMatching(query, true);
  let matchedCount = 0;
  for (const token of queryTokens) {
    if (haystack.includes(token)) {
      matchedCount += 1;
    }
  }
  const coverage = queryTokens.length === 0 ? 0 : matchedCount / queryTokens.length;
  const {
    EXACT_PHRASE_BOOST,
    HIGH_COVERAGE_BOOST,
    MEDIUM_COVERAGE_BOOST,
    HIGH_COVERAGE_THRESHOLD,
    MEDIUM_COVERAGE_THRESHOLD
  } = PHRASE_MATCH_CONSTANTS;
  // Boost ladder: exact phrase wins outright; otherwise graded by coverage.
  let boost = 0;
  if (exactMatch) {
    boost = EXACT_PHRASE_BOOST;
  } else if (coverage >= HIGH_COVERAGE_THRESHOLD) {
    boost = HIGH_COVERAGE_BOOST;
  } else if (coverage >= MEDIUM_COVERAGE_THRESHOLD) {
    boost = MEDIUM_COVERAGE_BOOST;
  }
  return {
    exactMatch,
    coverage,
    matchedTokenCount: matchedCount,
    totalTokenCount: queryTokens.length,
    boost,
    isSignificant: exactMatch || coverage >= HIGH_COVERAGE_THRESHOLD
  };
}
4478
// Module-level state for phraseMatch.ts, populated lazily by the bundler's
// __esm wrapper on first import.
var PHRASE_MATCH_CONSTANTS, PHRASE_STOP_WORDS;
var init_phraseMatch = __esm(() => {
  // Tuning knobs for calculatePhraseMatch:
  //   *_BOOST      — additive score bonuses (exact phrase > high coverage
  //                  > medium coverage).
  //   *_THRESHOLD  — coverage cutoffs (fraction of non-stop query tokens
  //                  found in the content) selecting those bonuses.
  //   MIN_QUERY_LENGTH — trimmed queries shorter than this are not
  //                  phrase-matched at all.
  PHRASE_MATCH_CONSTANTS = {
    EXACT_PHRASE_BOOST: 0.5,
    HIGH_COVERAGE_BOOST: 0.2,
    MEDIUM_COVERAGE_BOOST: 0.1,
    HIGH_COVERAGE_THRESHOLD: 0.8,
    MEDIUM_COVERAGE_THRESHOLD: 0.6,
    MIN_QUERY_LENGTH: 3
  };
  // English function words and question words excluded from phrase-token
  // coverage (articles, prepositions, conjunctions, interrogatives,
  // auxiliaries, pronouns, demonstratives). Distinct from the identifier-
  // oriented STOP_WORDS / query-oriented QUERY_STOP_WORDS sets.
  PHRASE_STOP_WORDS = new Set([
    "a",
    "an",
    "the",
    "in",
    "on",
    "at",
    "to",
    "for",
    "of",
    "with",
    "by",
    "from",
    "as",
    "and",
    "or",
    "but",
    "what",
    "where",
    "when",
    "how",
    "why",
    "which",
    "who",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "do",
    "does",
    "did",
    "i",
    "you",
    "he",
    "she",
    "it",
    "we",
    "they",
    "this",
    "that",
    "these",
    "those"
  ]);
});
4538
+
4377
4539
  // src/domain/services/index.ts
4378
4540
  var init_services = __esm(() => {
4379
4541
  init_keywords();
@@ -4384,6 +4546,7 @@ var init_services = __esm(() => {
4384
4546
  init_lexicon2();
4385
4547
  init_introspection();
4386
4548
  init_configValidator();
4549
+ init_phraseMatch();
4387
4550
  });
4388
4551
 
4389
4552
  // src/modules/language/typescript/parseCode.ts
@@ -5202,9 +5365,21 @@ class TypeScriptModule {
5202
5365
  const symbolicIndex = new SymbolicIndex(indexDir, this.id);
5203
5366
  const literalIndex = new LiteralIndex(indexDir, this.id);
5204
5367
  let literalMatchMap = new Map;
5368
+ let vocabularyScoreMap = new Map;
5205
5369
  try {
5206
5370
  await literalIndex.initialize();
5207
5371
  literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
5372
+ const queryVocabulary = extractQueryVocabulary(query);
5373
+ if (queryVocabulary.length > 0) {
5374
+ const vocabMatches = literalIndex.findByVocabularyWords(queryVocabulary);
5375
+ for (const { entry, matchedWords } of vocabMatches) {
5376
+ const vocabScore = matchedWords.length / queryVocabulary.length;
5377
+ const existingScore = vocabularyScoreMap.get(entry.chunkId) || 0;
5378
+ if (vocabScore > existingScore) {
5379
+ vocabularyScoreMap.set(entry.chunkId, vocabScore);
5380
+ }
5381
+ }
5382
+ }
5208
5383
  } catch {}
5209
5384
  let allFiles;
5210
5385
  try {
@@ -5284,18 +5459,20 @@ class TypeScriptModule {
5284
5459
  for (const { filepath, chunk, embedding } of allChunksData) {
5285
5460
  const semanticScore = cosineSimilarity(queryEmbedding, embedding);
5286
5461
  const bm25Score = bm25Scores.get(chunk.id) || 0;
5462
+ const vocabScore = vocabularyScoreMap.get(chunk.id) || 0;
5287
5463
  const pathBoost = pathBoosts.get(filepath) || 0;
5464
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
5288
5465
  const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
5289
5466
  const chunkTypeBoost = calculateChunkTypeBoost(chunk);
5290
5467
  const exportBoost = calculateExportBoost(chunk);
5291
- const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
5292
- const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score;
5468
+ const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
5469
+ const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore;
5293
5470
  const literalMatches = literalMatchMap.get(chunk.id) || [];
5294
5471
  const literalContribution = calculateLiteralContribution(literalMatches, true);
5295
5472
  const boostedScore = applyLiteralBoost(baseScore, literalMatches, true);
5296
5473
  const finalScore = boostedScore + additiveBoost;
5297
5474
  processedChunkIds.add(chunk.id);
5298
- if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0) {
5475
+ if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0 || vocabScore > VOCAB_THRESHOLD || phraseMatch.isSignificant) {
5299
5476
  results.push({
5300
5477
  filepath,
5301
5478
  chunk,
@@ -5304,6 +5481,9 @@ class TypeScriptModule {
5304
5481
  context: {
5305
5482
  semanticScore,
5306
5483
  bm25Score,
5484
+ vocabScore,
5485
+ phraseMatch: phraseMatch.exactMatch,
5486
+ phraseCoverage: phraseMatch.coverage,
5307
5487
  pathBoost,
5308
5488
  fileTypeBoost,
5309
5489
  chunkTypeBoost,
@@ -5353,13 +5533,15 @@ class TypeScriptModule {
5353
5533
  semanticScore = cosineSimilarity(queryEmbedding, embedding);
5354
5534
  }
5355
5535
  const bm25Score = bm25Scores.get(chunkId) || 0;
5536
+ const vocabScore = vocabularyScoreMap.get(chunkId) || 0;
5537
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
5356
5538
  const pathBoost = pathBoosts.get(filepath) || 0;
5357
5539
  const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
5358
5540
  const chunkTypeBoost = calculateChunkTypeBoost(chunk);
5359
5541
  const exportBoost = calculateExportBoost(chunk);
5360
- const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
5542
+ const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
5361
5543
  const literalContribution = calculateLiteralContribution(chunkLiteralMatches, false);
5362
- const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
5544
+ const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
5363
5545
  const boostedScore = applyLiteralBoost(baseScore, chunkLiteralMatches, semanticScore > 0);
5364
5546
  const finalScore = boostedScore + additiveBoost;
5365
5547
  processedChunkIds.add(chunkId);
@@ -5371,6 +5553,9 @@ class TypeScriptModule {
5371
5553
  context: {
5372
5554
  semanticScore,
5373
5555
  bm25Score,
5556
+ vocabScore,
5557
+ phraseMatch: phraseMatch.exactMatch,
5558
+ phraseCoverage: phraseMatch.coverage,
5374
5559
  pathBoost,
5375
5560
  fileTypeBoost,
5376
5561
  chunkTypeBoost,
@@ -5411,7 +5596,7 @@ class TypeScriptModule {
5411
5596
  return references;
5412
5597
  }
5413
5598
  }
5414
- var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.7, BM25_WEIGHT = 0.3, TYPESCRIPT_EXTENSIONS, supportsFile;
5599
+ var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15, VOCAB_THRESHOLD = 0.4, TYPESCRIPT_EXTENSIONS, supportsFile;
5415
5600
  var init_typescript = __esm(() => {
5416
5601
  init_embeddings();
5417
5602
  init_services();
@@ -11486,6 +11671,7 @@ class MarkdownModule {
11486
11671
  for (const { filepath, chunk, embedding } of allChunksData) {
11487
11672
  const semanticScore = cosineSimilarity(queryEmbedding, embedding);
11488
11673
  const bm25Score = bm25Scores.get(chunk.id) || 0;
11674
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
11489
11675
  let docBoost = 0;
11490
11676
  if (queryTerms.some((t) => [
11491
11677
  "docs",
@@ -11499,8 +11685,8 @@ class MarkdownModule {
11499
11685
  docBoost = 0.05;
11500
11686
  }
11501
11687
  const headingBoost = calculateHeadingLevelBoost(chunk);
11502
- const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost;
11503
- if (hybridScore >= minScore || bm25Score > 0.3) {
11688
+ const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost + phraseMatch.boost;
11689
+ if (hybridScore >= minScore || bm25Score > 0.3 || phraseMatch.isSignificant) {
11504
11690
  results.push({
11505
11691
  filepath,
11506
11692
  chunk,
@@ -11509,6 +11695,8 @@ class MarkdownModule {
11509
11695
  context: {
11510
11696
  semanticScore,
11511
11697
  bm25Score,
11698
+ phraseMatch: phraseMatch.exactMatch,
11699
+ phraseCoverage: phraseMatch.coverage,
11512
11700
  docBoost,
11513
11701
  headingBoost,
11514
11702
  headingLevel: chunk.metadata?.headingLevel
@@ -15048,7 +15236,7 @@ init_logger();
15048
15236
  // package.json
15049
15237
  var package_default = {
15050
15238
  name: "raggrep",
15051
- version: "0.13.2",
15239
+ version: "0.14.0",
15052
15240
  description: "Local filesystem-based RAG system for codebases - semantic search using local embeddings",
15053
15241
  type: "module",
15054
15242
  main: "./dist/index.js",
@@ -15644,4 +15832,4 @@ Run 'raggrep <command> --help' for more information.
15644
15832
  }
15645
15833
  main();
15646
15834
 
15647
- //# debugId=5CD6138213DBFFD864756E2164756E21
15835
+ //# debugId=CF359982C72DD5D264756E2164756E21