raggrep 0.13.2 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli/main.js +199 -11
- package/dist/cli/main.js.map +8 -7
- package/dist/domain/services/index.d.ts +2 -1
- package/dist/domain/services/literalExtractor.d.ts +20 -0
- package/dist/domain/services/phraseMatch.d.ts +99 -0
- package/dist/domain/services/phraseMatch.test.d.ts +4 -0
- package/dist/index.js +198 -10
- package/dist/index.js.map +8 -7
- package/dist/tests/simulation-phrase-matching.test.d.ts +14 -0
- package/dist/tests/simulation-vocabulary.test.d.ts +17 -0
- package/dist/tests/vocabulary-scoring.test.d.ts +16 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -8,12 +8,13 @@ RAGgrep indexes your code and lets you search it using natural language. Everyth
|
|
|
8
8
|
|
|
9
9
|
- **Zero-config search** — Just run `raggrep query` and it works. Index is created and updated automatically.
|
|
10
10
|
- **Multi-language support** — Deep understanding of TypeScript, JavaScript, Python, Go, and Rust with AST-aware parsing.
|
|
11
|
-
- **Vocabulary-based search** — Search `user` to find `getUserById`, `fetchUserData`, `UserService`, etc.
|
|
11
|
+
- **Vocabulary-based search** — Search `user` to find `getUserById`, `fetchUserData`, `UserService`, etc. Natural language queries like "where is user session validated" find `validateUserSession()`.
|
|
12
12
|
- **Local-first** — All indexing and search happens on your machine. No cloud dependencies.
|
|
13
13
|
- **Incremental** — Only re-indexes files that have changed. Instant search when nothing changed.
|
|
14
14
|
- **Watch mode** — Keep the index fresh in real-time as you code.
|
|
15
15
|
- **Hybrid search** — Combines semantic similarity with keyword matching for best results.
|
|
16
16
|
- **Literal boosting** — Exact identifier matches get priority. Use backticks for precise matching: `` `AuthService` ``.
|
|
17
|
+
- **Phrase matching** — Exact phrases in documentation are found even when semantic similarity is low.
|
|
17
18
|
- **Semantic expansion** — Domain-specific synonyms improve recall (function ↔ method, auth ↔ authentication).
|
|
18
19
|
|
|
19
20
|
## Installation
|
package/dist/cli/main.js
CHANGED
|
@@ -3598,6 +3598,30 @@ function extractVocabulary(literal) {
|
|
|
3598
3598
|
const filtered = words.filter((w) => w.length > 1);
|
|
3599
3599
|
return [...new Set(filtered)];
|
|
3600
3600
|
}
|
|
3601
|
+
/**
 * Extract vocabulary words from a free-text search query.
 *
 * The query is stripped of punctuation, split on whitespace, and every token
 * of length > 1 that is not in QUERY_STOP_WORDS contributes to the result:
 *   - tokens that look like identifiers (camelCase / PascalCase / snake_case)
 *     are handed to extractVocabulary() and each returned word is added;
 *   - all other tokens are added as-is, lowercased.
 *
 * @param {string} query - Raw user query.
 * @returns {string[]} Unique lowercase vocabulary words, in first-seen order.
 */
function extractQueryVocabulary(query) {
  if (!query || query.trim() === "") {
    return [];
  }
  const vocabularySet = new Set();
  // Tokenize with the ORIGINAL casing. Lowercasing first (as was done before)
  // makes the capital-letter test below impossible to satisfy, so identifiers
  // such as `getUserById` were never split into their component words.
  const rawTokens = query.replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
  for (const rawToken of rawTokens) {
    const token = rawToken.toLowerCase();
    if (QUERY_STOP_WORDS.has(token)) {
      continue;
    }
    // A capital after the first character in a mixed-case token marks
    // camelCase/PascalCase. Sentence-capitalised words ("User") and acronyms
    // ("API") are deliberately treated as plain words, as they were before.
    const hasInnerCapital = /[a-z]/.test(rawToken) && /[A-Z]/.test(rawToken.slice(1));
    // No hyphen check: the punctuation regex above already turned every "-"
    // into a token separator, so a token can never contain one.
    const looksLikeIdentifier = hasInnerCapital || rawToken.includes("_");
    if (!looksLikeIdentifier) {
      vocabularySet.add(token);
      continue;
    }
    // Only mixed-case tokens need their casing preserved for splitting;
    // snake_case tokens are passed lowercased, exactly as before.
    const vocabWords = extractVocabulary(hasInnerCapital ? rawToken : token);
    for (const word of vocabWords) {
      // Normalise defensively so the result stays lowercase regardless of how
      // extractVocabulary treats casing.
      const normalized = word.toLowerCase();
      if (!QUERY_STOP_WORDS.has(normalized)) {
        vocabularySet.add(normalized);
      }
    }
  }
  return Array.from(vocabularySet);
}
|
|
3601
3625
|
function extractLiterals(chunk) {
|
|
3602
3626
|
const literals = [];
|
|
3603
3627
|
if (chunk.name) {
|
|
@@ -3612,7 +3636,7 @@ function extractLiterals(chunk) {
|
|
|
3612
3636
|
}
|
|
3613
3637
|
return literals;
|
|
3614
3638
|
}
|
|
3615
|
-
var COMMON_ABBREVIATIONS, STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
|
|
3639
|
+
var COMMON_ABBREVIATIONS, STOP_WORDS, QUERY_STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
|
|
3616
3640
|
var init_literalExtractor = __esm(() => {
|
|
3617
3641
|
COMMON_ABBREVIATIONS = new Set([
|
|
3618
3642
|
"id",
|
|
@@ -3661,6 +3685,37 @@ var init_literalExtractor = __esm(() => {
|
|
|
3661
3685
|
"as",
|
|
3662
3686
|
"if"
|
|
3663
3687
|
]);
|
|
3688
|
+
QUERY_STOP_WORDS = new Set([
|
|
3689
|
+
...STOP_WORDS,
|
|
3690
|
+
"what",
|
|
3691
|
+
"where",
|
|
3692
|
+
"when",
|
|
3693
|
+
"how",
|
|
3694
|
+
"why",
|
|
3695
|
+
"which",
|
|
3696
|
+
"who",
|
|
3697
|
+
"find",
|
|
3698
|
+
"show",
|
|
3699
|
+
"get",
|
|
3700
|
+
"list",
|
|
3701
|
+
"search",
|
|
3702
|
+
"and",
|
|
3703
|
+
"but",
|
|
3704
|
+
"with",
|
|
3705
|
+
"from",
|
|
3706
|
+
"that",
|
|
3707
|
+
"this",
|
|
3708
|
+
"these",
|
|
3709
|
+
"those",
|
|
3710
|
+
"it",
|
|
3711
|
+
"its",
|
|
3712
|
+
"code",
|
|
3713
|
+
"file",
|
|
3714
|
+
"function",
|
|
3715
|
+
"class",
|
|
3716
|
+
"method",
|
|
3717
|
+
"variable"
|
|
3718
|
+
]);
|
|
3664
3719
|
CHUNK_TYPE_TO_LITERAL_TYPE = {
|
|
3665
3720
|
class: "className",
|
|
3666
3721
|
function: "functionName",
|
|
@@ -4374,6 +4429,113 @@ function extractJsonKeywords(obj) {
|
|
|
4374
4429
|
// src/domain/services/configValidator.ts
|
|
4375
4430
|
var init_configValidator = () => {};
|
|
4376
4431
|
|
|
4432
|
+
// src/domain/services/phraseMatch.ts
|
|
4433
|
+
/**
 * Split text into lowercase tokens for phrase matching.
 *
 * Punctuation is replaced by spaces, the text is split on whitespace, and
 * tokens of length <= 1 are discarded. Letters, combining marks and digits
 * from any script are kept (the previous ASCII-only `\w` class truncated
 * words such as "café" to "caf" and erased non-Latin text entirely).
 *
 * @param {string} text - Text to tokenize.
 * @param {boolean} [filterStopWords=true] - When true, tokens present in
 *   PHRASE_STOP_WORDS are removed.
 * @returns {string[]} Tokens in order of appearance (duplicates preserved).
 */
function tokenizeForMatching(text, filterStopWords = true) {
  if (!text || text.trim() === "") {
    return [];
  }
  const tokens = text
    .toLowerCase()
    // \p{M} keeps combining marks attached to their base letter.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, " ")
    .split(/\s+/)
    .filter((t) => t.length > 1);
  if (filterStopWords) {
    return tokens.filter((t) => !PHRASE_STOP_WORDS.has(t));
  }
  return tokens;
}
|
|
4443
|
+
/**
 * Score how well a query phrase matches a chunk of content.
 *
 * Two signals are computed against the lowercased content:
 *   - exactMatch: the whole trimmed, lowercased query occurs as a substring;
 *   - coverage: the fraction of the query's non-stop-word tokens that occur
 *     as substrings.
 * The boost is the exact-phrase boost when exactMatch holds, otherwise the
 * high- or medium-coverage boost when coverage reaches the matching
 * threshold, otherwise 0.
 *
 * @param {string} content - Text being searched.
 * @param {string} query - Raw user query.
 * @returns {{exactMatch: boolean, coverage: number, matchedTokenCount: number,
 *   totalTokenCount: number, boost: number, isSignificant: boolean}}
 *   An all-zero/false result when content or query is empty, or when the
 *   trimmed query is shorter than MIN_QUERY_LENGTH.
 */
function calculatePhraseMatch(content, query) {
  const noMatch = () => ({
    exactMatch: false,
    coverage: 0,
    matchedTokenCount: 0,
    totalTokenCount: 0,
    boost: 0,
    isSignificant: false
  });
  if (!content || !query) {
    return noMatch();
  }
  const limits = PHRASE_MATCH_CONSTANTS;
  if (query.trim().length < limits.MIN_QUERY_LENGTH) {
    return noMatch();
  }

  const haystack = content.toLowerCase();
  const needle = query.toLowerCase().trim();
  const exactMatch = haystack.includes(needle);

  // Count how many meaningful query tokens appear anywhere in the content.
  const terms = tokenizeForMatching(query, true);
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) {
      hits += 1;
    }
  }
  const coverage = terms.length > 0 ? hits / terms.length : 0;

  const highCoverage = coverage >= limits.HIGH_COVERAGE_THRESHOLD;
  // An exact phrase hit takes precedence over token coverage.
  const boost = exactMatch
    ? limits.EXACT_PHRASE_BOOST
    : highCoverage
      ? limits.HIGH_COVERAGE_BOOST
      : coverage >= limits.MEDIUM_COVERAGE_THRESHOLD
        ? limits.MEDIUM_COVERAGE_BOOST
        : 0;

  return {
    exactMatch,
    coverage,
    matchedTokenCount: hits,
    totalTokenCount: terms.length,
    boost,
    isSignificant: exactMatch || highCoverage
  };
}
|
|
4478
|
+
// Tuning constants and stop-word list for phrase matching; populated lazily
// when init_phraseMatch() runs.
var PHRASE_MATCH_CONSTANTS, PHRASE_STOP_WORDS;
var init_phraseMatch = __esm(() => {
  PHRASE_MATCH_CONSTANTS = {
    // Additive score boosts applied by calculatePhraseMatch.
    EXACT_PHRASE_BOOST: 0.5,
    HIGH_COVERAGE_BOOST: 0.2,
    MEDIUM_COVERAGE_BOOST: 0.1,
    // Minimum fraction of query tokens found in the content for each tier.
    HIGH_COVERAGE_THRESHOLD: 0.8,
    MEDIUM_COVERAGE_THRESHOLD: 0.6,
    // Trimmed queries shorter than this many characters are never matched.
    MIN_QUERY_LENGTH: 3
  };
  PHRASE_STOP_WORDS = new Set([
    // articles
    "a", "an", "the",
    // prepositions
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    // conjunctions
    "and", "or", "but",
    // question words
    "what", "where", "when", "how", "why", "which", "who",
    // auxiliary verbs
    "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did",
    // pronouns
    "i", "you", "he", "she", "it", "we", "they",
    // demonstratives
    "this", "that", "these", "those"
  ]);
});
|
|
4538
|
+
|
|
4377
4539
|
// src/domain/services/index.ts
|
|
4378
4540
|
var init_services = __esm(() => {
|
|
4379
4541
|
init_keywords();
|
|
@@ -4384,6 +4546,7 @@ var init_services = __esm(() => {
|
|
|
4384
4546
|
init_lexicon2();
|
|
4385
4547
|
init_introspection();
|
|
4386
4548
|
init_configValidator();
|
|
4549
|
+
init_phraseMatch();
|
|
4387
4550
|
});
|
|
4388
4551
|
|
|
4389
4552
|
// src/modules/language/typescript/parseCode.ts
|
|
@@ -5202,9 +5365,21 @@ class TypeScriptModule {
|
|
|
5202
5365
|
const symbolicIndex = new SymbolicIndex(indexDir, this.id);
|
|
5203
5366
|
const literalIndex = new LiteralIndex(indexDir, this.id);
|
|
5204
5367
|
let literalMatchMap = new Map;
|
|
5368
|
+
let vocabularyScoreMap = new Map;
|
|
5205
5369
|
try {
|
|
5206
5370
|
await literalIndex.initialize();
|
|
5207
5371
|
literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
|
|
5372
|
+
const queryVocabulary = extractQueryVocabulary(query);
|
|
5373
|
+
if (queryVocabulary.length > 0) {
|
|
5374
|
+
const vocabMatches = literalIndex.findByVocabularyWords(queryVocabulary);
|
|
5375
|
+
for (const { entry, matchedWords } of vocabMatches) {
|
|
5376
|
+
const vocabScore = matchedWords.length / queryVocabulary.length;
|
|
5377
|
+
const existingScore = vocabularyScoreMap.get(entry.chunkId) || 0;
|
|
5378
|
+
if (vocabScore > existingScore) {
|
|
5379
|
+
vocabularyScoreMap.set(entry.chunkId, vocabScore);
|
|
5380
|
+
}
|
|
5381
|
+
}
|
|
5382
|
+
}
|
|
5208
5383
|
} catch {}
|
|
5209
5384
|
let allFiles;
|
|
5210
5385
|
try {
|
|
@@ -5284,18 +5459,20 @@ class TypeScriptModule {
|
|
|
5284
5459
|
for (const { filepath, chunk, embedding } of allChunksData) {
|
|
5285
5460
|
const semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
5286
5461
|
const bm25Score = bm25Scores.get(chunk.id) || 0;
|
|
5462
|
+
const vocabScore = vocabularyScoreMap.get(chunk.id) || 0;
|
|
5287
5463
|
const pathBoost = pathBoosts.get(filepath) || 0;
|
|
5464
|
+
const phraseMatch = calculatePhraseMatch(chunk.content, query);
|
|
5288
5465
|
const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
|
|
5289
5466
|
const chunkTypeBoost = calculateChunkTypeBoost(chunk);
|
|
5290
5467
|
const exportBoost = calculateExportBoost(chunk);
|
|
5291
|
-
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
|
|
5292
|
-
const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score;
|
|
5468
|
+
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
|
|
5469
|
+
const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore;
|
|
5293
5470
|
const literalMatches = literalMatchMap.get(chunk.id) || [];
|
|
5294
5471
|
const literalContribution = calculateLiteralContribution(literalMatches, true);
|
|
5295
5472
|
const boostedScore = applyLiteralBoost(baseScore, literalMatches, true);
|
|
5296
5473
|
const finalScore = boostedScore + additiveBoost;
|
|
5297
5474
|
processedChunkIds.add(chunk.id);
|
|
5298
|
-
if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0) {
|
|
5475
|
+
if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0 || vocabScore > VOCAB_THRESHOLD || phraseMatch.isSignificant) {
|
|
5299
5476
|
results.push({
|
|
5300
5477
|
filepath,
|
|
5301
5478
|
chunk,
|
|
@@ -5304,6 +5481,9 @@ class TypeScriptModule {
|
|
|
5304
5481
|
context: {
|
|
5305
5482
|
semanticScore,
|
|
5306
5483
|
bm25Score,
|
|
5484
|
+
vocabScore,
|
|
5485
|
+
phraseMatch: phraseMatch.exactMatch,
|
|
5486
|
+
phraseCoverage: phraseMatch.coverage,
|
|
5307
5487
|
pathBoost,
|
|
5308
5488
|
fileTypeBoost,
|
|
5309
5489
|
chunkTypeBoost,
|
|
@@ -5353,13 +5533,15 @@ class TypeScriptModule {
|
|
|
5353
5533
|
semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
5354
5534
|
}
|
|
5355
5535
|
const bm25Score = bm25Scores.get(chunkId) || 0;
|
|
5536
|
+
const vocabScore = vocabularyScoreMap.get(chunkId) || 0;
|
|
5537
|
+
const phraseMatch = calculatePhraseMatch(chunk.content, query);
|
|
5356
5538
|
const pathBoost = pathBoosts.get(filepath) || 0;
|
|
5357
5539
|
const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
|
|
5358
5540
|
const chunkTypeBoost = calculateChunkTypeBoost(chunk);
|
|
5359
5541
|
const exportBoost = calculateExportBoost(chunk);
|
|
5360
|
-
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
|
|
5542
|
+
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
|
|
5361
5543
|
const literalContribution = calculateLiteralContribution(chunkLiteralMatches, false);
|
|
5362
|
-
const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
|
|
5544
|
+
const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
|
|
5363
5545
|
const boostedScore = applyLiteralBoost(baseScore, chunkLiteralMatches, semanticScore > 0);
|
|
5364
5546
|
const finalScore = boostedScore + additiveBoost;
|
|
5365
5547
|
processedChunkIds.add(chunkId);
|
|
@@ -5371,6 +5553,9 @@ class TypeScriptModule {
|
|
|
5371
5553
|
context: {
|
|
5372
5554
|
semanticScore,
|
|
5373
5555
|
bm25Score,
|
|
5556
|
+
vocabScore,
|
|
5557
|
+
phraseMatch: phraseMatch.exactMatch,
|
|
5558
|
+
phraseCoverage: phraseMatch.coverage,
|
|
5374
5559
|
pathBoost,
|
|
5375
5560
|
fileTypeBoost,
|
|
5376
5561
|
chunkTypeBoost,
|
|
@@ -5411,7 +5596,7 @@ class TypeScriptModule {
|
|
|
5411
5596
|
return references;
|
|
5412
5597
|
}
|
|
5413
5598
|
}
|
|
5414
|
-
var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.
|
|
5599
|
+
var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15, VOCAB_THRESHOLD = 0.4, TYPESCRIPT_EXTENSIONS, supportsFile;
|
|
5415
5600
|
var init_typescript = __esm(() => {
|
|
5416
5601
|
init_embeddings();
|
|
5417
5602
|
init_services();
|
|
@@ -11486,6 +11671,7 @@ class MarkdownModule {
|
|
|
11486
11671
|
for (const { filepath, chunk, embedding } of allChunksData) {
|
|
11487
11672
|
const semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
11488
11673
|
const bm25Score = bm25Scores.get(chunk.id) || 0;
|
|
11674
|
+
const phraseMatch = calculatePhraseMatch(chunk.content, query);
|
|
11489
11675
|
let docBoost = 0;
|
|
11490
11676
|
if (queryTerms.some((t) => [
|
|
11491
11677
|
"docs",
|
|
@@ -11499,8 +11685,8 @@ class MarkdownModule {
|
|
|
11499
11685
|
docBoost = 0.05;
|
|
11500
11686
|
}
|
|
11501
11687
|
const headingBoost = calculateHeadingLevelBoost(chunk);
|
|
11502
|
-
const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost;
|
|
11503
|
-
if (hybridScore >= minScore || bm25Score > 0.3) {
|
|
11688
|
+
const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost + phraseMatch.boost;
|
|
11689
|
+
if (hybridScore >= minScore || bm25Score > 0.3 || phraseMatch.isSignificant) {
|
|
11504
11690
|
results.push({
|
|
11505
11691
|
filepath,
|
|
11506
11692
|
chunk,
|
|
@@ -11509,6 +11695,8 @@ class MarkdownModule {
|
|
|
11509
11695
|
context: {
|
|
11510
11696
|
semanticScore,
|
|
11511
11697
|
bm25Score,
|
|
11698
|
+
phraseMatch: phraseMatch.exactMatch,
|
|
11699
|
+
phraseCoverage: phraseMatch.coverage,
|
|
11512
11700
|
docBoost,
|
|
11513
11701
|
headingBoost,
|
|
11514
11702
|
headingLevel: chunk.metadata?.headingLevel
|
|
@@ -15048,7 +15236,7 @@ init_logger();
|
|
|
15048
15236
|
// package.json
|
|
15049
15237
|
var package_default = {
|
|
15050
15238
|
name: "raggrep",
|
|
15051
|
-
version: "0.13.2",
|
|
15239
|
+
version: "0.14.0",
|
|
15052
15240
|
description: "Local filesystem-based RAG system for codebases - semantic search using local embeddings",
|
|
15053
15241
|
type: "module",
|
|
15054
15242
|
main: "./dist/index.js",
|
|
@@ -15644,4 +15832,4 @@ Run 'raggrep <command> --help' for more information.
|
|
|
15644
15832
|
}
|
|
15645
15833
|
main();
|
|
15646
15834
|
|
|
15647
|
-
//# debugId=
|
|
15835
|
+
//# debugId=CF359982C72DD5D264756E2164756E21
|