npm - raggrep - Versions diffs - 0.13.2 → 0.14.0 - Mend

raggrep 0.13.2 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/README.md +2 -1
package/dist/cli/main.js +199 -11
package/dist/cli/main.js.map +8 -7
package/dist/domain/services/index.d.ts +2 -1
package/dist/domain/services/literalExtractor.d.ts +20 -0
package/dist/domain/services/phraseMatch.d.ts +99 -0
package/dist/domain/services/phraseMatch.test.d.ts +4 -0
package/dist/index.js +198 -10
package/dist/index.js.map +8 -7
package/dist/tests/simulation-phrase-matching.test.d.ts +14 -0
package/dist/tests/simulation-vocabulary.test.d.ts +17 -0
package/dist/tests/vocabulary-scoring.test.d.ts +16 -0
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -8,12 +8,13 @@ RAGgrep indexes your code and lets you search it using natural language. Everyth
 - **Zero-config search** — Just run `raggrep query` and it works. Index is created and updated automatically.
 - **Multi-language support** — Deep understanding of TypeScript, JavaScript, Python, Go, and Rust with AST-aware parsing.
-- **Vocabulary-based search** — Search `user` to find `getUserById`, `fetchUserData`, `UserService`, etc. Understands code naming conventions.
+- **Vocabulary-based search** — Search `user` to find `getUserById`, `fetchUserData`, `UserService`, etc. Natural language queries like "where is user session validated" find `validateUserSession()`.
 - **Local-first** — All indexing and search happens on your machine. No cloud dependencies.
 - **Incremental** — Only re-indexes files that have changed. Instant search when nothing changed.
 - **Watch mode** — Keep the index fresh in real-time as you code.
 - **Hybrid search** — Combines semantic similarity with keyword matching for best results.
 - **Literal boosting** — Exact identifier matches get priority. Use backticks for precise matching: `` `AuthService` ``.
+- **Phrase matching** — Exact phrases in documentation are found even when semantic similarity is low.
 - **Semantic expansion** — Domain-specific synonyms improve recall (function ↔ method, auth ↔ authentication).
 ## Installation

package/dist/cli/main.js CHANGED Viewed

@@ -3598,6 +3598,30 @@ function extractVocabulary(literal) {
   const filtered = words.filter((w) => w.length > 1);
   return [...new Set(filtered)];
 }
+function extractQueryVocabulary(query) { // Returns the unique, non-stop-word vocabulary words found in a free-text query (array of lowercase strings).
+  if (!query || query.trim() === "") { // missing or whitespace-only query yields no vocabulary
+    return [];
+  }
+  const vocabularySet = new Set; // a Set so that repeated words are only reported once
+  const tokens = query.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1); // lowercase, turn every non-word/non-space char into a space, split on whitespace, drop tokens shorter than 2 chars
+  for (const token of tokens) {
+    if (QUERY_STOP_WORDS.has(token)) { // skip question/filler words before any further processing
+      continue;
+    }
+    const looksLikeIdentifier = /[A-Z]/.test(token) || token.includes("_") || token.includes("-"); // NOTE(review): tokens were already lowercased and "-" was replaced by a space above, so only the "_" test can be true here; confirm whether camelCase / kebab-case splitting was intended
+    if (looksLikeIdentifier) {
+      const vocabWords = extractVocabulary(token); // split the identifier-like token into its component words
+      for (const word of vocabWords) {
+        if (!QUERY_STOP_WORDS.has(word)) { // component words are stop-word filtered as well
+          vocabularySet.add(word);
+        }
+      }
+    } else {
+      vocabularySet.add(token); // plain word: keep as-is
+    }
+  }
+  return Array.from(vocabularySet); // Set iteration order is insertion order, so words come back in first-seen order
+}
 function extractLiterals(chunk) {
   const literals = [];
   if (chunk.name) {
@@ -3612,7 +3636,7 @@ function extractLiterals(chunk) {
   }
   return literals;
 }
-var COMMON_ABBREVIATIONS, STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
+var COMMON_ABBREVIATIONS, STOP_WORDS, QUERY_STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
 var init_literalExtractor = __esm(() => {
   COMMON_ABBREVIATIONS = new Set([
     "id",
@@ -3661,6 +3685,37 @@ var init_literalExtractor = __esm(() => {
     "as",
     "if"
   ]);
+  QUERY_STOP_WORDS = new Set([
+    ...STOP_WORDS,
+    "what",
+    "where",
+    "when",
+    "how",
+    "why",
+    "which",
+    "who",
+    "find",
+    "show",
+    "get",
+    "list",
+    "search",
+    "and",
+    "but",
+    "with",
+    "from",
+    "that",
+    "this",
+    "these",
+    "those",
+    "it",
+    "its",
+    "code",
+    "file",
+    "function",
+    "class",
+    "method",
+    "variable"
+  ]);
   CHUNK_TYPE_TO_LITERAL_TYPE = {
     class: "className",
     function: "functionName",
@@ -4374,6 +4429,113 @@ function extractJsonKeywords(obj) {
 // src/domain/services/configValidator.ts
 var init_configValidator = () => {};
+// src/domain/services/phraseMatch.ts
+function tokenizeForMatching(text, filterStopWords = true) { // Splits text into lowercase word tokens; when filterStopWords is true (the default) PHRASE_STOP_WORDS are removed.
+  if (!text || text.trim() === "") { // missing or whitespace-only text yields no tokens
+    return [];
+  }
+  const tokens = text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1); // lowercase, replace non-word/non-space chars with spaces, split on whitespace, drop tokens shorter than 2 chars
+  if (filterStopWords) {
+    return tokens.filter((t) => !PHRASE_STOP_WORDS.has(t)); // duplicates are kept; only stop words are dropped
+  }
+  return tokens; // unfiltered token list (still lowercase, still without 1-char tokens)
+}
+function calculatePhraseMatch(content, query) { // Scores how well `query` appears in `content`; returns { exactMatch, coverage, matchedTokenCount, totalTokenCount, boost, isSignificant }.
+  if (!content || !query || query.trim().length < PHRASE_MATCH_CONSTANTS.MIN_QUERY_LENGTH) { // empty inputs, or a trimmed query shorter than MIN_QUERY_LENGTH characters, get the all-zero result
+    return {
+      exactMatch: false,
+      coverage: 0,
+      matchedTokenCount: 0,
+      totalTokenCount: 0,
+      boost: 0,
+      isSignificant: false
+    };
+  }
+  const contentLower = content.toLowerCase(); // all comparisons below are case-insensitive
+  const queryLower = query.toLowerCase().trim();
+  const exactMatch = contentLower.includes(queryLower); // true when the whole trimmed query occurs verbatim as a substring of the content
+  const queryTokens = tokenizeForMatching(query, true); // stop words removed, so coverage is measured over the remaining tokens only
+  const matchedTokens = queryTokens.filter((token) => contentLower.includes(token)); // substring test: a token also counts as matched when it occurs inside a longer word
+  const coverage = queryTokens.length > 0 ? matchedTokens.length / queryTokens.length : 0; // fraction of query tokens found; 0 when the query had no usable tokens (avoids dividing by zero)
+  let boost = 0;
+  if (exactMatch) { // tiers are checked from strongest to weakest; the first one that applies sets the boost
+    boost = PHRASE_MATCH_CONSTANTS.EXACT_PHRASE_BOOST;
+  } else if (coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD) {
+    boost = PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_BOOST;
+  } else if (coverage >= PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_THRESHOLD) {
+    boost = PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_BOOST;
+  }
+  const isSignificant = exactMatch || coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD; // medium coverage earns a boost but is not flagged as significant
+  return {
+    exactMatch,
+    coverage,
+    matchedTokenCount: matchedTokens.length,
+    totalTokenCount: queryTokens.length,
+    boost,
+    isSignificant
+  };
+}
+var PHRASE_MATCH_CONSTANTS, PHRASE_STOP_WORDS; // declared here, assigned inside init_phraseMatch below
+var init_phraseMatch = __esm(() => { // NOTE(review): __esm is defined outside this view; presumably the bundler's module-initializer wrapper — confirm
+  PHRASE_MATCH_CONSTANTS = { // tuning values read by calculatePhraseMatch
+    EXACT_PHRASE_BOOST: 0.5, // boost when the whole query occurs verbatim in the content
+    HIGH_COVERAGE_BOOST: 0.2, // boost when coverage >= HIGH_COVERAGE_THRESHOLD
+    MEDIUM_COVERAGE_BOOST: 0.1, // boost when coverage >= MEDIUM_COVERAGE_THRESHOLD
+    HIGH_COVERAGE_THRESHOLD: 0.8, // also the coverage level at which a match is flagged isSignificant
+    MEDIUM_COVERAGE_THRESHOLD: 0.6,
+    MIN_QUERY_LENGTH: 3 // minimum trimmed query length, in characters, for phrase matching to run
+  };
+  PHRASE_STOP_WORDS = new Set([ // words removed by tokenizeForMatching when stop-word filtering is on
+    "a", // one-letter entries ("a", "i") are already dropped by the tokenizer's length > 1 filter
+    "an",
+    "the",
+    "in",
+    "on",
+    "at",
+    "to",
+    "for",
+    "of",
+    "with",
+    "by",
+    "from",
+    "as",
+    "and",
+    "or",
+    "but",
+    "what",
+    "where",
+    "when",
+    "how",
+    "why",
+    "which",
+    "who",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "been",
+    "being",
+    "have",
+    "has",
+    "had",
+    "do",
+    "does",
+    "did",
+    "i",
+    "you",
+    "he",
+    "she",
+    "it",
+    "we",
+    "they",
+    "this",
+    "that",
+    "these",
+    "those"
+  ]);
+});
 // src/domain/services/index.ts
 var init_services = __esm(() => {
   init_keywords();
@@ -4384,6 +4546,7 @@ var init_services = __esm(() => {
   init_lexicon2();
   init_introspection();
   init_configValidator();
+  init_phraseMatch();
 });
 // src/modules/language/typescript/parseCode.ts
@@ -5202,9 +5365,21 @@ class TypeScriptModule {
     const symbolicIndex = new SymbolicIndex(indexDir, this.id);
     const literalIndex = new LiteralIndex(indexDir, this.id);
     let literalMatchMap = new Map;
+    let vocabularyScoreMap = new Map;
     try {
       await literalIndex.initialize();
       literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
+      const queryVocabulary = extractQueryVocabulary(query);
+      if (queryVocabulary.length > 0) {
+        const vocabMatches = literalIndex.findByVocabularyWords(queryVocabulary);
+        for (const { entry, matchedWords } of vocabMatches) {
+          const vocabScore = matchedWords.length / queryVocabulary.length;
+          const existingScore = vocabularyScoreMap.get(entry.chunkId) || 0;
+          if (vocabScore > existingScore) {
+            vocabularyScoreMap.set(entry.chunkId, vocabScore);
+          }
+        }
+      }
     } catch {}
     let allFiles;
     try {
@@ -5284,18 +5459,20 @@ class TypeScriptModule {
     for (const { filepath, chunk, embedding } of allChunksData) {
       const semanticScore = cosineSimilarity(queryEmbedding, embedding);
       const bm25Score = bm25Scores.get(chunk.id) || 0;
+      const vocabScore = vocabularyScoreMap.get(chunk.id) || 0;
       const pathBoost = pathBoosts.get(filepath) || 0;
+      const phraseMatch = calculatePhraseMatch(chunk.content, query);
       const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
       const chunkTypeBoost = calculateChunkTypeBoost(chunk);
       const exportBoost = calculateExportBoost(chunk);
-      const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
-      const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score;
+      const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
+      const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore;
       const literalMatches = literalMatchMap.get(chunk.id) || [];
       const literalContribution = calculateLiteralContribution(literalMatches, true);
       const boostedScore = applyLiteralBoost(baseScore, literalMatches, true);
       const finalScore = boostedScore + additiveBoost;
       processedChunkIds.add(chunk.id);
-      if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0) {
+      if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0 || vocabScore > VOCAB_THRESHOLD || phraseMatch.isSignificant) {
         results.push({
           filepath,
           chunk,
@@ -5304,6 +5481,9 @@ class TypeScriptModule {
           context: {
             semanticScore,
             bm25Score,
+            vocabScore,
+            phraseMatch: phraseMatch.exactMatch,
+            phraseCoverage: phraseMatch.coverage,
             pathBoost,
             fileTypeBoost,
             chunkTypeBoost,
@@ -5353,13 +5533,15 @@ class TypeScriptModule {
           semanticScore = cosineSimilarity(queryEmbedding, embedding);
         }
         const bm25Score = bm25Scores.get(chunkId) || 0;
+        const vocabScore = vocabularyScoreMap.get(chunkId) || 0;
+        const phraseMatch = calculatePhraseMatch(chunk.content, query);
         const pathBoost = pathBoosts.get(filepath) || 0;
         const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
         const chunkTypeBoost = calculateChunkTypeBoost(chunk);
         const exportBoost = calculateExportBoost(chunk);
-        const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
+        const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
         const literalContribution = calculateLiteralContribution(chunkLiteralMatches, false);
-        const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
+        const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
         const boostedScore = applyLiteralBoost(baseScore, chunkLiteralMatches, semanticScore > 0);
         const finalScore = boostedScore + additiveBoost;
         processedChunkIds.add(chunkId);
@@ -5371,6 +5553,9 @@ class TypeScriptModule {
           context: {
             semanticScore,
             bm25Score,
+            vocabScore,
+            phraseMatch: phraseMatch.exactMatch,
+            phraseCoverage: phraseMatch.coverage,
             pathBoost,
             fileTypeBoost,
             chunkTypeBoost,
@@ -5411,7 +5596,7 @@ class TypeScriptModule {
     return references;
   }
 }
-var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.7, BM25_WEIGHT = 0.3, TYPESCRIPT_EXTENSIONS, supportsFile;
+var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15, VOCAB_THRESHOLD = 0.4, TYPESCRIPT_EXTENSIONS, supportsFile;
 var init_typescript = __esm(() => {
   init_embeddings();
   init_services();
@@ -11486,6 +11671,7 @@ class MarkdownModule {
     for (const { filepath, chunk, embedding } of allChunksData) {
       const semanticScore = cosineSimilarity(queryEmbedding, embedding);
       const bm25Score = bm25Scores.get(chunk.id) || 0;
+      const phraseMatch = calculatePhraseMatch(chunk.content, query);
       let docBoost = 0;
       if (queryTerms.some((t) => [
         "docs",
@@ -11499,8 +11685,8 @@ class MarkdownModule {
         docBoost = 0.05;
       }
       const headingBoost = calculateHeadingLevelBoost(chunk);
-      const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost;
-      if (hybridScore >= minScore || bm25Score > 0.3) {
+      const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost + phraseMatch.boost;
+      if (hybridScore >= minScore || bm25Score > 0.3 || phraseMatch.isSignificant) {
         results.push({
           filepath,
           chunk,
@@ -11509,6 +11695,8 @@ class MarkdownModule {
           context: {
             semanticScore,
             bm25Score,
+            phraseMatch: phraseMatch.exactMatch,
+            phraseCoverage: phraseMatch.coverage,
             docBoost,
             headingBoost,
             headingLevel: chunk.metadata?.headingLevel
@@ -15048,7 +15236,7 @@ init_logger();
 // package.json
 var package_default = {
   name: "raggrep",
-  version: "0.13.2",
+  version: "0.14.0",
   description: "Local filesystem-based RAG system for codebases - semantic search using local embeddings",
   type: "module",
   main: "./dist/index.js",
@@ -15644,4 +15832,4 @@ Run 'raggrep <command> --help' for more information.
 }
 main();
-//# debugId=5CD6138213DBFFD864756E2164756E21
+//# debugId=CF359982C72DD5D264756E2164756E21