raggrep 0.13.2 → 0.14.0

@@ -10,9 +10,10 @@ export { cosineSimilarity, euclideanDistance } from "./similarity";
  export { detectQueryIntent, extractQueryTerms, calculateFileTypeBoost, isSourceCodeFile, isDocFile, isDataFile, IMPLEMENTATION_TERMS, DOCUMENTATION_TERMS, SOURCE_CODE_EXTENSIONS, DOC_EXTENSIONS, DATA_EXTENSIONS, type QueryIntent, } from "./queryIntent";
  export { createLineBasedChunks, createSingleChunk, generateChunkId, DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, type TextChunk, type ChunkingOptions, } from "./chunking";
  export { parseQueryLiterals } from "./queryLiteralParser";
- export { extractLiterals, extractLiteralsWithReferences, extractVocabulary, } from "./literalExtractor";
+ export { extractLiterals, extractLiteralsWithReferences, extractVocabulary, extractQueryVocabulary, } from "./literalExtractor";
  export { calculateLiteralMultiplier, calculateMaxMultiplier, calculateLiteralContribution, calculateVocabularyMatch, applyLiteralBoost, mergeWithLiteralBoost, LITERAL_SCORING_CONSTANTS, type LiteralScoreContribution, type VocabularyMatchResult, type MergeInput, type MergeOutput, } from "./literalScorer";
  export { getSynonyms, expandQuery, DEFAULT_LEXICON, EXPANSION_WEIGHTS, DEFAULT_EXPANSION_OPTIONS, } from "./lexicon";
  export { extractJsonPaths, extractJsonKeywords } from "./jsonPathExtractor";
  export { introspectFile, findNearestReadme, introspectionToKeywords, detectScopeFromName, findProjectForFile, calculateIntrospectionBoost, type IntrospectFileOptions, } from "./introspection";
  export { validateConfig, formatValidationIssues, type ValidationIssue, type ValidationResult, } from "./configValidator";
+ export { calculatePhraseMatch, hasExactPhrase, calculateTokenCoverage, tokenizeForMatching, PHRASE_MATCH_CONSTANTS, type PhraseMatchResult, } from "./phraseMatch";
@@ -22,6 +22,26 @@ import type { ExtractedLiteral } from "../entities/literal";
  * @returns Array of unique vocabulary words (lowercase, length > 1)
  */
  export declare function extractVocabulary(literal: string): string[];
+ /**
+ * Extract vocabulary words from a natural language query.
+ *
+ * Unlike extractVocabulary (for identifiers), this:
+ * 1. Tokenizes the query into words
+ * 2. Filters out stop words
+ * 3. Handles both natural language and embedded identifiers
+ * 4. Returns unique, normalized vocabulary words
+ *
+ * @param query - The search query string
+ * @returns Array of unique vocabulary words (lowercase, length > 1)
+ *
+ * @example
+ * extractQueryVocabulary("where is user session validated")
+ * // → ["user", "session", "validated"]
+ *
+ * extractQueryVocabulary("find the authenticateUser function")
+ * // → ["authenticate", "user"] (identifier decomposed)
+ */
+ export declare function extractQueryVocabulary(query: string): string[];
  /**
  * Extract literals from a code chunk.
  *
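
The new extractQueryVocabulary export complements the identifier-oriented extractVocabulary. A minimal usage sketch based on the declaration and its JSDoc above; it assumes these functions surface at the package root, as the index re-exports in the first hunk suggest:

  import { extractVocabulary, extractQueryVocabulary } from "raggrep";

  // Identifier-oriented: decomposes camelCase/snake_case into words.
  extractVocabulary("authenticateUser");
  // → ["authenticate", "user"]

  // Query-oriented: per the JSDoc, also drops question/command stop
  // words such as "find", "the", and "function".
  extractQueryVocabulary("find the authenticateUser function");
  // → ["authenticate", "user"]
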
@@ -0,0 +1,99 @@
+ /**
+ * Phrase Matching Service
+ *
+ * Pure functions for content-based phrase matching. This enables
+ * exact phrase searches to find results even when semantic/BM25
+ * scores are low.
+ *
+ * @module domain/services/phraseMatch
+ */
+ /**
+ * Result of phrase matching analysis.
+ */
+ export interface PhraseMatchResult {
+ /** Whether the exact query phrase was found in content */
+ exactMatch: boolean;
+ /** Proportion of query tokens found in content (0-1) */
+ coverage: number;
+ /** Number of query tokens found in content */
+ matchedTokenCount: number;
+ /** Total number of tokens in query */
+ totalTokenCount: number;
+ /** Additive score boost based on match quality */
+ boost: number;
+ /** Whether this match is significant enough to bypass filters */
+ isSignificant: boolean;
+ }
+ /**
+ * Constants for phrase matching scoring.
+ */
+ export declare const PHRASE_MATCH_CONSTANTS: {
+ /** Major boost for exact phrase match */
+ readonly EXACT_PHRASE_BOOST: 0.5;
+ /** Boost for high token coverage (80%+) */
+ readonly HIGH_COVERAGE_BOOST: 0.2;
+ /** Boost for medium token coverage (60%+) */
+ readonly MEDIUM_COVERAGE_BOOST: 0.1;
+ /** Coverage threshold for "high" classification */
+ readonly HIGH_COVERAGE_THRESHOLD: 0.8;
+ /** Coverage threshold for "medium" classification */
+ readonly MEDIUM_COVERAGE_THRESHOLD: 0.6;
+ /** Minimum query length to consider for exact matching */
+ readonly MIN_QUERY_LENGTH: 3;
+ };
+ /**
+ * Tokenize a string into words for matching.
+ * Normalizes to lowercase and filters out punctuation.
+ *
+ * @param text - Text to tokenize
+ * @param filterStopWords - Whether to filter out stop words
+ * @returns Array of normalized tokens
+ */
+ export declare function tokenizeForMatching(text: string, filterStopWords?: boolean): string[];
+ /**
+ * Calculate phrase match score for content against a query.
+ *
+ * This function checks:
+ * 1. Exact phrase match (query substring in content)
+ * 2. Token coverage (what % of query tokens appear in content)
+ *
+ * @param content - The chunk content to search in
+ * @param query - The search query
+ * @returns PhraseMatchResult with match details and boost
+ *
+ * @example
+ * const result = calculatePhraseMatch(
+ * "This explains the authentication flow for new users",
+ * "authentication flow for new users"
+ * );
+ * // result.exactMatch = true
+ * // result.boost = 0.5 (EXACT_PHRASE_BOOST)
+ *
+ * @example
+ * const result = calculatePhraseMatch(
+ * "User authentication and session flow",
+ * "authentication flow for users"
+ * );
+ * // result.exactMatch = false
+ * // result.coverage = 0.75 (3/4 tokens found)
+ * // result.boost = 0.1 (MEDIUM_COVERAGE_BOOST)
+ */
+ export declare function calculatePhraseMatch(content: string, query: string): PhraseMatchResult;
+ /**
+ * Quick check if content might contain the query phrase.
+ * Useful for early filtering before full phrase matching.
+ *
+ * @param content - The chunk content
+ * @param query - The search query
+ * @returns true if exact phrase is found
+ */
+ export declare function hasExactPhrase(content: string, query: string): boolean;
+ /**
+ * Calculate token coverage between content and query.
+ * Faster than full phrase matching when only coverage is needed.
+ *
+ * @param content - The chunk content
+ * @param query - The search query
+ * @returns Coverage ratio (0-1)
+ */
+ export declare function calculateTokenCoverage(content: string, query: string): number;
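
A sketch of how the boost tiers compose, using only the constants and signatures declared above (the root import is an assumption; internal paths may differ):

  import { calculatePhraseMatch, PHRASE_MATCH_CONSTANTS } from "raggrep";

  // Exact tier: the query is a case-insensitive substring of the content.
  const exact = calculatePhraseMatch(
    "This explains the authentication flow for new users",
    "authentication flow for new users"
  );
  // exact.exactMatch === true
  // exact.boost === PHRASE_MATCH_CONSTANTS.EXACT_PHRASE_BOOST    // 0.5
  // exact.isSignificant === true

  // High-coverage tier: no substring match, but every non-stop-word
  // query token ("phrase", "boost", "coverage") appears in the content.
  const partial = calculatePhraseMatch(
    "token coverage and phrase boost logic",
    "phrase boost coverage"
  );
  // partial.exactMatch === false
  // partial.coverage === 1
  // partial.boost === PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_BOOST // 0.2
  // partial.isSignificant === true
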
@@ -0,0 +1,4 @@
+ /**
+ * Tests for Phrase Matching Service
+ */
+ export {};
package/dist/index.js CHANGED
@@ -2873,6 +2873,30 @@ function extractVocabulary(literal) {
  const filtered = words.filter((w) => w.length > 1);
  return [...new Set(filtered)];
  }
+ function extractQueryVocabulary(query) {
+ if (!query || query.trim() === "") {
+ return [];
+ }
+ const vocabularySet = new Set;
+ const tokens = query.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
+ for (const token of tokens) {
+ if (QUERY_STOP_WORDS.has(token)) {
+ continue;
+ }
+ const looksLikeIdentifier = /[A-Z]/.test(token) || token.includes("_") || token.includes("-");
+ if (looksLikeIdentifier) {
+ const vocabWords = extractVocabulary(token);
+ for (const word of vocabWords) {
+ if (!QUERY_STOP_WORDS.has(word)) {
+ vocabularySet.add(word);
+ }
+ }
+ } else {
+ vocabularySet.add(token);
+ }
+ }
+ return Array.from(vocabularySet);
+ }
  function extractLiterals(chunk) {
  const literals = [];
  if (chunk.name) {
@@ -2887,7 +2911,7 @@ function extractLiterals(chunk) {
  }
  return literals;
  }
- var COMMON_ABBREVIATIONS, STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
+ var COMMON_ABBREVIATIONS, STOP_WORDS, QUERY_STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
  var init_literalExtractor = __esm(() => {
  COMMON_ABBREVIATIONS = new Set([
  "id",
@@ -2936,6 +2960,37 @@ var init_literalExtractor = __esm(() => {
  "as",
  "if"
  ]);
+ QUERY_STOP_WORDS = new Set([
+ ...STOP_WORDS,
+ "what",
+ "where",
+ "when",
+ "how",
+ "why",
+ "which",
+ "who",
+ "find",
+ "show",
+ "get",
+ "list",
+ "search",
+ "and",
+ "but",
+ "with",
+ "from",
+ "that",
+ "this",
+ "these",
+ "those",
+ "it",
+ "its",
+ "code",
+ "file",
+ "function",
+ "class",
+ "method",
+ "variable"
+ ]);
  CHUNK_TYPE_TO_LITERAL_TYPE = {
  class: "className",
  function: "functionName",
@@ -3649,6 +3704,113 @@ function extractJsonKeywords(obj) {
  // src/domain/services/configValidator.ts
  var init_configValidator = () => {};

+ // src/domain/services/phraseMatch.ts
+ function tokenizeForMatching(text, filterStopWords = true) {
+ if (!text || text.trim() === "") {
+ return [];
+ }
+ const tokens = text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
+ if (filterStopWords) {
+ return tokens.filter((t) => !PHRASE_STOP_WORDS.has(t));
+ }
+ return tokens;
+ }
+ function calculatePhraseMatch(content, query) {
+ if (!content || !query || query.trim().length < PHRASE_MATCH_CONSTANTS.MIN_QUERY_LENGTH) {
+ return {
+ exactMatch: false,
+ coverage: 0,
+ matchedTokenCount: 0,
+ totalTokenCount: 0,
+ boost: 0,
+ isSignificant: false
+ };
+ }
+ const contentLower = content.toLowerCase();
+ const queryLower = query.toLowerCase().trim();
+ const exactMatch = contentLower.includes(queryLower);
+ const queryTokens = tokenizeForMatching(query, true);
+ const matchedTokens = queryTokens.filter((token) => contentLower.includes(token));
+ const coverage = queryTokens.length > 0 ? matchedTokens.length / queryTokens.length : 0;
+ let boost = 0;
+ if (exactMatch) {
+ boost = PHRASE_MATCH_CONSTANTS.EXACT_PHRASE_BOOST;
+ } else if (coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD) {
+ boost = PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_BOOST;
+ } else if (coverage >= PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_THRESHOLD) {
+ boost = PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_BOOST;
+ }
+ const isSignificant = exactMatch || coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD;
+ return {
+ exactMatch,
+ coverage,
+ matchedTokenCount: matchedTokens.length,
+ totalTokenCount: queryTokens.length,
+ boost,
+ isSignificant
+ };
+ }
+ var PHRASE_MATCH_CONSTANTS, PHRASE_STOP_WORDS;
+ var init_phraseMatch = __esm(() => {
+ PHRASE_MATCH_CONSTANTS = {
+ EXACT_PHRASE_BOOST: 0.5,
+ HIGH_COVERAGE_BOOST: 0.2,
+ MEDIUM_COVERAGE_BOOST: 0.1,
+ HIGH_COVERAGE_THRESHOLD: 0.8,
+ MEDIUM_COVERAGE_THRESHOLD: 0.6,
+ MIN_QUERY_LENGTH: 3
+ };
+ PHRASE_STOP_WORDS = new Set([
+ "a",
+ "an",
+ "the",
+ "in",
+ "on",
+ "at",
+ "to",
+ "for",
+ "of",
+ "with",
+ "by",
+ "from",
+ "as",
+ "and",
+ "or",
+ "but",
+ "what",
+ "where",
+ "when",
+ "how",
+ "why",
+ "which",
+ "who",
+ "is",
+ "are",
+ "was",
+ "were",
+ "be",
+ "been",
+ "being",
+ "have",
+ "has",
+ "had",
+ "do",
+ "does",
+ "did",
+ "i",
+ "you",
+ "he",
+ "she",
+ "it",
+ "we",
+ "they",
+ "this",
+ "that",
+ "these",
+ "those"
+ ]);
+ });
+
  // src/domain/services/index.ts
  var init_services = __esm(() => {
  init_keywords();
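
tokenizeForMatching above lowercases, turns punctuation into spaces, splits on whitespace, drops single-character tokens, and (by default) filters PHRASE_STOP_WORDS. A quick trace of that pipeline, matching the code above:

  tokenizeForMatching("Where is the user's session validated?");
  // lowercase + strip punctuation → "where is the user s session validated"
  // drop 1-char tokens            → ["where", "is", "the", "user", "session", "validated"]
  // drop PHRASE_STOP_WORDS        → ["user", "session", "validated"]

  tokenizeForMatching("Where is the user's session validated?", false);
  // → ["where", "is", "the", "user", "session", "validated"]
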
@@ -3659,6 +3821,7 @@ var init_services = __esm(() => {
  init_lexicon2();
  init_introspection();
  init_configValidator();
+ init_phraseMatch();
  });

  // src/modules/language/typescript/parseCode.ts
@@ -4477,9 +4640,21 @@ class TypeScriptModule {
  const symbolicIndex = new SymbolicIndex(indexDir, this.id);
  const literalIndex = new LiteralIndex(indexDir, this.id);
  let literalMatchMap = new Map;
+ let vocabularyScoreMap = new Map;
  try {
  await literalIndex.initialize();
  literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
+ const queryVocabulary = extractQueryVocabulary(query);
+ if (queryVocabulary.length > 0) {
+ const vocabMatches = literalIndex.findByVocabularyWords(queryVocabulary);
+ for (const { entry, matchedWords } of vocabMatches) {
+ const vocabScore = matchedWords.length / queryVocabulary.length;
+ const existingScore = vocabularyScoreMap.get(entry.chunkId) || 0;
+ if (vocabScore > existingScore) {
+ vocabularyScoreMap.set(entry.chunkId, vocabScore);
+ }
+ }
+ }
  } catch {}
  let allFiles;
  try {
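
The new block above folds vocabulary matches into a per-chunk score, keeping the best ratio of matched words to query vocabulary. A self-contained sketch of that fold; the entry shapes are inferred from the loop, not a documented API:

  type VocabMatch = { entry: { chunkId: string }; matchedWords: string[] };

  function buildVocabularyScoreMap(matches: VocabMatch[], queryVocabulary: string[]): Map<string, number> {
    const scores = new Map<string, number>();
    for (const { entry, matchedWords } of matches) {
      const score = matchedWords.length / queryVocabulary.length;
      // Keep the maximum score seen for each chunk.
      if (score > (scores.get(entry.chunkId) ?? 0)) scores.set(entry.chunkId, score);
    }
    return scores;
  }

  buildVocabularyScoreMap(
    [
      { entry: { chunkId: "a" }, matchedWords: ["user", "session"] },
      { entry: { chunkId: "a" }, matchedWords: ["user", "session", "validated"] },
      { entry: { chunkId: "b" }, matchedWords: ["validated"] },
    ],
    ["user", "session", "validated"]
  );
  // → Map { "a" → 1, "b" → 0.333… }
  // Chunks with score > VOCAB_THRESHOLD (0.4) later bypass the minScore filter.
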
@@ -4559,18 +4734,20 @@ class TypeScriptModule {
  for (const { filepath, chunk, embedding } of allChunksData) {
  const semanticScore = cosineSimilarity(queryEmbedding, embedding);
  const bm25Score = bm25Scores.get(chunk.id) || 0;
+ const vocabScore = vocabularyScoreMap.get(chunk.id) || 0;
  const pathBoost = pathBoosts.get(filepath) || 0;
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
  const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
  const chunkTypeBoost = calculateChunkTypeBoost(chunk);
  const exportBoost = calculateExportBoost(chunk);
- const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
- const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score;
+ const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
+ const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore;
  const literalMatches = literalMatchMap.get(chunk.id) || [];
  const literalContribution = calculateLiteralContribution(literalMatches, true);
  const boostedScore = applyLiteralBoost(baseScore, literalMatches, true);
  const finalScore = boostedScore + additiveBoost;
  processedChunkIds.add(chunk.id);
- if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0) {
+ if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0 || vocabScore > VOCAB_THRESHOLD || phraseMatch.isSignificant) {
  results.push({
  filepath,
  chunk,
@@ -4579,6 +4756,9 @@ class TypeScriptModule {
  context: {
  semanticScore,
  bm25Score,
+ vocabScore,
+ phraseMatch: phraseMatch.exactMatch,
+ phraseCoverage: phraseMatch.coverage,
  pathBoost,
  fileTypeBoost,
  chunkTypeBoost,
@@ -4628,13 +4808,15 @@ class TypeScriptModule {
  semanticScore = cosineSimilarity(queryEmbedding, embedding);
  }
  const bm25Score = bm25Scores.get(chunkId) || 0;
+ const vocabScore = vocabularyScoreMap.get(chunkId) || 0;
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
  const pathBoost = pathBoosts.get(filepath) || 0;
  const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
  const chunkTypeBoost = calculateChunkTypeBoost(chunk);
  const exportBoost = calculateExportBoost(chunk);
- const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
+ const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
  const literalContribution = calculateLiteralContribution(chunkLiteralMatches, false);
- const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
+ const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
  const boostedScore = applyLiteralBoost(baseScore, chunkLiteralMatches, semanticScore > 0);
  const finalScore = boostedScore + additiveBoost;
  processedChunkIds.add(chunkId);
@@ -4646,6 +4828,9 @@ class TypeScriptModule {
  context: {
  semanticScore,
  bm25Score,
+ vocabScore,
+ phraseMatch: phraseMatch.exactMatch,
+ phraseCoverage: phraseMatch.coverage,
  pathBoost,
  fileTypeBoost,
  chunkTypeBoost,
@@ -4686,7 +4871,7 @@ class TypeScriptModule {
  return references;
  }
  }
- var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.7, BM25_WEIGHT = 0.3, TYPESCRIPT_EXTENSIONS, supportsFile;
+ var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15, VOCAB_THRESHOLD = 0.4, TYPESCRIPT_EXTENSIONS, supportsFile;
  var init_typescript = __esm(() => {
  init_embeddings();
  init_services();
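
The weight rebalance above (SEMANTIC_WEIGHT 0.7 → 0.6, BM25_WEIGHT 0.3 → 0.25) frees 0.15 for the new vocabulary signal, and phraseMatch.boost now rides on the additive boosts. A worked example with hypothetical scores, applying the formulas from the hunks above (literal boosting via applyLiteralBoost is left out for brevity):

  const SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15;

  const semanticScore = 0.4;
  const bm25Score = 0.2;
  const vocabScore = 1.0; // every query vocabulary word matched
  const baseScore =
    SEMANTIC_WEIGHT * semanticScore + // 0.24
    BM25_WEIGHT * bm25Score +         // 0.05
    VOCAB_WEIGHT * vocabScore;        // 0.15 → baseScore = 0.44

  const pathBoost = 0.05;
  const phraseBoost = 0.5; // EXACT_PHRASE_BOOST for an exact phrase hit
  const additiveBoost = pathBoost + phraseBoost; // other boosts assumed 0

  const finalScore = baseScore + additiveBoost;  // 0.99, well above DEFAULT_MIN_SCORE2 (0.15)
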
@@ -10761,6 +10946,7 @@ class MarkdownModule {
  for (const { filepath, chunk, embedding } of allChunksData) {
  const semanticScore = cosineSimilarity(queryEmbedding, embedding);
  const bm25Score = bm25Scores.get(chunk.id) || 0;
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
  let docBoost = 0;
  if (queryTerms.some((t) => [
  "docs",
@@ -10774,8 +10960,8 @@ class MarkdownModule {
  docBoost = 0.05;
  }
  const headingBoost = calculateHeadingLevelBoost(chunk);
- const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost;
- if (hybridScore >= minScore || bm25Score > 0.3) {
+ const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost + phraseMatch.boost;
+ if (hybridScore >= minScore || bm25Score > 0.3 || phraseMatch.isSignificant) {
  results.push({
  filepath,
  chunk,
@@ -10784,6 +10970,8 @@ class MarkdownModule {
  context: {
  semanticScore,
  bm25Score,
+ phraseMatch: phraseMatch.exactMatch,
+ phraseCoverage: phraseMatch.coverage,
  docBoost,
  headingBoost,
  headingLevel: chunk.metadata?.headingLevel
@@ -14397,4 +14585,4 @@ export {
  ConsoleLogger
  };

- //# debugId=CA60BFDCCC29D83C64756E2164756E21
+ //# debugId=EED23FCAC08F026464756E2164756E21