raggrep 0.13.2 → 0.14.0

@@ -10,9 +10,10 @@ export { cosineSimilarity, euclideanDistance } from "./similarity";
  export { detectQueryIntent, extractQueryTerms, calculateFileTypeBoost, isSourceCodeFile, isDocFile, isDataFile, IMPLEMENTATION_TERMS, DOCUMENTATION_TERMS, SOURCE_CODE_EXTENSIONS, DOC_EXTENSIONS, DATA_EXTENSIONS, type QueryIntent, } from "./queryIntent";
  export { createLineBasedChunks, createSingleChunk, generateChunkId, DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, type TextChunk, type ChunkingOptions, } from "./chunking";
  export { parseQueryLiterals } from "./queryLiteralParser";
- export { extractLiterals, extractLiteralsWithReferences, extractVocabulary, } from "./literalExtractor";
+ export { extractLiterals, extractLiteralsWithReferences, extractVocabulary, extractQueryVocabulary, } from "./literalExtractor";
  export { calculateLiteralMultiplier, calculateMaxMultiplier, calculateLiteralContribution, calculateVocabularyMatch, applyLiteralBoost, mergeWithLiteralBoost, LITERAL_SCORING_CONSTANTS, type LiteralScoreContribution, type VocabularyMatchResult, type MergeInput, type MergeOutput, } from "./literalScorer";
  export { getSynonyms, expandQuery, DEFAULT_LEXICON, EXPANSION_WEIGHTS, DEFAULT_EXPANSION_OPTIONS, } from "./lexicon";
  export { extractJsonPaths, extractJsonKeywords } from "./jsonPathExtractor";
  export { introspectFile, findNearestReadme, introspectionToKeywords, detectScopeFromName, findProjectForFile, calculateIntrospectionBoost, type IntrospectFileOptions, } from "./introspection";
  export { validateConfig, formatValidationIssues, type ValidationIssue, type ValidationResult, } from "./configValidator";
+ export { calculatePhraseMatch, hasExactPhrase, calculateTokenCoverage, tokenizeForMatching, PHRASE_MATCH_CONSTANTS, type PhraseMatchResult, } from "./phraseMatch";
@@ -22,6 +22,26 @@ import type { ExtractedLiteral } from "../entities/literal";
  * @returns Array of unique vocabulary words (lowercase, length > 1)
  */
  export declare function extractVocabulary(literal: string): string[];
+ /**
+ * Extract vocabulary words from a natural language query.
+ *
+ * Unlike extractVocabulary (for identifiers), this:
+ * 1. Tokenizes the query into words
+ * 2. Filters out stop words
+ * 3. Handles both natural language and embedded identifiers
+ * 4. Returns unique, normalized vocabulary words
+ *
+ * @param query - The search query string
+ * @returns Array of unique vocabulary words (lowercase, length > 1)
+ *
+ * @example
+ * extractQueryVocabulary("where is user session validated")
+ * // → ["user", "session", "validated"]
+ *
+ * extractQueryVocabulary("find the authenticateUser function")
+ * // → ["authenticate", "user"] (identifier decomposed)
+ */
+ export declare function extractQueryVocabulary(query: string): string[];
  /**
  * Extract literals from a code chunk.
  *
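
The new extractQueryVocabulary export complements the identifier-oriented extractVocabulary. A minimal usage sketch based on the declaration and its JSDoc above; it assumes these functions surface at the package root, as the index re-exports in the first hunk suggest:

  import { extractVocabulary, extractQueryVocabulary } from "raggrep";

  // Identifier-oriented: decomposes camelCase/snake_case into words.
  extractVocabulary("authenticateUser");
  // → ["authenticate", "user"]

  // Query-oriented: per the JSDoc, also drops question/command stop
  // words such as "find", "the", and "function".
  extractQueryVocabulary("find the authenticateUser function");
  // → ["authenticate", "user"]
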
@@ -0,0 +1,99 @@
+ /**
+ * Phrase Matching Service
+ *
+ * Pure functions for content-based phrase matching. This enables
+ * exact phrase searches to find results even when semantic/BM25
+ * scores are low.
+ *
+ * @module domain/services/phraseMatch
+ */
+ /**
+ * Result of phrase matching analysis.
+ */
+ export interface PhraseMatchResult {
+ /** Whether the exact query phrase was found in content */
+ exactMatch: boolean;
+ /** Proportion of query tokens found in content (0-1) */
+ coverage: number;
+ /** Number of query tokens found in content */
+ matchedTokenCount: number;
+ /** Total number of tokens in query */
+ totalTokenCount: number;
+ /** Additive score boost based on match quality */
+ boost: number;
+ /** Whether this match is significant enough to bypass filters */
+ isSignificant: boolean;
+ }
+ /**
+ * Constants for phrase matching scoring.
+ */
+ export declare const PHRASE_MATCH_CONSTANTS: {
+ /** Major boost for exact phrase match */
+ readonly EXACT_PHRASE_BOOST: 0.5;
+ /** Boost for high token coverage (80%+) */
+ readonly HIGH_COVERAGE_BOOST: 0.2;
+ /** Boost for medium token coverage (60%+) */
+ readonly MEDIUM_COVERAGE_BOOST: 0.1;
+ /** Coverage threshold for "high" classification */
+ readonly HIGH_COVERAGE_THRESHOLD: 0.8;
+ /** Coverage threshold for "medium" classification */
+ readonly MEDIUM_COVERAGE_THRESHOLD: 0.6;
+ /** Minimum query length to consider for exact matching */
+ readonly MIN_QUERY_LENGTH: 3;
+ };
+ /**
+ * Tokenize a string into words for matching.
+ * Normalizes to lowercase and filters out punctuation.
+ *
+ * @param text - Text to tokenize
+ * @param filterStopWords - Whether to filter out stop words
+ * @returns Array of normalized tokens
+ */
+ export declare function tokenizeForMatching(text: string, filterStopWords?: boolean): string[];
+ /**
+ * Calculate phrase match score for content against a query.
+ *
+ * This function checks:
+ * 1. Exact phrase match (query substring in content)
+ * 2. Token coverage (what % of query tokens appear in content)
+ *
+ * @param content - The chunk content to search in
+ * @param query - The search query
+ * @returns PhraseMatchResult with match details and boost
+ *
+ * @example
+ * const result = calculatePhraseMatch(
+ * "This explains the authentication flow for new users",
+ * "authentication flow for new users"
+ * );
+ * // result.exactMatch = true
+ * // result.boost = 0.5 (EXACT_PHRASE_BOOST)
+ *
+ * @example
+ * const result = calculatePhraseMatch(
+ * "User authentication and session flow",
+ * "authentication flow for users"
+ * );
+ * // result.exactMatch = false
+ * // result.coverage = 0.75 (3/4 tokens found)
+ * // result.boost = 0.1 (MEDIUM_COVERAGE_BOOST)
+ */
+ export declare function calculatePhraseMatch(content: string, query: string): PhraseMatchResult;
+ /**
+ * Quick check if content might contain the query phrase.
+ * Useful for early filtering before full phrase matching.
+ *
+ * @param content - The chunk content
+ * @param query - The search query
+ * @returns true if exact phrase is found
+ */
+ export declare function hasExactPhrase(content: string, query: string): boolean;
+ /**
+ * Calculate token coverage between content and query.
+ * Faster than full phrase matching when only coverage is needed.
+ *
+ * @param content - The chunk content
+ * @param query - The search query
+ * @returns Coverage ratio (0-1)
+ */
+ export declare function calculateTokenCoverage(content: string, query: string): number;
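
A sketch of how the boost tiers compose, using only the constants and signatures declared above (the root import is an assumption; internal paths may differ):

  import { calculatePhraseMatch, PHRASE_MATCH_CONSTANTS } from "raggrep";

  // Exact tier: the query is a case-insensitive substring of the content.
  const exact = calculatePhraseMatch(
    "This explains the authentication flow for new users",
    "authentication flow for new users"
  );
  // exact.exactMatch === true
  // exact.boost === PHRASE_MATCH_CONSTANTS.EXACT_PHRASE_BOOST    // 0.5
  // exact.isSignificant === true

  // High-coverage tier: no substring match, but every non-stop-word
  // query token ("phrase", "boost", "coverage") appears in the content.
  const partial = calculatePhraseMatch(
    "token coverage and phrase boost logic",
    "phrase boost coverage"
  );
  // partial.exactMatch === false
  // partial.coverage === 1
  // partial.boost === PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_BOOST // 0.2
  // partial.isSignificant === true
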
@@ -0,0 +1,4 @@
+ /**
+ * Tests for Phrase Matching Service
+ */
+ export {};
package/dist/index.js CHANGED
@@ -2873,6 +2873,30 @@ function extractVocabulary(literal) {
  const filtered = words.filter((w) => w.length > 1);
  return [...new Set(filtered)];
  }
+ function extractQueryVocabulary(query) {
+ if (!query || query.trim() === "") {
+ return [];
+ }
+ const vocabularySet = new Set;
+ const tokens = query.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
+ for (const token of tokens) {
+ if (QUERY_STOP_WORDS.has(token)) {
+ continue;
+ }
+ const looksLikeIdentifier = /[A-Z]/.test(token) || token.includes("_") || token.includes("-");
+ if (looksLikeIdentifier) {
+ const vocabWords = extractVocabulary(token);
+ for (const word of vocabWords) {
+ if (!QUERY_STOP_WORDS.has(word)) {
+ vocabularySet.add(word);
+ }
+ }
+ } else {
+ vocabularySet.add(token);
+ }
+ }
+ return Array.from(vocabularySet);
+ }
  function extractLiterals(chunk) {
  const literals = [];
  if (chunk.name) {
@@ -2887,7 +2911,7 @@ function extractLiterals(chunk) {
  }
  return literals;
  }
- var COMMON_ABBREVIATIONS, STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
+ var COMMON_ABBREVIATIONS, STOP_WORDS, QUERY_STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
  var init_literalExtractor = __esm(() => {
  COMMON_ABBREVIATIONS = new Set([
  "id",
@@ -2936,6 +2960,37 @@ var init_literalExtractor = __esm(() => {
  "as",
  "if"
  ]);
+ QUERY_STOP_WORDS = new Set([
+ ...STOP_WORDS,
+ "what",
+ "where",
+ "when",
+ "how",
+ "why",
+ "which",
+ "who",
+ "find",
+ "show",
+ "get",
+ "list",
+ "search",
+ "and",
+ "but",
+ "with",
+ "from",
+ "that",
+ "this",
+ "these",
+ "those",
+ "it",
+ "its",
+ "code",
+ "file",
+ "function",
+ "class",
+ "method",
+ "variable"
+ ]);
  CHUNK_TYPE_TO_LITERAL_TYPE = {
  class: "className",
  function: "functionName",
@@ -3649,6 +3704,113 @@ function extractJsonKeywords(obj) {
  // src/domain/services/configValidator.ts
  var init_configValidator = () => {};

+ // src/domain/services/phraseMatch.ts
+ function tokenizeForMatching(text, filterStopWords = true) {
+ if (!text || text.trim() === "") {
+ return [];
+ }
+ const tokens = text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
+ if (filterStopWords) {
+ return tokens.filter((t) => !PHRASE_STOP_WORDS.has(t));
+ }
+ return tokens;
+ }
+ function calculatePhraseMatch(content, query) {
+ if (!content || !query || query.trim().length < PHRASE_MATCH_CONSTANTS.MIN_QUERY_LENGTH) {
+ return {
+ exactMatch: false,
+ coverage: 0,
+ matchedTokenCount: 0,
+ totalTokenCount: 0,
+ boost: 0,
+ isSignificant: false
+ };
+ }
+ const contentLower = content.toLowerCase();
+ const queryLower = query.toLowerCase().trim();
+ const exactMatch = contentLower.includes(queryLower);
+ const queryTokens = tokenizeForMatching(query, true);
+ const matchedTokens = queryTokens.filter((token) => contentLower.includes(token));
+ const coverage = queryTokens.length > 0 ? matchedTokens.length / queryTokens.length : 0;
+ let boost = 0;
+ if (exactMatch) {
+ boost = PHRASE_MATCH_CONSTANTS.EXACT_PHRASE_BOOST;
+ } else if (coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD) {
+ boost = PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_BOOST;
+ } else if (coverage >= PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_THRESHOLD) {
+ boost = PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_BOOST;
+ }
+ const isSignificant = exactMatch || coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD;
+ return {
+ exactMatch,
+ coverage,
+ matchedTokenCount: matchedTokens.length,
+ totalTokenCount: queryTokens.length,
+ boost,
+ isSignificant
+ };
+ }
+ var PHRASE_MATCH_CONSTANTS, PHRASE_STOP_WORDS;
+ var init_phraseMatch = __esm(() => {
+ PHRASE_MATCH_CONSTANTS = {
+ EXACT_PHRASE_BOOST: 0.5,
+ HIGH_COVERAGE_BOOST: 0.2,
+ MEDIUM_COVERAGE_BOOST: 0.1,
+ HIGH_COVERAGE_THRESHOLD: 0.8,
+ MEDIUM_COVERAGE_THRESHOLD: 0.6,
+ MIN_QUERY_LENGTH: 3
+ };
+ PHRASE_STOP_WORDS = new Set([
+ "a",
+ "an",
+ "the",
+ "in",
+ "on",
+ "at",
+ "to",
+ "for",
+ "of",
+ "with",
+ "by",
+ "from",
+ "as",
+ "and",
+ "or",
+ "but",
+ "what",
+ "where",
+ "when",
+ "how",
+ "why",
+ "which",
+ "who",
+ "is",
+ "are",
+ "was",
+ "were",
+ "be",
+ "been",
+ "being",
+ "have",
+ "has",
+ "had",
+ "do",
+ "does",
+ "did",
+ "i",
+ "you",
+ "he",
+ "she",
+ "it",
+ "we",
+ "they",
+ "this",
+ "that",
+ "these",
+ "those"
+ ]);
+ });
+
  // src/domain/services/index.ts
  var init_services = __esm(() => {
  init_keywords();
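
tokenizeForMatching above lowercases, turns punctuation into spaces, splits on whitespace, drops single-character tokens, and (by default) filters PHRASE_STOP_WORDS. A quick trace of that pipeline, matching the code above:

  tokenizeForMatching("Where is the user's session validated?");
  // lowercase + strip punctuation → "where is the user s session validated"
  // drop 1-char tokens            → ["where", "is", "the", "user", "session", "validated"]
  // drop PHRASE_STOP_WORDS        → ["user", "session", "validated"]

  tokenizeForMatching("Where is the user's session validated?", false);
  // → ["where", "is", "the", "user", "session", "validated"]
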
@@ -3659,6 +3821,7 @@ var init_services = __esm(() => {
  init_lexicon2();
  init_introspection();
  init_configValidator();
+ init_phraseMatch();
  });

  // src/modules/language/typescript/parseCode.ts
@@ -4477,9 +4640,21 @@ class TypeScriptModule {
  const symbolicIndex = new SymbolicIndex(indexDir, this.id);
  const literalIndex = new LiteralIndex(indexDir, this.id);
  let literalMatchMap = new Map;
+ let vocabularyScoreMap = new Map;
  try {
  await literalIndex.initialize();
  literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
+ const queryVocabulary = extractQueryVocabulary(query);
+ if (queryVocabulary.length > 0) {
+ const vocabMatches = literalIndex.findByVocabularyWords(queryVocabulary);
+ for (const { entry, matchedWords } of vocabMatches) {
+ const vocabScore = matchedWords.length / queryVocabulary.length;
+ const existingScore = vocabularyScoreMap.get(entry.chunkId) || 0;
+ if (vocabScore > existingScore) {
+ vocabularyScoreMap.set(entry.chunkId, vocabScore);
+ }
+ }
+ }
  } catch {}
  let allFiles;
  try {
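
The new block above folds vocabulary matches into a per-chunk score, keeping the best ratio of matched words to query vocabulary. A self-contained sketch of that fold; the entry shapes are inferred from the loop, not a documented API:

  type VocabMatch = { entry: { chunkId: string }; matchedWords: string[] };

  function buildVocabularyScoreMap(matches: VocabMatch[], queryVocabulary: string[]): Map<string, number> {
    const scores = new Map<string, number>();
    for (const { entry, matchedWords } of matches) {
      const score = matchedWords.length / queryVocabulary.length;
      // Keep the maximum score seen for each chunk.
      if (score > (scores.get(entry.chunkId) ?? 0)) scores.set(entry.chunkId, score);
    }
    return scores;
  }

  buildVocabularyScoreMap(
    [
      { entry: { chunkId: "a" }, matchedWords: ["user", "session"] },
      { entry: { chunkId: "a" }, matchedWords: ["user", "session", "validated"] },
      { entry: { chunkId: "b" }, matchedWords: ["validated"] },
    ],
    ["user", "session", "validated"]
  );
  // → Map { "a" → 1, "b" → 0.333… }
  // Chunks with score > VOCAB_THRESHOLD (0.4) later bypass the minScore filter.
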
@@ -4559,18 +4734,20 @@ class TypeScriptModule {
  for (const { filepath, chunk, embedding } of allChunksData) {
  const semanticScore = cosineSimilarity(queryEmbedding, embedding);
  const bm25Score = bm25Scores.get(chunk.id) || 0;
+ const vocabScore = vocabularyScoreMap.get(chunk.id) || 0;
  const pathBoost = pathBoosts.get(filepath) || 0;
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
  const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
  const chunkTypeBoost = calculateChunkTypeBoost(chunk);
  const exportBoost = calculateExportBoost(chunk);
- const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
- const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score;
+ const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
+ const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore;
  const literalMatches = literalMatchMap.get(chunk.id) || [];
  const literalContribution = calculateLiteralContribution(literalMatches, true);
  const boostedScore = applyLiteralBoost(baseScore, literalMatches, true);
  const finalScore = boostedScore + additiveBoost;
  processedChunkIds.add(chunk.id);
- if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0) {
+ if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0 || vocabScore > VOCAB_THRESHOLD || phraseMatch.isSignificant) {
  results.push({
  filepath,
  chunk,
@@ -4579,6 +4756,9 @@ class TypeScriptModule {
  context: {
  semanticScore,
  bm25Score,
+ vocabScore,
+ phraseMatch: phraseMatch.exactMatch,
+ phraseCoverage: phraseMatch.coverage,
  pathBoost,
  fileTypeBoost,
  chunkTypeBoost,
@@ -4628,13 +4808,15 @@ class TypeScriptModule {
  semanticScore = cosineSimilarity(queryEmbedding, embedding);
  }
  const bm25Score = bm25Scores.get(chunkId) || 0;
+ const vocabScore = vocabularyScoreMap.get(chunkId) || 0;
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
  const pathBoost = pathBoosts.get(filepath) || 0;
  const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
  const chunkTypeBoost = calculateChunkTypeBoost(chunk);
  const exportBoost = calculateExportBoost(chunk);
- const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
+ const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
  const literalContribution = calculateLiteralContribution(chunkLiteralMatches, false);
- const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
+ const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
  const boostedScore = applyLiteralBoost(baseScore, chunkLiteralMatches, semanticScore > 0);
  const finalScore = boostedScore + additiveBoost;
  processedChunkIds.add(chunkId);
@@ -4646,6 +4828,9 @@ class TypeScriptModule {
  context: {
  semanticScore,
  bm25Score,
+ vocabScore,
+ phraseMatch: phraseMatch.exactMatch,
+ phraseCoverage: phraseMatch.coverage,
  pathBoost,
  fileTypeBoost,
  chunkTypeBoost,
@@ -4686,7 +4871,7 @@ class TypeScriptModule {
  return references;
  }
  }
- var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.7, BM25_WEIGHT = 0.3, TYPESCRIPT_EXTENSIONS, supportsFile;
+ var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15, VOCAB_THRESHOLD = 0.4, TYPESCRIPT_EXTENSIONS, supportsFile;
  var init_typescript = __esm(() => {
  init_embeddings();
  init_services();
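
The weight rebalance above (SEMANTIC_WEIGHT 0.7 → 0.6, BM25_WEIGHT 0.3 → 0.25) frees 0.15 for the new vocabulary signal, and phraseMatch.boost now rides on the additive boosts. A worked example with hypothetical scores, applying the formulas from the hunks above (literal boosting via applyLiteralBoost is left out for brevity):

  const SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15;

  const semanticScore = 0.4;
  const bm25Score = 0.2;
  const vocabScore = 1.0; // every query vocabulary word matched
  const baseScore =
    SEMANTIC_WEIGHT * semanticScore + // 0.24
    BM25_WEIGHT * bm25Score +         // 0.05
    VOCAB_WEIGHT * vocabScore;        // 0.15 → baseScore = 0.44

  const pathBoost = 0.05;
  const phraseBoost = 0.5; // EXACT_PHRASE_BOOST for an exact phrase hit
  const additiveBoost = pathBoost + phraseBoost; // other boosts assumed 0

  const finalScore = baseScore + additiveBoost;  // 0.99, well above DEFAULT_MIN_SCORE2 (0.15)
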
@@ -10761,6 +10946,7 @@ class MarkdownModule {
  for (const { filepath, chunk, embedding } of allChunksData) {
  const semanticScore = cosineSimilarity(queryEmbedding, embedding);
  const bm25Score = bm25Scores.get(chunk.id) || 0;
+ const phraseMatch = calculatePhraseMatch(chunk.content, query);
  let docBoost = 0;
  if (queryTerms.some((t) => [
  "docs",
@@ -10774,8 +10960,8 @@ class MarkdownModule {
  docBoost = 0.05;
  }
  const headingBoost = calculateHeadingLevelBoost(chunk);
- const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost;
- if (hybridScore >= minScore || bm25Score > 0.3) {
+ const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost + phraseMatch.boost;
+ if (hybridScore >= minScore || bm25Score > 0.3 || phraseMatch.isSignificant) {
  results.push({
  filepath,
  chunk,
@@ -10784,6 +10970,8 @@ class MarkdownModule {
  context: {
  semanticScore,
  bm25Score,
+ phraseMatch: phraseMatch.exactMatch,
+ phraseCoverage: phraseMatch.coverage,
  docBoost,
  headingBoost,
  headingLevel: chunk.metadata?.headingLevel
@@ -14397,4 +14585,4 @@ export {
  ConsoleLogger
  };

- //# debugId=CA60BFDCCC29D83C64756E2164756E21
+ //# debugId=EED23FCAC08F026464756E2164756E21