raggrep 0.13.2 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli/main.js +199 -11
- package/dist/cli/main.js.map +8 -7
- package/dist/domain/services/index.d.ts +2 -1
- package/dist/domain/services/literalExtractor.d.ts +20 -0
- package/dist/domain/services/phraseMatch.d.ts +99 -0
- package/dist/domain/services/phraseMatch.test.d.ts +4 -0
- package/dist/index.js +198 -10
- package/dist/index.js.map +8 -7
- package/dist/tests/simulation-phrase-matching.test.d.ts +14 -0
- package/dist/tests/simulation-vocabulary.test.d.ts +17 -0
- package/dist/tests/vocabulary-scoring.test.d.ts +16 -0
- package/package.json +1 -1
|
@@ -10,9 +10,10 @@ export { cosineSimilarity, euclideanDistance } from "./similarity";
|
|
|
10
10
|
export { detectQueryIntent, extractQueryTerms, calculateFileTypeBoost, isSourceCodeFile, isDocFile, isDataFile, IMPLEMENTATION_TERMS, DOCUMENTATION_TERMS, SOURCE_CODE_EXTENSIONS, DOC_EXTENSIONS, DATA_EXTENSIONS, type QueryIntent, } from "./queryIntent";
|
|
11
11
|
export { createLineBasedChunks, createSingleChunk, generateChunkId, DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, type TextChunk, type ChunkingOptions, } from "./chunking";
|
|
12
12
|
export { parseQueryLiterals } from "./queryLiteralParser";
|
|
13
|
-
export { extractLiterals, extractLiteralsWithReferences, extractVocabulary, } from "./literalExtractor";
|
|
13
|
+
export { extractLiterals, extractLiteralsWithReferences, extractVocabulary, extractQueryVocabulary, } from "./literalExtractor";
|
|
14
14
|
export { calculateLiteralMultiplier, calculateMaxMultiplier, calculateLiteralContribution, calculateVocabularyMatch, applyLiteralBoost, mergeWithLiteralBoost, LITERAL_SCORING_CONSTANTS, type LiteralScoreContribution, type VocabularyMatchResult, type MergeInput, type MergeOutput, } from "./literalScorer";
|
|
15
15
|
export { getSynonyms, expandQuery, DEFAULT_LEXICON, EXPANSION_WEIGHTS, DEFAULT_EXPANSION_OPTIONS, } from "./lexicon";
|
|
16
16
|
export { extractJsonPaths, extractJsonKeywords } from "./jsonPathExtractor";
|
|
17
17
|
export { introspectFile, findNearestReadme, introspectionToKeywords, detectScopeFromName, findProjectForFile, calculateIntrospectionBoost, type IntrospectFileOptions, } from "./introspection";
|
|
18
18
|
export { validateConfig, formatValidationIssues, type ValidationIssue, type ValidationResult, } from "./configValidator";
|
|
19
|
+
export { calculatePhraseMatch, hasExactPhrase, calculateTokenCoverage, tokenizeForMatching, PHRASE_MATCH_CONSTANTS, type PhraseMatchResult, } from "./phraseMatch";
|
|
@@ -22,6 +22,26 @@ import type { ExtractedLiteral } from "../entities/literal";
|
|
|
22
22
|
* @returns Array of unique vocabulary words (lowercase, length > 1)
|
|
23
23
|
*/
|
|
24
24
|
export declare function extractVocabulary(literal: string): string[];
|
|
25
|
+
/**
|
|
26
|
+
* Extract vocabulary words from a natural language query.
|
|
27
|
+
*
|
|
28
|
+
* Unlike extractVocabulary (for identifiers), this:
|
|
29
|
+
* 1. Tokenizes the query into words
|
|
30
|
+
* 2. Filters out stop words
|
|
31
|
+
* 3. Handles both natural language and embedded identifiers
|
|
32
|
+
* 4. Returns unique, normalized vocabulary words
|
|
33
|
+
*
|
|
34
|
+
* @param query - The search query string
|
|
35
|
+
* @returns Array of unique vocabulary words (lowercase, length > 1)
|
|
36
|
+
*
|
|
37
|
+
* @example
|
|
38
|
+
* extractQueryVocabulary("where is user session validated")
|
|
39
|
+
* // → ["user", "session", "validated"]
|
|
40
|
+
*
|
|
41
|
+
* extractQueryVocabulary("find the authenticateUser function")
|
|
42
|
+
* // → ["authenticate", "user"] (identifier decomposed)
|
|
43
|
+
*/
|
|
44
|
+
export declare function extractQueryVocabulary(query: string): string[];
|
|
25
45
|
/**
|
|
26
46
|
* Extract literals from a code chunk.
|
|
27
47
|
*
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phrase Matching Service
|
|
3
|
+
*
|
|
4
|
+
* Pure functions for content-based phrase matching. This enables
|
|
5
|
+
* exact phrase searches to find results even when semantic/BM25
|
|
6
|
+
* scores are low.
|
|
7
|
+
*
|
|
8
|
+
* @module domain/services/phraseMatch
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Result of phrase matching analysis.
|
|
12
|
+
*/
|
|
13
|
+
export interface PhraseMatchResult {
|
|
14
|
+
/** Whether the exact query phrase was found in content */
|
|
15
|
+
exactMatch: boolean;
|
|
16
|
+
/** Proportion of query tokens found in content (0-1) */
|
|
17
|
+
coverage: number;
|
|
18
|
+
/** Number of query tokens found in content */
|
|
19
|
+
matchedTokenCount: number;
|
|
20
|
+
/** Total number of tokens in query */
|
|
21
|
+
totalTokenCount: number;
|
|
22
|
+
/** Additive score boost based on match quality */
|
|
23
|
+
boost: number;
|
|
24
|
+
/** Whether this match is significant enough to bypass filters */
|
|
25
|
+
isSignificant: boolean;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Constants for phrase matching scoring.
|
|
29
|
+
*/
|
|
30
|
+
export declare const PHRASE_MATCH_CONSTANTS: {
|
|
31
|
+
/** Major boost for exact phrase match */
|
|
32
|
+
readonly EXACT_PHRASE_BOOST: 0.5;
|
|
33
|
+
/** Boost for high token coverage (80%+) */
|
|
34
|
+
readonly HIGH_COVERAGE_BOOST: 0.2;
|
|
35
|
+
/** Boost for medium token coverage (60%+) */
|
|
36
|
+
readonly MEDIUM_COVERAGE_BOOST: 0.1;
|
|
37
|
+
/** Coverage threshold for "high" classification */
|
|
38
|
+
readonly HIGH_COVERAGE_THRESHOLD: 0.8;
|
|
39
|
+
/** Coverage threshold for "medium" classification */
|
|
40
|
+
readonly MEDIUM_COVERAGE_THRESHOLD: 0.6;
|
|
41
|
+
/** Minimum trimmed query length, in characters; shorter queries skip phrase matching entirely (no exact-match or coverage scoring) */
|
|
42
|
+
readonly MIN_QUERY_LENGTH: 3;
|
|
43
|
+
};
|
|
44
|
+
/**
|
|
45
|
+
* Tokenize a string into words for matching.
|
|
46
|
+
* Normalizes to lowercase and filters out punctuation.
|
|
47
|
+
*
|
|
48
|
+
* @param text - Text to tokenize
|
|
49
|
+
* @param filterStopWords - Whether to filter out stop words
|
|
50
|
+
* @returns Array of normalized tokens
|
|
51
|
+
*/
|
|
52
|
+
export declare function tokenizeForMatching(text: string, filterStopWords?: boolean): string[];
|
|
53
|
+
/**
|
|
54
|
+
* Calculate phrase match score for content against a query.
|
|
55
|
+
*
|
|
56
|
+
* This function checks:
|
|
57
|
+
* 1. Exact phrase match (query substring in content)
|
|
58
|
+
* 2. Token coverage (what % of query tokens appear in content)
|
|
59
|
+
*
|
|
60
|
+
* @param content - The chunk content to search in
|
|
61
|
+
* @param query - The search query
|
|
62
|
+
* @returns PhraseMatchResult with match details and boost
|
|
63
|
+
*
|
|
64
|
+
* @example
|
|
65
|
+
* const result = calculatePhraseMatch(
|
|
66
|
+
* "This explains the authentication flow for new users",
|
|
67
|
+
* "authentication flow for new users"
|
|
68
|
+
* );
|
|
69
|
+
* // result.exactMatch = true
|
|
70
|
+
* // result.boost = 0.5 (EXACT_PHRASE_BOOST)
|
|
71
|
+
*
|
|
72
|
+
* @example
|
|
73
|
+
* const result = calculatePhraseMatch(
|
|
74
|
+
* "User authentication and session flow",
|
|
75
|
+
* "authentication flow for users"
|
|
76
|
+
* );
|
|
77
|
+
* // result.exactMatch = false
|
|
78
|
+
* // result.coverage ≈ 0.67 (2/3 tokens found; "for" is dropped as a stop word, "users" is absent)
|
|
79
|
+
* // result.boost = 0.1 (MEDIUM_COVERAGE_BOOST)
|
|
80
|
+
*/
|
|
81
|
+
export declare function calculatePhraseMatch(content: string, query: string): PhraseMatchResult;
|
|
82
|
+
/**
|
|
83
|
+
* Quick check if content might contain the query phrase.
|
|
84
|
+
* Useful for early filtering before full phrase matching.
|
|
85
|
+
*
|
|
86
|
+
* @param content - The chunk content
|
|
87
|
+
* @param query - The search query
|
|
88
|
+
* @returns true if exact phrase is found
|
|
89
|
+
*/
|
|
90
|
+
export declare function hasExactPhrase(content: string, query: string): boolean;
|
|
91
|
+
/**
|
|
92
|
+
* Calculate token coverage between content and query.
|
|
93
|
+
* Faster than full phrase matching when only coverage is needed.
|
|
94
|
+
*
|
|
95
|
+
* @param content - The chunk content
|
|
96
|
+
* @param query - The search query
|
|
97
|
+
* @returns Coverage ratio (0-1)
|
|
98
|
+
*/
|
|
99
|
+
export declare function calculateTokenCoverage(content: string, query: string): number;
|
package/dist/index.js
CHANGED
|
@@ -2873,6 +2873,30 @@ function extractVocabulary(literal) {
|
|
|
2873
2873
|
const filtered = words.filter((w) => w.length > 1);
|
|
2874
2874
|
return [...new Set(filtered)];
|
|
2875
2875
|
}
|
|
2876
|
+
/**
 * Extract vocabulary words from a natural language query.
 *
 * The query is split on whitespace and punctuation, tokens of one character
 * and query stop words are dropped, and tokens that look like identifiers
 * (contain an uppercase letter or an underscore) are decomposed through
 * extractVocabulary.
 *
 * Tokenization keeps the original casing on purpose: lowercasing the whole
 * query first would erase the only signal that a token such as
 * "authenticateUser" is a camelCase identifier, so it would never be
 * decomposed. Lowercasing is applied per token, for the stop-word lookup and
 * for the words that are returned.
 *
 * @param query - The search query string
 * @returns Unique vocabulary words (lowercase) in first-seen order; an empty
 *          array for an empty or whitespace-only query
 */
function extractQueryVocabulary(query) {
  if (!query || query.trim() === "") {
    return [];
  }
  const vocabularySet = new Set();
  // Punctuation (including "-") becomes a separator; "_" survives because it
  // is part of \w, so snake_case tokens stay intact for decomposition below.
  const tokens = query.replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 1);
  for (const token of tokens) {
    const lowered = token.toLowerCase();
    if (QUERY_STOP_WORDS.has(lowered)) {
      continue;
    }
    // Checked against the original-case token so camelCase/PascalCase is visible.
    const looksLikeIdentifier = /[A-Z]/.test(token) || token.includes("_");
    if (looksLikeIdentifier) {
      // NOTE(review): extractVocabulary is documented as returning lowercase
      // words; they are lowercased again here defensively.
      for (const word of extractVocabulary(token)) {
        const normalized = word.toLowerCase();
        if (!QUERY_STOP_WORDS.has(normalized)) {
          vocabularySet.add(normalized);
        }
      }
    } else {
      vocabularySet.add(lowered);
    }
  }
  return Array.from(vocabularySet);
}
|
|
2876
2900
|
function extractLiterals(chunk) {
|
|
2877
2901
|
const literals = [];
|
|
2878
2902
|
if (chunk.name) {
|
|
@@ -2887,7 +2911,7 @@ function extractLiterals(chunk) {
|
|
|
2887
2911
|
}
|
|
2888
2912
|
return literals;
|
|
2889
2913
|
}
|
|
2890
|
-
var COMMON_ABBREVIATIONS, STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
|
|
2914
|
+
var COMMON_ABBREVIATIONS, STOP_WORDS, QUERY_STOP_WORDS, CHUNK_TYPE_TO_LITERAL_TYPE;
|
|
2891
2915
|
var init_literalExtractor = __esm(() => {
|
|
2892
2916
|
COMMON_ABBREVIATIONS = new Set([
|
|
2893
2917
|
"id",
|
|
@@ -2936,6 +2960,37 @@ var init_literalExtractor = __esm(() => {
|
|
|
2936
2960
|
"as",
|
|
2937
2961
|
"if"
|
|
2938
2962
|
]);
|
|
2963
|
+
QUERY_STOP_WORDS = new Set([
|
|
2964
|
+
...STOP_WORDS,
|
|
2965
|
+
"what",
|
|
2966
|
+
"where",
|
|
2967
|
+
"when",
|
|
2968
|
+
"how",
|
|
2969
|
+
"why",
|
|
2970
|
+
"which",
|
|
2971
|
+
"who",
|
|
2972
|
+
"find",
|
|
2973
|
+
"show",
|
|
2974
|
+
"get",
|
|
2975
|
+
"list",
|
|
2976
|
+
"search",
|
|
2977
|
+
"and",
|
|
2978
|
+
"but",
|
|
2979
|
+
"with",
|
|
2980
|
+
"from",
|
|
2981
|
+
"that",
|
|
2982
|
+
"this",
|
|
2983
|
+
"these",
|
|
2984
|
+
"those",
|
|
2985
|
+
"it",
|
|
2986
|
+
"its",
|
|
2987
|
+
"code",
|
|
2988
|
+
"file",
|
|
2989
|
+
"function",
|
|
2990
|
+
"class",
|
|
2991
|
+
"method",
|
|
2992
|
+
"variable"
|
|
2993
|
+
]);
|
|
2939
2994
|
CHUNK_TYPE_TO_LITERAL_TYPE = {
|
|
2940
2995
|
class: "className",
|
|
2941
2996
|
function: "functionName",
|
|
@@ -3649,6 +3704,113 @@ function extractJsonKeywords(obj) {
|
|
|
3649
3704
|
// src/domain/services/configValidator.ts
|
|
3650
3705
|
var init_configValidator = () => {};
|
|
3651
3706
|
|
|
3707
|
+
// src/domain/services/phraseMatch.ts
|
|
3708
|
+
/**
 * Break text into lowercase word tokens for phrase matching.
 *
 * Every character that is neither a word character (\w) nor whitespace is
 * treated as a separator, and tokens of a single character are discarded.
 *
 * @param text - Text to tokenize
 * @param filterStopWords - When truthy (the default), tokens present in
 *        PHRASE_STOP_WORDS are removed as well
 * @returns Tokens in their original order; empty for blank input
 */
function tokenizeForMatching(text, filterStopWords = true) {
  const isBlank = !text || text.trim() === "";
  if (isBlank) {
    return [];
  }
  const normalized = text.toLowerCase().replace(/[^\w\s]/g, " ");
  const words = [];
  for (const piece of normalized.split(/\s+/)) {
    // Skips the empty strings produced by leading/trailing separators too.
    if (piece.length <= 1) {
      continue;
    }
    if (filterStopWords && PHRASE_STOP_WORDS.has(piece)) {
      continue;
    }
    words.push(piece);
  }
  return words;
}
|
|
3718
|
+
/**
 * Score how well chunk content matches a query phrase.
 *
 * Two signals are computed, both case-insensitively:
 *  - exact match: the trimmed query occurs as a substring of the content;
 *  - coverage: the share of stop-word-filtered query tokens that occur as
 *    substrings of the content.
 *
 * The boost is EXACT_PHRASE_BOOST for an exact match, otherwise the high or
 * medium coverage boost when the matching threshold is reached, otherwise 0.
 * A match is significant when it is exact or reaches the high threshold.
 *
 * @param content - The chunk content to search in
 * @param query - The search query
 * @returns Match details; all-zero/false when content or query is empty or
 *          the trimmed query is shorter than MIN_QUERY_LENGTH
 */
function calculatePhraseMatch(content, query) {
  // Keep this evaluation order: the emptiness checks must short-circuit first.
  const unusable = !content || !query || query.trim().length < PHRASE_MATCH_CONSTANTS.MIN_QUERY_LENGTH;
  if (unusable) {
    return {
      exactMatch: false,
      coverage: 0,
      matchedTokenCount: 0,
      totalTokenCount: 0,
      boost: 0,
      isSignificant: false
    };
  }

  const haystack = content.toLowerCase();
  const needle = query.toLowerCase().trim();
  const exactMatch = haystack.includes(needle);

  const queryTokens = tokenizeForMatching(query, true);
  const totalTokenCount = queryTokens.length;
  let matchedTokenCount = 0;
  for (const token of queryTokens) {
    if (haystack.includes(token)) {
      matchedTokenCount += 1;
    }
  }
  const coverage = totalTokenCount > 0 ? matchedTokenCount / totalTokenCount : 0;

  const reachesHigh = coverage >= PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_THRESHOLD;
  const reachesMedium = coverage >= PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_THRESHOLD;
  let boost;
  if (exactMatch) {
    boost = PHRASE_MATCH_CONSTANTS.EXACT_PHRASE_BOOST;
  } else if (reachesHigh) {
    boost = PHRASE_MATCH_CONSTANTS.HIGH_COVERAGE_BOOST;
  } else if (reachesMedium) {
    boost = PHRASE_MATCH_CONSTANTS.MEDIUM_COVERAGE_BOOST;
  } else {
    boost = 0;
  }

  return {
    exactMatch,
    coverage,
    matchedTokenCount,
    totalTokenCount,
    boost,
    isSignificant: exactMatch || reachesHigh
  };
}
|
|
3753
|
+
// Module state for src/domain/services/phraseMatch.ts; populated by init_phraseMatch.
var PHRASE_MATCH_CONSTANTS, PHRASE_STOP_WORDS;
var init_phraseMatch = __esm(() => {
  // Scoring values read by calculatePhraseMatch.
  PHRASE_MATCH_CONSTANTS = {
    // Additive boosts, strongest first.
    EXACT_PHRASE_BOOST: 0.5,
    HIGH_COVERAGE_BOOST: 0.2,
    MEDIUM_COVERAGE_BOOST: 0.1,
    // Token-coverage ratios at which the high / medium boost applies.
    HIGH_COVERAGE_THRESHOLD: 0.8,
    MEDIUM_COVERAGE_THRESHOLD: 0.6,
    // Compared against the trimmed query's character length.
    MIN_QUERY_LENGTH: 3
  };
  // Words removed by tokenizeForMatching when stop-word filtering is on.
  // The one-letter entries ("a", "i") are already dropped by the tokenizer's
  // length filter before this set is consulted.
  PHRASE_STOP_WORDS = new Set([
    // articles
    "a", "an", "the",
    // prepositions
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    // conjunctions
    "and", "or", "but",
    // question words
    "what", "where", "when", "how", "why", "which", "who",
    // auxiliary verbs
    "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did",
    // pronouns
    "i", "you", "he", "she", "it", "we", "they",
    // demonstratives
    "this", "that", "these", "those"
  ]);
});
|
|
3813
|
+
|
|
3652
3814
|
// src/domain/services/index.ts
|
|
3653
3815
|
var init_services = __esm(() => {
|
|
3654
3816
|
init_keywords();
|
|
@@ -3659,6 +3821,7 @@ var init_services = __esm(() => {
|
|
|
3659
3821
|
init_lexicon2();
|
|
3660
3822
|
init_introspection();
|
|
3661
3823
|
init_configValidator();
|
|
3824
|
+
init_phraseMatch();
|
|
3662
3825
|
});
|
|
3663
3826
|
|
|
3664
3827
|
// src/modules/language/typescript/parseCode.ts
|
|
@@ -4477,9 +4640,21 @@ class TypeScriptModule {
|
|
|
4477
4640
|
const symbolicIndex = new SymbolicIndex(indexDir, this.id);
|
|
4478
4641
|
const literalIndex = new LiteralIndex(indexDir, this.id);
|
|
4479
4642
|
let literalMatchMap = new Map;
|
|
4643
|
+
let vocabularyScoreMap = new Map;
|
|
4480
4644
|
try {
|
|
4481
4645
|
await literalIndex.initialize();
|
|
4482
4646
|
literalMatchMap = literalIndex.buildMatchMap(queryLiterals);
|
|
4647
|
+
const queryVocabulary = extractQueryVocabulary(query);
|
|
4648
|
+
if (queryVocabulary.length > 0) {
|
|
4649
|
+
const vocabMatches = literalIndex.findByVocabularyWords(queryVocabulary);
|
|
4650
|
+
for (const { entry, matchedWords } of vocabMatches) {
|
|
4651
|
+
const vocabScore = matchedWords.length / queryVocabulary.length;
|
|
4652
|
+
const existingScore = vocabularyScoreMap.get(entry.chunkId) || 0;
|
|
4653
|
+
if (vocabScore > existingScore) {
|
|
4654
|
+
vocabularyScoreMap.set(entry.chunkId, vocabScore);
|
|
4655
|
+
}
|
|
4656
|
+
}
|
|
4657
|
+
}
|
|
4483
4658
|
} catch {}
|
|
4484
4659
|
let allFiles;
|
|
4485
4660
|
try {
|
|
@@ -4559,18 +4734,20 @@ class TypeScriptModule {
|
|
|
4559
4734
|
for (const { filepath, chunk, embedding } of allChunksData) {
|
|
4560
4735
|
const semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
4561
4736
|
const bm25Score = bm25Scores.get(chunk.id) || 0;
|
|
4737
|
+
const vocabScore = vocabularyScoreMap.get(chunk.id) || 0;
|
|
4562
4738
|
const pathBoost = pathBoosts.get(filepath) || 0;
|
|
4739
|
+
const phraseMatch = calculatePhraseMatch(chunk.content, query);
|
|
4563
4740
|
const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
|
|
4564
4741
|
const chunkTypeBoost = calculateChunkTypeBoost(chunk);
|
|
4565
4742
|
const exportBoost = calculateExportBoost(chunk);
|
|
4566
|
-
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
|
|
4567
|
-
const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score;
|
|
4743
|
+
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
|
|
4744
|
+
const baseScore = SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore;
|
|
4568
4745
|
const literalMatches = literalMatchMap.get(chunk.id) || [];
|
|
4569
4746
|
const literalContribution = calculateLiteralContribution(literalMatches, true);
|
|
4570
4747
|
const boostedScore = applyLiteralBoost(baseScore, literalMatches, true);
|
|
4571
4748
|
const finalScore = boostedScore + additiveBoost;
|
|
4572
4749
|
processedChunkIds.add(chunk.id);
|
|
4573
|
-
if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0) {
|
|
4750
|
+
if (finalScore >= minScore || bm25Score > 0.3 || literalMatches.length > 0 || vocabScore > VOCAB_THRESHOLD || phraseMatch.isSignificant) {
|
|
4574
4751
|
results.push({
|
|
4575
4752
|
filepath,
|
|
4576
4753
|
chunk,
|
|
@@ -4579,6 +4756,9 @@ class TypeScriptModule {
|
|
|
4579
4756
|
context: {
|
|
4580
4757
|
semanticScore,
|
|
4581
4758
|
bm25Score,
|
|
4759
|
+
vocabScore,
|
|
4760
|
+
phraseMatch: phraseMatch.exactMatch,
|
|
4761
|
+
phraseCoverage: phraseMatch.coverage,
|
|
4582
4762
|
pathBoost,
|
|
4583
4763
|
fileTypeBoost,
|
|
4584
4764
|
chunkTypeBoost,
|
|
@@ -4628,13 +4808,15 @@ class TypeScriptModule {
|
|
|
4628
4808
|
semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
4629
4809
|
}
|
|
4630
4810
|
const bm25Score = bm25Scores.get(chunkId) || 0;
|
|
4811
|
+
const vocabScore = vocabularyScoreMap.get(chunkId) || 0;
|
|
4812
|
+
const phraseMatch = calculatePhraseMatch(chunk.content, query);
|
|
4631
4813
|
const pathBoost = pathBoosts.get(filepath) || 0;
|
|
4632
4814
|
const fileTypeBoost = calculateFileTypeBoost(filepath, queryTerms);
|
|
4633
4815
|
const chunkTypeBoost = calculateChunkTypeBoost(chunk);
|
|
4634
4816
|
const exportBoost = calculateExportBoost(chunk);
|
|
4635
|
-
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost;
|
|
4817
|
+
const additiveBoost = pathBoost + fileTypeBoost + chunkTypeBoost + exportBoost + phraseMatch.boost;
|
|
4636
4818
|
const literalContribution = calculateLiteralContribution(chunkLiteralMatches, false);
|
|
4637
|
-
const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
|
|
4819
|
+
const baseScore = semanticScore > 0 ? SEMANTIC_WEIGHT * semanticScore + BM25_WEIGHT * bm25Score + VOCAB_WEIGHT * vocabScore : LITERAL_SCORING_CONSTANTS.BASE_SCORE;
|
|
4638
4820
|
const boostedScore = applyLiteralBoost(baseScore, chunkLiteralMatches, semanticScore > 0);
|
|
4639
4821
|
const finalScore = boostedScore + additiveBoost;
|
|
4640
4822
|
processedChunkIds.add(chunkId);
|
|
@@ -4646,6 +4828,9 @@ class TypeScriptModule {
|
|
|
4646
4828
|
context: {
|
|
4647
4829
|
semanticScore,
|
|
4648
4830
|
bm25Score,
|
|
4831
|
+
vocabScore,
|
|
4832
|
+
phraseMatch: phraseMatch.exactMatch,
|
|
4833
|
+
phraseCoverage: phraseMatch.coverage,
|
|
4649
4834
|
pathBoost,
|
|
4650
4835
|
fileTypeBoost,
|
|
4651
4836
|
chunkTypeBoost,
|
|
@@ -4686,7 +4871,7 @@ class TypeScriptModule {
|
|
|
4686
4871
|
return references;
|
|
4687
4872
|
}
|
|
4688
4873
|
}
|
|
4689
|
-
var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.
|
|
4874
|
+
var DEFAULT_MIN_SCORE2 = 0.15, DEFAULT_TOP_K2 = 10, SEMANTIC_WEIGHT = 0.6, BM25_WEIGHT = 0.25, VOCAB_WEIGHT = 0.15, VOCAB_THRESHOLD = 0.4, TYPESCRIPT_EXTENSIONS, supportsFile;
|
|
4690
4875
|
var init_typescript = __esm(() => {
|
|
4691
4876
|
init_embeddings();
|
|
4692
4877
|
init_services();
|
|
@@ -10761,6 +10946,7 @@ class MarkdownModule {
|
|
|
10761
10946
|
for (const { filepath, chunk, embedding } of allChunksData) {
|
|
10762
10947
|
const semanticScore = cosineSimilarity(queryEmbedding, embedding);
|
|
10763
10948
|
const bm25Score = bm25Scores.get(chunk.id) || 0;
|
|
10949
|
+
const phraseMatch = calculatePhraseMatch(chunk.content, query);
|
|
10764
10950
|
let docBoost = 0;
|
|
10765
10951
|
if (queryTerms.some((t) => [
|
|
10766
10952
|
"docs",
|
|
@@ -10774,8 +10960,8 @@ class MarkdownModule {
|
|
|
10774
10960
|
docBoost = 0.05;
|
|
10775
10961
|
}
|
|
10776
10962
|
const headingBoost = calculateHeadingLevelBoost(chunk);
|
|
10777
|
-
const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost;
|
|
10778
|
-
if (hybridScore >= minScore || bm25Score > 0.3) {
|
|
10963
|
+
const hybridScore = SEMANTIC_WEIGHT5 * semanticScore + BM25_WEIGHT6 * bm25Score + docBoost + headingBoost + phraseMatch.boost;
|
|
10964
|
+
if (hybridScore >= minScore || bm25Score > 0.3 || phraseMatch.isSignificant) {
|
|
10779
10965
|
results.push({
|
|
10780
10966
|
filepath,
|
|
10781
10967
|
chunk,
|
|
@@ -10784,6 +10970,8 @@ class MarkdownModule {
|
|
|
10784
10970
|
context: {
|
|
10785
10971
|
semanticScore,
|
|
10786
10972
|
bm25Score,
|
|
10973
|
+
phraseMatch: phraseMatch.exactMatch,
|
|
10974
|
+
phraseCoverage: phraseMatch.coverage,
|
|
10787
10975
|
docBoost,
|
|
10788
10976
|
headingBoost,
|
|
10789
10977
|
headingLevel: chunk.metadata?.headingLevel
|
|
@@ -14397,4 +14585,4 @@ export {
|
|
|
14397
14585
|
ConsoleLogger
|
|
14398
14586
|
};
|
|
14399
14587
|
|
|
14400
|
-
//# debugId=
|
|
14588
|
+
//# debugId=EED23FCAC08F026464756E2164756E21
|