cto-ai-cli 8.0.1 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli/index.js +103 -28
- package/dist/engine/index.d.ts +11 -1
- package/dist/engine/index.js +315 -35
- package/dist/mcp/index.js +103 -28
- package/package.json +1 -1
package/README.md
CHANGED

```diff
@@ -38,14 +38,14 @@ This runs a self-contained presentation that shows: project analysis, semantic m
 
 ## Benchmark Results
 
-**Eval Harness v8.
-
-| Metric |
-
-| **Must-have recall** | **100%**
-| **Precision** | **
-| **F1** | **
-| **Noise rate** |
+**Eval Harness v8.1** — 20-file Java enterprise project, 4 tasks with expert-labeled ground truth:
+
+| Metric | v8.0 | **v8.1** |
+|---|---|---|
+| **Must-have recall** | 100% | **100%** |
+| **Precision** | 38% | **60%** (+22pp) |
+| **F1** | 55% | **74%** (+19pp) |
+| **Noise rate** | 11.3% | **5.7%** (-5.6pp) |
 
 **Real production repos** (Java monoliths):
 
```

(The removed lines above are shown as the diff viewer captured them; their tails were truncated by the intraline highlighter.)
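For reference, F1 is the harmonic mean of precision and recall, F1 = 2·P·R / (P + R). With the v8.1 numbers (P = 0.60, R = 1.00) this gives ≈ 0.75, consistent with the reported **74%** up to rounding of the underlying per-task averages.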
package/dist/cli/index.js
CHANGED

```diff
@@ -5674,61 +5674,88 @@ var init_query_intent = __esm({
 
 // src/engine/embeddings.ts
 function buildTfIdfEmbeddingIndex(index) {
-  const
+  const origTerms = [...index.idf.keys()];
+  const allTerms = [];
+  const termSet = /* @__PURE__ */ new Set();
+  for (const t of origTerms) {
+    if (!termSet.has(t)) {
+      allTerms.push(t);
+      termSet.add(t);
+    }
+  }
+  for (const t of origTerms) {
+    for (const s of getStemVariants(t)) {
+      const stemKey = `\xA7${s}`;
+      if (!termSet.has(stemKey)) {
+        allTerms.push(stemKey);
+        termSet.add(stemKey);
+      }
+    }
+  }
   const termToIdx = new Map(allTerms.map((t, i) => [t, i]));
   const dimensions = allTerms.length;
   const docVectors = /* @__PURE__ */ new Map();
-  const
+  const docNonZero = /* @__PURE__ */ new Map();
   for (const [filePath, doc] of index.documents) {
     const vec = new Float32Array(dimensions);
-
+    const nonZero = [];
     for (const [term, tf] of doc.terms) {
-      const idx = termToIdx.get(term);
-      if (idx === void 0) continue;
       const idf = index.idf.get(term) ?? 0;
       const weight = tf * idf;
-
-
+      const idx = termToIdx.get(term);
+      if (idx !== void 0) {
+        vec[idx] += weight;
+        nonZero.push(idx);
+      }
+      for (const s of getStemVariants(term)) {
+        const stemIdx = termToIdx.get(`\xA7${s}`);
+        if (stemIdx !== void 0) {
+          vec[stemIdx] += weight * 0.5;
+          nonZero.push(stemIdx);
+        }
+      }
     }
+    let norm = 0;
+    for (const i of nonZero) norm += vec[i] * vec[i];
     norm = Math.sqrt(norm);
     if (norm > 0) {
-      for (
-        vec[i] /= norm;
-      }
+      for (const i of nonZero) vec[i] /= norm;
     }
     docVectors.set(filePath, vec);
-
+    docNonZero.set(filePath, [...new Set(nonZero)]);
   }
   function queryFn(text, topK) {
    const queryTerms = tokenizeForEmbedding(text);
-    const
+    const expandedCounts = /* @__PURE__ */ new Map();
    for (const t of queryTerms) {
-
+      expandedCounts.set(t, (expandedCounts.get(t) ?? 0) + 1);
+      for (const s of getStemVariants(t)) {
+        const stemKey = `\xA7${s}`;
+        expandedCounts.set(stemKey, (expandedCounts.get(stemKey) ?? 0) + 0.5);
+      }
    }
    const queryVec = new Float32Array(dimensions);
-
-    for (const [term, count] of
+    const queryNonZero = [];
+    for (const [term, count] of expandedCounts) {
      const idx = termToIdx.get(term);
      if (idx === void 0) continue;
-      const
-      const
-      queryVec[idx] =
-
+      const rawTerm = term.startsWith("\xA7") ? term.slice(1) : term;
+      const idf = index.idf.get(rawTerm) ?? 1;
+      queryVec[idx] = count * idf;
+      queryNonZero.push(idx);
    }
+    let queryNorm = 0;
+    for (const i of queryNonZero) queryNorm += queryVec[i] * queryVec[i];
    queryNorm = Math.sqrt(queryNorm);
    if (queryNorm > 0) {
-      for (
-        queryVec[i] /= queryNorm;
-      }
+      for (const i of queryNonZero) queryVec[i] /= queryNorm;
    }
    const results = [];
+    const queryIdxSet = new Set(queryNonZero);
    for (const [filePath, docVec] of docVectors) {
      let dot = 0;
-      for (const
-
-      if (idx !== void 0) {
-        dot += queryVec[idx] * docVec[idx];
-      }
+      for (const i of queryNonZero) {
+        if (docVec[i] !== 0) dot += queryVec[i] * docVec[i];
      }
      if (dot > 0) {
        results.push({ filePath, score: dot });
```
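Summary of the hunk above: the index now adds one extra dimension per stem variant, keyed as `\xA7<stem>` (`\xA7` is `§`, which the tokenizer can never emit, so stem dimensions cannot collide with literal terms). Documents and queries write stem weight at half the exact-term weight, and both sides record their nonzero indices so normalization and scoring only touch those entries instead of the full `dimensions`-length vector. A minimal sketch of the resulting sparse scoring step, with illustrative names rather than the package's API:

```ts
// Minimal sketch, not the package API: dot product over only the query's
// nonzero indices. Both vectors are L2-normalized beforehand, so this
// value is the cosine similarity between query and document.
function sparseDot(q: Float32Array, d: Float32Array, qNonZero: number[]): number {
  let dot = 0;
  for (const i of qNonZero) {
    if (d[i] !== 0) dot += q[i] * d[i]; // skip zero entries of the document vector
  }
  return dot;
}
```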
```diff
@@ -5757,6 +5784,54 @@ function reciprocalRankFusion(bm25Results, embeddingResults, k = 60, bm25Weight
   }
   return [...scores.entries()].map(([filePath, score]) => ({ filePath, score })).sort((a, b) => b.score - a.score);
 }
+function stem2(word) {
+  if (word.length < 4) return word;
+  const rules = [
+    ["ization", 4],
+    ["isation", 4],
+    ["ation", 4],
+    ["ition", 4],
+    ["tion", 3],
+    ["sion", 3],
+    ["ment", 3],
+    ["ness", 3],
+    ["able", 3],
+    ["ible", 3],
+    ["ive", 3],
+    ["ing", 3],
+    ["ity", 3],
+    ["ous", 3],
+    ["ful", 3],
+    ["ate", 3],
+    ["ize", 3],
+    ["ise", 3],
+    ["ure", 3],
+    ["ent", 3],
+    ["ant", 3],
+    ["al", 3],
+    ["er", 3],
+    ["or", 3],
+    ["ed", 3],
+    ["ly", 3],
+    ["es", 3],
+    ["s", 3]
+  ];
+  for (const [suffix, minRemaining] of rules) {
+    if (word.endsWith(suffix) && word.length - suffix.length >= minRemaining) {
+      return word.slice(0, word.length - suffix.length);
+    }
+  }
+  return word;
+}
+function getStemVariants(word) {
+  const variants = /* @__PURE__ */ new Set();
+  variants.add(word);
+  const stripped = stem2(word);
+  if (stripped !== word && stripped.length >= 3) variants.add(stripped);
+  if (word.length >= 6) variants.add(word.slice(0, 5));
+  if (word.endsWith("e") && word.length >= 5) variants.add(word.slice(0, -1));
+  return [...variants];
+}
 function tokenizeForEmbedding(text) {
   return text.toLowerCase().replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]/g, " ").split(/\s+/).filter((t) => t.length >= 2);
 }
```
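The stemmer is deliberately lightweight: a single suffix pass (longest rule first) plus prefix and trailing-e variants, rather than a full Porter stemmer. A quick check of how it links morphological relatives, assuming you import the functions the new `index.d.ts` declares (the import specifier below is a guess):

```ts
// Sketch: the import specifier is an assumption, not documented by the package.
import { stem, getStemVariants } from "cto-ai-cli/dist/engine/index.js";

stem("invalidation");              // "invalid"  ("ation" rule: 7 chars remain >= 4)
stem("invalidate");                // "invalid"  ("ate" rule: 7 chars remain >= 3)
getStemVariants("authentication"); // ["authentication", "authentic", "authe"]
// "invalidation" and "invalidate" share the variant "invalid", so both map
// onto the same "\xA7invalid" dimension of the TF-IDF embedding index above.
```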
```diff
@@ -6124,7 +6199,7 @@ async function runContextPipeline(input) {
   const rerankerApproved = new Set(rerankResult.files.map((rf) => rf.filePath));
   const rerankedMatches = boostedMatches.map((m) => ({
     filePath: m.filePath,
-    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score,
+    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score * 0.25,
     matchedTerms: [...m.matchedTerms]
   }));
   for (const m of rerankedMatches) {
```
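This one-line change tightens the reranker's influence: files the reranker did not approve previously kept their full heuristic score, and are now demoted by 4×. Worked numbers with hypothetical scores: an approved file at 8.0 still becomes 12.0 (8.0 × 1.5, unchanged from v8.0), while an unapproved file at 8.0 drops to 2.0 (8.0 × 0.25) instead of staying at 8.0, so an unapproved candidate now needs roughly a 6× raw-score lead to outrank an approved one. This demotion is presumably a main driver of the precision gain in the README table.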
package/dist/engine/index.d.ts
CHANGED

```diff
@@ -1688,6 +1688,16 @@ declare function reciprocalRankFusion(bm25Results: {
     filePath: string;
     score: number;
 }[];
+/**
+ * Simple Porter-like stemmer for code/English terms.
+ * Catches: invalidation↔invalidate, authentication↔authenticate, processing↔process
+ */
+declare function stem(word: string): string;
+/**
+ * Generate stem variants for a term.
+ * Returns original + suffix-stripped + 5-char prefix + trailing-e stripped.
+ */
+declare function getStemVariants(word: string): string[];
 /**
  * Check if ONNX Runtime is available for neural embeddings.
  */
@@ -1829,4 +1839,4 @@ interface AuditOptions {
 }
 declare function auditProject(projectPath: string, filePaths: string[], options?: AuditOptions): Promise<AuditResult>;
 
-export { type ActionType, type ArchLayer, type AssignmentResult, type CallGraphResult, type ChunkKind, type ChunkRetrievalResult, type CoChangeEntry, type CoChangeMatrix, type CodeChunk, type ContextPipelineInput, type ContextPipelineResult, type CorpusEmbeddings, CtoError, type CtoErrorCode, type DocumentVector, type EmbeddingIndex, type EmbeddingResult, type Experiment, type ExperimentConclusion, type ExperimentGroup, type FileOpenEvent, type FilteredFile, type GroupMetrics, type HopDetail, type ImportSpec, type IndexCacheStats, type LearnerBoost, type LearnerBoostInput, type LearnerModel, type LogEntry, type LogLevel, type Logger, type MethodCall, type MethodDefinition, type MultiHopConfig, type MultiHopResult, type MultiRepoResult, type PatternStats, type QueryIntent, type RerankInput, type RerankResult, type RerankedFile, type SecretFinding, type SecretType, type SelectionInput, type SemanticExpansion, type SemanticMatch, type SemanticScore, type SiblingMatch, type SiblingRepo, type SignalWeight, type SignificanceResult, type StructuralTokens, type SupportedLanguage, type SynonymExpansion, type TelemetryModel, type TelemetrySession, type TfIdfIndex, type TunedWeights, type WeightTunerModel, analyzeProject, assignGroup, attributeToSignal, auditProject, augmentContentWithStructure, bfsBidirectional, boostByCallGraph, boostByGitCoChange, boostByImports, boostByLayer, boostByPath, buildAdjacencyList, buildCallGraph, buildCoChangeMatrix, buildCorpusEmbeddings, buildIndex, buildIndexCached, buildNeuralEmbeddingIndex, buildProjectGraph, buildTfIdfEmbeddingIndex, buildWeightedQuery, calculateCoverage, chunkFile, classifyFileKind, countTokensChars4, countTokensTiktoken, createExperiment, createFreshModel, createLogger, createProject, detectLanguage, detectStack, discoverSiblingRepos, embedQuery, reciprocalRankFusion as embeddingRRF, estimateComplexity, estimateFileTokens, estimateTokens, expandLayers, expandQuery, expandQueryWithPMI, expandTerm, extractPattern, extractStructuralTokens, freeEncoder, getActiveExperiment, getCacheInfo, getConcludedExperiments, getExpansionDetails, getGitRecency, getLearnerBoosts, getLearnerStats, getOptimizedWeights, getPruneLevelForRisk, getStructuralSummary, getSynonymStats, getTelemetryBoosts, invalidateCache, isCtoError, isOnnxAvailable, loadExperiments, loadLearner, loadTelemetry, loadWeightTuner, multiHopQuery, optimizeBudget, parseAllPolyglotImports, parseImports, parseQueryIntent, parseSiblingPaths, pruneFile, pruneFiles, query, queryByEmbedding, querySiblingRepos, reciprocalRankFusion$1 as reciprocalRankFusion, recordFeedback, recordFileOpen, recordOutcome, recordSelection, recordSession, renderExperimentSummary, renderFileChunks, renderMultiRepoSummary, renderTelemetrySummary, renderWeightStatus, rerank, retrieveChunks, runContextPipeline, sanitizeContent, saveExperiments, saveLearner, saveTelemetry, saveWeightTuner, scanContentForSecrets, scanFileForSecrets, scanProjectForSecrets, scoreAllFiles, scoreChunks, scoreFile, selectContext, setJsonLogging, setLogLevel, similarity, testSignificance, tokenize, walkProject, wrapError };
+export { type ActionType, type ArchLayer, type AssignmentResult, type CallGraphResult, type ChunkKind, type ChunkRetrievalResult, type CoChangeEntry, type CoChangeMatrix, type CodeChunk, type ContextPipelineInput, type ContextPipelineResult, type CorpusEmbeddings, CtoError, type CtoErrorCode, type DocumentVector, type EmbeddingIndex, type EmbeddingResult, type Experiment, type ExperimentConclusion, type ExperimentGroup, type FileOpenEvent, type FilteredFile, type GroupMetrics, type HopDetail, type ImportSpec, type IndexCacheStats, type LearnerBoost, type LearnerBoostInput, type LearnerModel, type LogEntry, type LogLevel, type Logger, type MethodCall, type MethodDefinition, type MultiHopConfig, type MultiHopResult, type MultiRepoResult, type PatternStats, type QueryIntent, type RerankInput, type RerankResult, type RerankedFile, type SecretFinding, type SecretType, type SelectionInput, type SemanticExpansion, type SemanticMatch, type SemanticScore, type SiblingMatch, type SiblingRepo, type SignalWeight, type SignificanceResult, type StructuralTokens, type SupportedLanguage, type SynonymExpansion, type TelemetryModel, type TelemetrySession, type TfIdfIndex, type TunedWeights, type WeightTunerModel, analyzeProject, assignGroup, attributeToSignal, auditProject, augmentContentWithStructure, bfsBidirectional, boostByCallGraph, boostByGitCoChange, boostByImports, boostByLayer, boostByPath, buildAdjacencyList, buildCallGraph, buildCoChangeMatrix, buildCorpusEmbeddings, buildIndex, buildIndexCached, buildNeuralEmbeddingIndex, buildProjectGraph, buildTfIdfEmbeddingIndex, buildWeightedQuery, calculateCoverage, chunkFile, classifyFileKind, countTokensChars4, countTokensTiktoken, createExperiment, createFreshModel, createLogger, createProject, detectLanguage, detectStack, discoverSiblingRepos, embedQuery, reciprocalRankFusion as embeddingRRF, estimateComplexity, estimateFileTokens, estimateTokens, expandLayers, expandQuery, expandQueryWithPMI, expandTerm, extractPattern, extractStructuralTokens, freeEncoder, getActiveExperiment, getCacheInfo, getConcludedExperiments, getExpansionDetails, getGitRecency, getLearnerBoosts, getLearnerStats, getOptimizedWeights, getPruneLevelForRisk, getStemVariants, getStructuralSummary, getSynonymStats, getTelemetryBoosts, invalidateCache, isCtoError, isOnnxAvailable, loadExperiments, loadLearner, loadTelemetry, loadWeightTuner, multiHopQuery, optimizeBudget, parseAllPolyglotImports, parseImports, parseQueryIntent, parseSiblingPaths, pruneFile, pruneFiles, query, queryByEmbedding, querySiblingRepos, reciprocalRankFusion$1 as reciprocalRankFusion, recordFeedback, recordFileOpen, recordOutcome, recordSelection, recordSession, renderExperimentSummary, renderFileChunks, renderMultiRepoSummary, renderTelemetrySummary, renderWeightStatus, rerank, retrieveChunks, runContextPipeline, sanitizeContent, saveExperiments, saveLearner, saveTelemetry, saveWeightTuner, scanContentForSecrets, scanFileForSecrets, scanProjectForSecrets, scoreAllFiles, scoreChunks, scoreFile, selectContext, setJsonLogging, setLogLevel, similarity, stem, testSignificance, tokenize, walkProject, wrapError };
```
package/dist/engine/index.js
CHANGED

The first three hunks are the same embeddings, stemmer, and reranker changes as in package/dist/cli/index.js (the engine source is bundled into each entry point):

```diff
@@ -5754,61 +5754,88 @@ function expandLayers(layers) {
 
 // src/engine/embeddings.ts
 function buildTfIdfEmbeddingIndex(index) {
-  const
+  const origTerms = [...index.idf.keys()];
+  const allTerms = [];
+  const termSet = /* @__PURE__ */ new Set();
+  for (const t of origTerms) {
+    if (!termSet.has(t)) {
+      allTerms.push(t);
+      termSet.add(t);
+    }
+  }
+  for (const t of origTerms) {
+    for (const s of getStemVariants(t)) {
+      const stemKey = `\xA7${s}`;
+      if (!termSet.has(stemKey)) {
+        allTerms.push(stemKey);
+        termSet.add(stemKey);
+      }
+    }
+  }
   const termToIdx = new Map(allTerms.map((t, i) => [t, i]));
   const dimensions = allTerms.length;
   const docVectors = /* @__PURE__ */ new Map();
-  const
+  const docNonZero = /* @__PURE__ */ new Map();
   for (const [filePath, doc] of index.documents) {
     const vec = new Float32Array(dimensions);
-
+    const nonZero = [];
     for (const [term, tf] of doc.terms) {
-      const idx = termToIdx.get(term);
-      if (idx === void 0) continue;
       const idf = index.idf.get(term) ?? 0;
       const weight = tf * idf;
-
-
+      const idx = termToIdx.get(term);
+      if (idx !== void 0) {
+        vec[idx] += weight;
+        nonZero.push(idx);
+      }
+      for (const s of getStemVariants(term)) {
+        const stemIdx = termToIdx.get(`\xA7${s}`);
+        if (stemIdx !== void 0) {
+          vec[stemIdx] += weight * 0.5;
+          nonZero.push(stemIdx);
+        }
+      }
     }
+    let norm = 0;
+    for (const i of nonZero) norm += vec[i] * vec[i];
     norm = Math.sqrt(norm);
     if (norm > 0) {
-      for (
-        vec[i] /= norm;
-      }
+      for (const i of nonZero) vec[i] /= norm;
     }
     docVectors.set(filePath, vec);
-
+    docNonZero.set(filePath, [...new Set(nonZero)]);
   }
   function queryFn(text, topK) {
    const queryTerms = tokenizeForEmbedding(text);
-    const
+    const expandedCounts = /* @__PURE__ */ new Map();
    for (const t of queryTerms) {
-
+      expandedCounts.set(t, (expandedCounts.get(t) ?? 0) + 1);
+      for (const s of getStemVariants(t)) {
+        const stemKey = `\xA7${s}`;
+        expandedCounts.set(stemKey, (expandedCounts.get(stemKey) ?? 0) + 0.5);
+      }
    }
    const queryVec = new Float32Array(dimensions);
-
-    for (const [term, count] of
+    const queryNonZero = [];
+    for (const [term, count] of expandedCounts) {
      const idx = termToIdx.get(term);
      if (idx === void 0) continue;
-      const
-      const
-      queryVec[idx] =
-
+      const rawTerm = term.startsWith("\xA7") ? term.slice(1) : term;
+      const idf = index.idf.get(rawTerm) ?? 1;
+      queryVec[idx] = count * idf;
+      queryNonZero.push(idx);
    }
+    let queryNorm = 0;
+    for (const i of queryNonZero) queryNorm += queryVec[i] * queryVec[i];
    queryNorm = Math.sqrt(queryNorm);
    if (queryNorm > 0) {
-      for (
-        queryVec[i] /= queryNorm;
-      }
+      for (const i of queryNonZero) queryVec[i] /= queryNorm;
    }
    const results = [];
+    const queryIdxSet = new Set(queryNonZero);
    for (const [filePath, docVec] of docVectors) {
      let dot = 0;
-      for (const
-
-      if (idx !== void 0) {
-        dot += queryVec[idx] * docVec[idx];
-      }
+      for (const i of queryNonZero) {
+        if (docVec[i] !== 0) dot += queryVec[i] * docVec[i];
      }
      if (dot > 0) {
        results.push({ filePath, score: dot });
```
```diff
@@ -5837,6 +5864,54 @@ function reciprocalRankFusion2(bm25Results, embeddingResults, k = 60, bm25Weight
   }
   return [...scores.entries()].map(([filePath, score]) => ({ filePath, score })).sort((a, b) => b.score - a.score);
 }
+function stem2(word) {
+  if (word.length < 4) return word;
+  const rules = [
+    ["ization", 4],
+    ["isation", 4],
+    ["ation", 4],
+    ["ition", 4],
+    ["tion", 3],
+    ["sion", 3],
+    ["ment", 3],
+    ["ness", 3],
+    ["able", 3],
+    ["ible", 3],
+    ["ive", 3],
+    ["ing", 3],
+    ["ity", 3],
+    ["ous", 3],
+    ["ful", 3],
+    ["ate", 3],
+    ["ize", 3],
+    ["ise", 3],
+    ["ure", 3],
+    ["ent", 3],
+    ["ant", 3],
+    ["al", 3],
+    ["er", 3],
+    ["or", 3],
+    ["ed", 3],
+    ["ly", 3],
+    ["es", 3],
+    ["s", 3]
+  ];
+  for (const [suffix, minRemaining] of rules) {
+    if (word.endsWith(suffix) && word.length - suffix.length >= minRemaining) {
+      return word.slice(0, word.length - suffix.length);
+    }
+  }
+  return word;
+}
+function getStemVariants(word) {
+  const variants = /* @__PURE__ */ new Set();
+  variants.add(word);
+  const stripped = stem2(word);
+  if (stripped !== word && stripped.length >= 3) variants.add(stripped);
+  if (word.length >= 6) variants.add(word.slice(0, 5));
+  if (word.endsWith("e") && word.length >= 5) variants.add(word.slice(0, -1));
+  return [...variants];
+}
 function tokenizeForEmbedding(text) {
   return text.toLowerCase().replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]/g, " ").split(/\s+/).filter((t) => t.length >= 2);
 }
```
```diff
@@ -6013,7 +6088,7 @@ async function runContextPipeline(input) {
   const rerankerApproved = new Set(rerankResult.files.map((rf) => rf.filePath));
   const rerankedMatches = boostedMatches.map((m) => ({
     filePath: m.filePath,
-    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score,
+    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score * 0.25,
     matchedTerms: [...m.matchedTerms]
   }));
   for (const m of rerankedMatches) {
```
```diff
@@ -6633,10 +6708,31 @@ function chunkJava(content, filePath) {
         tokens: estimateTokens2(lines.slice(i, classEnd + 1).join("\n"))
       });
     }
-    const
+    const ctorMatch = !classMatch && line.match(/^(?:public|private|protected)\s+([A-Z]\w+)\s*\(/);
+    if (ctorMatch && !line.match(/\s+\w+\s+\w+\s*\(/)) {
+      const name = ctorMatch[1];
+      let ctorStart = i;
+      while (ctorStart > 0 && lines[ctorStart - 1].trim().startsWith("@")) ctorStart--;
+      const ctorEnd = findBraceEnd(lines, i);
+      const className = findEnclosingClass(lines, i);
+      chunks.push({
+        filePath,
+        startLine: ctorStart + 1,
+        endLine: ctorEnd + 1,
+        content: lines.slice(ctorStart, ctorEnd + 1).join("\n"),
+        kind: "method",
+        name,
+        className,
+        score: 0,
+        tokens: estimateTokens2(lines.slice(ctorStart, ctorEnd + 1).join("\n"))
+      });
+      i = ctorEnd + 1;
+      continue;
+    }
+    const methodMatch = line.match(/^(?:@\w+[\s(].*)*(?:public|private|protected|static|final|synchronized|abstract|\s)+\s+[\w<>\[\],\s?]+\s+(\w+)\s*\(/);
     if (methodMatch && !classMatch) {
       const name = methodMatch[1];
-      if (!["if", "for", "while", "switch", "catch", "return"].includes(name)) {
+      if (!["if", "for", "while", "switch", "catch", "return", "class", "interface", "enum"].includes(name)) {
         let methodStart = i;
         while (methodStart > 0 && lines[methodStart - 1].trim().startsWith("@")) {
           methodStart--;
```
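New in this hunk: `chunkJava` now emits constructors as `method` chunks. The first pattern requires an access modifier followed directly by a capitalized identifier and `(`; the second match guards against lines that also carry a return type. A quick illustration against hypothetical Java lines:

```ts
// Sketch with hypothetical inputs, using the two patterns from the hunk above.
const ctorRe = /^(?:public|private|protected)\s+([A-Z]\w+)\s*\(/;
const hasReturnTypeRe = /\s+\w+\s+\w+\s*\(/;

"public PaymentService(int retries) {".match(ctorRe);
// -> matches, group 1 = "PaymentService"; hasReturnTypeRe does not match,
//    so this line is chunked as a constructor.

"public PaymentService build(Order o) {".match(ctorRe);
// -> null: "build" breaks the modifier-then-Name-then-"(" shape, so the
//    line falls through to the method regex that follows in chunkJava.
```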
```diff
@@ -6860,8 +6956,46 @@ function chunkGo(content, filePath) {
 function findBraceEnd(lines, start) {
   let depth = 0;
   let foundOpen = false;
+  let inSingleLineComment = false;
+  let inMultiLineComment = false;
+  let inString = false;
   for (let i = start; i < lines.length; i++) {
-
+    const line = lines[i];
+    inSingleLineComment = false;
+    for (let j = 0; j < line.length; j++) {
+      const ch = line[j];
+      const next = j < line.length - 1 ? line[j + 1] : "";
+      const prev = j > 0 ? line[j - 1] : "";
+      if (inString && ch === "\\") {
+        j++;
+        continue;
+      }
+      if (!inSingleLineComment && !inMultiLineComment) {
+        if ((ch === '"' || ch === "'" || ch === "`") && !inString) {
+          inString = ch;
+          continue;
+        }
+        if (inString && ch === inString) {
+          inString = false;
+          continue;
+        }
+      }
+      if (inString) continue;
+      if (!inMultiLineComment && ch === "/" && next === "/") {
+        inSingleLineComment = true;
+        break;
+      }
+      if (!inSingleLineComment && ch === "/" && next === "*") {
+        inMultiLineComment = true;
+        j++;
+        continue;
+      }
+      if (inMultiLineComment && ch === "*" && next === "/") {
+        inMultiLineComment = false;
+        j++;
+        continue;
+      }
+      if (inSingleLineComment || inMultiLineComment) continue;
       if (ch === "{") {
         depth++;
         foundOpen = true;
```
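`findBraceEnd` previously counted every brace character, so a brace inside a string literal or comment could close a chunk early. The new scanner tracks string and comment state character by character. A hypothetical case the old version got wrong:

```ts
// Hypothetical input: the old scanner saw the '}' inside the string on the
// second line and closed the "function" at index 1; the string/comment-aware
// scanner skips it (and the '}' in the block comment) and finds the real
// closing brace at index 4.
const lines = [
  "public int depth() {",
  '  String brace = "}";',
  "  /* } inside a block comment */",
  "  return 0;",
  "}",
];
```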
```diff
@@ -6884,12 +7018,138 @@ function findEnclosingClass(lines, methodLine) {
 function estimateTokens2(content) {
   return Math.ceil(content.length / 4);
 }
+function chunkRust(content, filePath) {
+  const lines = content.split("\n");
+  const chunks = [];
+  let i = 0;
+  while (i < lines.length) {
+    const line = lines[i].trim();
+    if (line.startsWith("use ") || line.startsWith("mod ")) {
+      const blockStart = i;
+      while (i < lines.length && (lines[i].trim().startsWith("use ") || lines[i].trim().startsWith("mod ") || lines[i].trim() === "")) i++;
+      if (i > blockStart) {
+        chunks.push({
+          filePath,
+          startLine: blockStart + 1,
+          endLine: i,
+          content: lines.slice(blockStart, i).join("\n"),
+          kind: "import",
+          name: "imports",
+          score: 0,
+          tokens: estimateTokens2(lines.slice(blockStart, i).join("\n"))
+        });
+      }
+      continue;
+    }
+    const fnMatch = line.match(/^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/);
+    const implMatch = line.match(/^(?:pub\s+)?impl(?:<[^>]*>)?\s+(?:(\w+)\s+for\s+)?(\w+)/);
+    const typeMatch = line.match(/^(?:pub\s+)?(?:struct|enum|trait)\s+(\w+)/);
+    if (fnMatch) {
+      let fnStart = i;
+      while (fnStart > 0 && lines[fnStart - 1].trim().startsWith("#[")) fnStart--;
+      const end = findBraceEnd(lines, i);
+      chunks.push({
+        filePath,
+        startLine: fnStart + 1,
+        endLine: end + 1,
+        content: lines.slice(fnStart, end + 1).join("\n"),
+        kind: "function",
+        name: fnMatch[1],
+        score: 0,
+        tokens: estimateTokens2(lines.slice(fnStart, end + 1).join("\n"))
+      });
+      i = end + 1;
+      continue;
+    }
+    if (implMatch || typeMatch) {
+      const name = typeMatch ? typeMatch[1] : implMatch[2] ?? implMatch[1] ?? "impl";
+      const end = findBraceEnd(lines, i);
+      const kind = typeMatch ? line.includes("trait") ? "interface" : "class" : "class";
+      chunks.push({
+        filePath,
+        startLine: i + 1,
+        endLine: end + 1,
+        content: lines.slice(i, end + 1).join("\n"),
+        kind,
+        name,
+        score: 0,
+        tokens: estimateTokens2(lines.slice(i, end + 1).join("\n"))
+      });
+    }
+    i++;
+  }
+  return chunks;
+}
+function chunkKotlin(content, filePath) {
+  const lines = content.split("\n");
+  const chunks = [];
+  let i = 0;
+  const importStart = lines.findIndex((l) => l.trim().startsWith("import "));
+  if (importStart >= 0) {
+    let importEnd = importStart;
+    while (importEnd < lines.length && (lines[importEnd].trim().startsWith("import ") || lines[importEnd].trim() === "")) importEnd++;
+    if (importEnd > importStart) {
+      chunks.push({
+        filePath,
+        startLine: importStart + 1,
+        endLine: importEnd,
+        content: lines.slice(importStart, importEnd).join("\n"),
+        kind: "import",
+        name: "imports",
+        score: 0,
+        tokens: estimateTokens2(lines.slice(importStart, importEnd).join("\n"))
+      });
+    }
+  }
+  i = 0;
+  while (i < lines.length) {
+    const line = lines[i].trim();
+    const funMatch = line.match(/^(?:(?:private|public|internal|protected|override|suspend|inline)\s+)*fun\s+(?:<[^>]*>\s*)?(\w+)\s*\(/);
+    const classMatch = line.match(/^(?:(?:data|sealed|abstract|open|private|public|internal)\s+)*(?:class|interface|object|enum\s+class)\s+(\w+)/);
+    if (classMatch) {
+      const end = findBraceEnd(lines, i);
+      chunks.push({
+        filePath,
+        startLine: i + 1,
+        endLine: end + 1,
+        content: lines.slice(i, end + 1).join("\n"),
+        kind: line.includes("interface") ? "interface" : "class",
+        name: classMatch[1],
+        score: 0,
+        tokens: estimateTokens2(lines.slice(i, end + 1).join("\n"))
+      });
+    }
+    if (funMatch && !classMatch) {
+      let funStart = i;
+      while (funStart > 0 && lines[funStart - 1].trim().startsWith("@")) funStart--;
+      const end = findBraceEnd(lines, i);
+      const className = findEnclosingClass(lines, i);
+      chunks.push({
+        filePath,
+        startLine: funStart + 1,
+        endLine: end + 1,
+        content: lines.slice(funStart, end + 1).join("\n"),
+        kind: className ? "method" : "function",
+        name: funMatch[1],
+        className,
+        score: 0,
+        tokens: estimateTokens2(lines.slice(funStart, end + 1).join("\n"))
+      });
+      i = end + 1;
+      continue;
+    }
+    i++;
+  }
+  return chunks;
+}
 function getLanguage2(filePath) {
   const ext = filePath.split(".").pop()?.toLowerCase() ?? "";
   if (ext === "java") return "java";
   if (["ts", "tsx", "js", "jsx", "mts", "mjs"].includes(ext)) return "ts";
   if (ext === "py") return "python";
   if (ext === "go") return "go";
+  if (ext === "rs") return "rust";
+  if (["kt", "kts"].includes(ext)) return "kotlin";
   return null;
 }
 function chunkFile(content, filePath) {
```
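v8.1 extends chunking beyond Java/TS/Python/Go with `chunkRust` and `chunkKotlin`, routed by extension in `getLanguage2`. A sketch of the Rust path through the public `chunkFile` export (the import specifier and input are illustrative, not from the package docs):

```ts
// Sketch: the import specifier is an assumption based on the dist layout.
import { chunkFile } from "cto-ai-cli/dist/engine/index.js";

const rs = [
  "use std::fmt;",
  "",
  "pub struct Price { cents: u64 }",
  "",
  "pub fn format_price(p: &Price) -> String {",
  '  format!("{} cents", p.cents)',
  "}",
].join("\n");

for (const c of chunkFile(rs, "src/price.rs")) {
  console.log(c.kind, c.name, `${c.startLine}-${c.endLine}`);
}
// Expected, per the chunkRust rules above:
//   import   imports       1-2
//   class    Price         3-3
//   function format_price  5-7
```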
```diff
@@ -6904,23 +7164,41 @@ function chunkFile(content, filePath) {
       return chunkPython(content, filePath);
     case "go":
       return chunkGo(content, filePath);
+    case "rust":
+      return chunkRust(content, filePath);
+    case "kotlin":
+      return chunkKotlin(content, filePath);
   }
 }
 function scoreChunks(chunks, task) {
   const queryTerms = tokenize(task);
   const queryTermSet = new Set(queryTerms);
+  const queryStems = /* @__PURE__ */ new Map();
+  for (const qt of queryTermSet) queryStems.set(qt, stem2(qt));
+  const queryStemSet = new Set(queryStems.values());
   for (const chunk of chunks) {
     const chunkTerms = tokenize(chunk.content);
     const chunkTermSet = new Set(chunkTerms);
-
+    const chunkStemSet = new Set([...chunkTermSet].map(stem2));
+    let exactOverlap = 0;
     for (const qt of queryTermSet) {
-      if (chunkTermSet.has(qt))
+      if (chunkTermSet.has(qt)) exactOverlap++;
     }
-
+    let stemOverlap = 0;
+    for (const qs of queryStemSet) {
+      if (chunkStemSet.has(qs)) stemOverlap++;
+    }
+    const stemOnlyMatches = Math.max(0, stemOverlap - exactOverlap);
+    const effectiveOverlap = exactOverlap + stemOnlyMatches * 0.5;
+    const termCoverage = queryTermSet.size > 0 ? effectiveOverlap / queryTermSet.size : 0;
     let nameBonus = 0;
     const nameTerms = tokenize(chunk.name + (chunk.className ? " " + chunk.className : ""));
     for (const nt of nameTerms) {
-      if (queryTermSet.has(nt))
+      if (queryTermSet.has(nt)) {
+        nameBonus += 0.3;
+      } else if (queryStemSet.has(stem2(nt))) {
+        nameBonus += 0.15;
+      }
     }
     const kindBonus = chunk.kind === "method" || chunk.kind === "function" ? 0.1 : chunk.kind === "class" || chunk.kind === "interface" ? 0.05 : 0;
     const sizePenalty = chunk.tokens > 500 ? 0.9 : chunk.tokens > 1e3 ? 0.7 : 1;
```
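The chunk scorer now credits stem-level matches at half weight. Worked example with a hypothetical query and chunk, assuming `tokenize` yields these terms: for the task "invalidate cache", query terms are {invalidate, cache} with stems {invalid, cache}. A chunk containing "invalidation" and "cache" gets exactOverlap = 1 and stemOverlap = 2, so stemOnlyMatches = 1 and effectiveOverlap = 1 + 0.5 = 1.5, giving termCoverage = 1.5 / 2 = 0.75 where pure exact-term coverage would give 0.5.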
```diff
@@ -7113,6 +7391,7 @@ export {
   getLearnerStats,
   getOptimizedWeights,
   getPruneLevelForRisk,
+  getStemVariants,
   getStructuralSummary,
   getSynonymStats,
   getTelemetryBoosts,
@@ -7163,6 +7442,7 @@ export {
   setJsonLogging,
   setLogLevel,
   similarity,
+  stem2 as stem,
   testSignificance,
   tokenize,
   walkProject,
```
package/dist/mcp/index.js
CHANGED

The MCP entry point receives the same three bundled engine changes (embeddings, stemmer, reranker) at its own offsets:

```diff
@@ -5112,61 +5112,88 @@ function buildWeightedQuery(intent) {
 
 // src/engine/embeddings.ts
 function buildTfIdfEmbeddingIndex(index) {
-  const
+  const origTerms = [...index.idf.keys()];
+  const allTerms = [];
+  const termSet = /* @__PURE__ */ new Set();
+  for (const t of origTerms) {
+    if (!termSet.has(t)) {
+      allTerms.push(t);
+      termSet.add(t);
+    }
+  }
+  for (const t of origTerms) {
+    for (const s of getStemVariants(t)) {
+      const stemKey = `\xA7${s}`;
+      if (!termSet.has(stemKey)) {
+        allTerms.push(stemKey);
+        termSet.add(stemKey);
+      }
+    }
+  }
   const termToIdx = new Map(allTerms.map((t, i) => [t, i]));
   const dimensions = allTerms.length;
   const docVectors = /* @__PURE__ */ new Map();
-  const
+  const docNonZero = /* @__PURE__ */ new Map();
   for (const [filePath, doc] of index.documents) {
     const vec = new Float32Array(dimensions);
-
+    const nonZero = [];
     for (const [term, tf] of doc.terms) {
-      const idx = termToIdx.get(term);
-      if (idx === void 0) continue;
       const idf = index.idf.get(term) ?? 0;
       const weight = tf * idf;
-
-
+      const idx = termToIdx.get(term);
+      if (idx !== void 0) {
+        vec[idx] += weight;
+        nonZero.push(idx);
+      }
+      for (const s of getStemVariants(term)) {
+        const stemIdx = termToIdx.get(`\xA7${s}`);
+        if (stemIdx !== void 0) {
+          vec[stemIdx] += weight * 0.5;
+          nonZero.push(stemIdx);
+        }
+      }
     }
+    let norm = 0;
+    for (const i of nonZero) norm += vec[i] * vec[i];
     norm = Math.sqrt(norm);
     if (norm > 0) {
-      for (
-        vec[i] /= norm;
-      }
+      for (const i of nonZero) vec[i] /= norm;
     }
     docVectors.set(filePath, vec);
-
+    docNonZero.set(filePath, [...new Set(nonZero)]);
   }
   function queryFn(text, topK) {
    const queryTerms = tokenizeForEmbedding(text);
-    const
+    const expandedCounts = /* @__PURE__ */ new Map();
    for (const t of queryTerms) {
-
+      expandedCounts.set(t, (expandedCounts.get(t) ?? 0) + 1);
+      for (const s of getStemVariants(t)) {
+        const stemKey = `\xA7${s}`;
+        expandedCounts.set(stemKey, (expandedCounts.get(stemKey) ?? 0) + 0.5);
+      }
    }
    const queryVec = new Float32Array(dimensions);
-
-    for (const [term, count] of
+    const queryNonZero = [];
+    for (const [term, count] of expandedCounts) {
      const idx = termToIdx.get(term);
      if (idx === void 0) continue;
-      const
-      const
-      queryVec[idx] =
-
+      const rawTerm = term.startsWith("\xA7") ? term.slice(1) : term;
+      const idf = index.idf.get(rawTerm) ?? 1;
+      queryVec[idx] = count * idf;
+      queryNonZero.push(idx);
    }
+    let queryNorm = 0;
+    for (const i of queryNonZero) queryNorm += queryVec[i] * queryVec[i];
    queryNorm = Math.sqrt(queryNorm);
    if (queryNorm > 0) {
-      for (
-        queryVec[i] /= queryNorm;
-      }
+      for (const i of queryNonZero) queryVec[i] /= queryNorm;
    }
    const results = [];
+    const queryIdxSet = new Set(queryNonZero);
    for (const [filePath, docVec] of docVectors) {
      let dot = 0;
-      for (const
-
-      if (idx !== void 0) {
-        dot += queryVec[idx] * docVec[idx];
-      }
+      for (const i of queryNonZero) {
+        if (docVec[i] !== 0) dot += queryVec[i] * docVec[i];
      }
      if (dot > 0) {
        results.push({ filePath, score: dot });
```

```diff
@@ -5195,6 +5222,54 @@ function reciprocalRankFusion(bm25Results, embeddingResults, k = 60, bm25Weight
   }
   return [...scores.entries()].map(([filePath, score]) => ({ filePath, score })).sort((a, b) => b.score - a.score);
 }
+function stem2(word) {
+  if (word.length < 4) return word;
+  const rules = [
+    ["ization", 4],
+    ["isation", 4],
+    ["ation", 4],
+    ["ition", 4],
+    ["tion", 3],
+    ["sion", 3],
+    ["ment", 3],
+    ["ness", 3],
+    ["able", 3],
+    ["ible", 3],
+    ["ive", 3],
+    ["ing", 3],
+    ["ity", 3],
+    ["ous", 3],
+    ["ful", 3],
+    ["ate", 3],
+    ["ize", 3],
+    ["ise", 3],
+    ["ure", 3],
+    ["ent", 3],
+    ["ant", 3],
+    ["al", 3],
+    ["er", 3],
+    ["or", 3],
+    ["ed", 3],
+    ["ly", 3],
+    ["es", 3],
+    ["s", 3]
+  ];
+  for (const [suffix, minRemaining] of rules) {
+    if (word.endsWith(suffix) && word.length - suffix.length >= minRemaining) {
+      return word.slice(0, word.length - suffix.length);
+    }
+  }
+  return word;
+}
+function getStemVariants(word) {
+  const variants = /* @__PURE__ */ new Set();
+  variants.add(word);
+  const stripped = stem2(word);
+  if (stripped !== word && stripped.length >= 3) variants.add(stripped);
+  if (word.length >= 6) variants.add(word.slice(0, 5));
+  if (word.endsWith("e") && word.length >= 5) variants.add(word.slice(0, -1));
+  return [...variants];
+}
 function tokenizeForEmbedding(text) {
   return text.toLowerCase().replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]/g, " ").split(/\s+/).filter((t) => t.length >= 2);
 }
```

```diff
@@ -5354,7 +5429,7 @@ async function runContextPipeline(input) {
   const rerankerApproved = new Set(rerankResult.files.map((rf) => rf.filePath));
   const rerankedMatches = boostedMatches.map((m) => ({
     filePath: m.filePath,
-    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score,
+    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score * 0.25,
     matchedTerms: [...m.matchedTerms]
   }));
   for (const m of rerankedMatches) {
```
package/package.json
CHANGED