cto-ai-cli 8.0.1 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli/index.js +103 -28
- package/dist/engine/index.d.ts +11 -1
- package/dist/engine/index.js +315 -35
- package/dist/mcp/index.js +103 -28
- package/package.json +1 -1
package/README.md
CHANGED

```diff
@@ -38,14 +38,14 @@ This runs a self-contained presentation that shows: project analysis, semantic m
 
 ## Benchmark Results
 
-**Eval Harness v8.
-
-| Metric |
-
-| **Must-have recall** | **100%**
-| **Precision** | **
-| **F1** | **
-| **Noise rate** |
+**Eval Harness v8.1** — 20-file Java enterprise project, 4 tasks with expert-labeled ground truth:
+
+| Metric | v8.0 | **v8.1** |
+|---|---|---|
+| **Must-have recall** | 100% | **100%** |
+| **Precision** | 38% | **60%** (+22pp) |
+| **F1** | 55% | **74%** (+19pp) |
+| **Noise rate** | 11.3% | **5.7%** (-5.6pp) |
 
 **Real production repos** (Java monoliths):
 
```

(The removed lines above are shown as the diff viewer captured them; their tails were truncated by the intraline highlighter.)
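For reference, F1 is the harmonic mean of precision and recall, F1 = 2·P·R / (P + R). With the v8.1 numbers (P = 0.60, R = 1.00) this gives ≈ 0.75, consistent with the reported **74%** up to rounding of the underlying per-task averages.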
package/dist/cli/index.js
CHANGED

```diff
@@ -5674,61 +5674,88 @@ var init_query_intent = __esm({
 
 // src/engine/embeddings.ts
 function buildTfIdfEmbeddingIndex(index) {
-  const
+  const origTerms = [...index.idf.keys()];
+  const allTerms = [];
+  const termSet = /* @__PURE__ */ new Set();
+  for (const t of origTerms) {
+    if (!termSet.has(t)) {
+      allTerms.push(t);
+      termSet.add(t);
+    }
+  }
+  for (const t of origTerms) {
+    for (const s of getStemVariants(t)) {
+      const stemKey = `\xA7${s}`;
+      if (!termSet.has(stemKey)) {
+        allTerms.push(stemKey);
+        termSet.add(stemKey);
+      }
+    }
+  }
   const termToIdx = new Map(allTerms.map((t, i) => [t, i]));
   const dimensions = allTerms.length;
   const docVectors = /* @__PURE__ */ new Map();
-  const
+  const docNonZero = /* @__PURE__ */ new Map();
   for (const [filePath, doc] of index.documents) {
     const vec = new Float32Array(dimensions);
-
+    const nonZero = [];
     for (const [term, tf] of doc.terms) {
-      const idx = termToIdx.get(term);
-      if (idx === void 0) continue;
       const idf = index.idf.get(term) ?? 0;
       const weight = tf * idf;
-
-
+      const idx = termToIdx.get(term);
+      if (idx !== void 0) {
+        vec[idx] += weight;
+        nonZero.push(idx);
+      }
+      for (const s of getStemVariants(term)) {
+        const stemIdx = termToIdx.get(`\xA7${s}`);
+        if (stemIdx !== void 0) {
+          vec[stemIdx] += weight * 0.5;
+          nonZero.push(stemIdx);
+        }
+      }
     }
+    let norm = 0;
+    for (const i of nonZero) norm += vec[i] * vec[i];
     norm = Math.sqrt(norm);
     if (norm > 0) {
-      for (
-        vec[i] /= norm;
-      }
+      for (const i of nonZero) vec[i] /= norm;
     }
     docVectors.set(filePath, vec);
-
+    docNonZero.set(filePath, [...new Set(nonZero)]);
   }
   function queryFn(text, topK) {
    const queryTerms = tokenizeForEmbedding(text);
-    const
+    const expandedCounts = /* @__PURE__ */ new Map();
    for (const t of queryTerms) {
-
+      expandedCounts.set(t, (expandedCounts.get(t) ?? 0) + 1);
+      for (const s of getStemVariants(t)) {
+        const stemKey = `\xA7${s}`;
+        expandedCounts.set(stemKey, (expandedCounts.get(stemKey) ?? 0) + 0.5);
+      }
    }
    const queryVec = new Float32Array(dimensions);
-
-    for (const [term, count] of
+    const queryNonZero = [];
+    for (const [term, count] of expandedCounts) {
      const idx = termToIdx.get(term);
      if (idx === void 0) continue;
-      const
-      const
-      queryVec[idx] =
-
+      const rawTerm = term.startsWith("\xA7") ? term.slice(1) : term;
+      const idf = index.idf.get(rawTerm) ?? 1;
+      queryVec[idx] = count * idf;
+      queryNonZero.push(idx);
    }
+    let queryNorm = 0;
+    for (const i of queryNonZero) queryNorm += queryVec[i] * queryVec[i];
    queryNorm = Math.sqrt(queryNorm);
    if (queryNorm > 0) {
-      for (
-        queryVec[i] /= queryNorm;
-      }
+      for (const i of queryNonZero) queryVec[i] /= queryNorm;
    }
    const results = [];
+    const queryIdxSet = new Set(queryNonZero);
    for (const [filePath, docVec] of docVectors) {
      let dot = 0;
-      for (const
-
-      if (idx !== void 0) {
-        dot += queryVec[idx] * docVec[idx];
-      }
+      for (const i of queryNonZero) {
+        if (docVec[i] !== 0) dot += queryVec[i] * docVec[i];
      }
      if (dot > 0) {
        results.push({ filePath, score: dot });
```
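Summary of the hunk above: the index now adds one extra dimension per stem variant, keyed as `\xA7<stem>` (`\xA7` is `§`, which the tokenizer can never emit, so stem dimensions cannot collide with literal terms). Documents and queries write stem weight at half the exact-term weight, and both sides record their nonzero indices so normalization and scoring only touch those entries instead of the full `dimensions`-length vector. A minimal sketch of the resulting sparse scoring step, with illustrative names rather than the package's API:

```ts
// Minimal sketch, not the package API: dot product over only the query's
// nonzero indices. Both vectors are L2-normalized beforehand, so this
// value is the cosine similarity between query and document.
function sparseDot(q: Float32Array, d: Float32Array, qNonZero: number[]): number {
  let dot = 0;
  for (const i of qNonZero) {
    if (d[i] !== 0) dot += q[i] * d[i]; // skip zero entries of the document vector
  }
  return dot;
}
```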
```diff
@@ -5757,6 +5784,54 @@ function reciprocalRankFusion(bm25Results, embeddingResults, k = 60, bm25Weight
   }
   return [...scores.entries()].map(([filePath, score]) => ({ filePath, score })).sort((a, b) => b.score - a.score);
 }
+function stem2(word) {
+  if (word.length < 4) return word;
+  const rules = [
+    ["ization", 4],
+    ["isation", 4],
+    ["ation", 4],
+    ["ition", 4],
+    ["tion", 3],
+    ["sion", 3],
+    ["ment", 3],
+    ["ness", 3],
+    ["able", 3],
+    ["ible", 3],
+    ["ive", 3],
+    ["ing", 3],
+    ["ity", 3],
+    ["ous", 3],
+    ["ful", 3],
+    ["ate", 3],
+    ["ize", 3],
+    ["ise", 3],
+    ["ure", 3],
+    ["ent", 3],
+    ["ant", 3],
+    ["al", 3],
+    ["er", 3],
+    ["or", 3],
+    ["ed", 3],
+    ["ly", 3],
+    ["es", 3],
+    ["s", 3]
+  ];
+  for (const [suffix, minRemaining] of rules) {
+    if (word.endsWith(suffix) && word.length - suffix.length >= minRemaining) {
+      return word.slice(0, word.length - suffix.length);
+    }
+  }
+  return word;
+}
+function getStemVariants(word) {
+  const variants = /* @__PURE__ */ new Set();
+  variants.add(word);
+  const stripped = stem2(word);
+  if (stripped !== word && stripped.length >= 3) variants.add(stripped);
+  if (word.length >= 6) variants.add(word.slice(0, 5));
+  if (word.endsWith("e") && word.length >= 5) variants.add(word.slice(0, -1));
+  return [...variants];
+}
 function tokenizeForEmbedding(text) {
   return text.toLowerCase().replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]/g, " ").split(/\s+/).filter((t) => t.length >= 2);
 }
```
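The stemmer is deliberately lightweight: a single suffix pass (longest rule first) plus prefix and trailing-e variants, rather than a full Porter stemmer. A quick check of how it links morphological relatives, assuming you import the functions the new `index.d.ts` declares (the import specifier below is a guess):

```ts
// Sketch: the import specifier is an assumption, not documented by the package.
import { stem, getStemVariants } from "cto-ai-cli/dist/engine/index.js";

stem("invalidation");              // "invalid"  ("ation" rule: 7 chars remain >= 4)
stem("invalidate");                // "invalid"  ("ate" rule: 7 chars remain >= 3)
getStemVariants("authentication"); // ["authentication", "authentic", "authe"]
// "invalidation" and "invalidate" share the variant "invalid", so both map
// onto the same "\xA7invalid" dimension of the TF-IDF embedding index above.
```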
```diff
@@ -6124,7 +6199,7 @@ async function runContextPipeline(input) {
   const rerankerApproved = new Set(rerankResult.files.map((rf) => rf.filePath));
   const rerankedMatches = boostedMatches.map((m) => ({
     filePath: m.filePath,
-    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score,
+    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score * 0.25,
     matchedTerms: [...m.matchedTerms]
   }));
   for (const m of rerankedMatches) {
```
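This one-line change tightens the reranker's influence: files the reranker did not approve previously kept their full heuristic score, and are now demoted by 4×. Worked numbers with hypothetical scores: an approved file at 8.0 still becomes 12.0 (8.0 × 1.5, unchanged from v8.0), while an unapproved file at 8.0 drops to 2.0 (8.0 × 0.25) instead of staying at 8.0, so an unapproved candidate now needs roughly a 6× raw-score lead to outrank an approved one. This demotion is presumably a main driver of the precision gain in the README table.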
package/dist/engine/index.d.ts
CHANGED

```diff
@@ -1688,6 +1688,16 @@ declare function reciprocalRankFusion(bm25Results: {
     filePath: string;
     score: number;
 }[];
+/**
+ * Simple Porter-like stemmer for code/English terms.
+ * Catches: invalidation↔invalidate, authentication↔authenticate, processing↔process
+ */
+declare function stem(word: string): string;
+/**
+ * Generate stem variants for a term.
+ * Returns original + suffix-stripped + 5-char prefix + trailing-e stripped.
+ */
+declare function getStemVariants(word: string): string[];
 /**
  * Check if ONNX Runtime is available for neural embeddings.
  */
@@ -1829,4 +1839,4 @@ interface AuditOptions {
 }
 declare function auditProject(projectPath: string, filePaths: string[], options?: AuditOptions): Promise<AuditResult>;
 
-export { type ActionType, type ArchLayer, type AssignmentResult, type CallGraphResult, type ChunkKind, type ChunkRetrievalResult, type CoChangeEntry, type CoChangeMatrix, type CodeChunk, type ContextPipelineInput, type ContextPipelineResult, type CorpusEmbeddings, CtoError, type CtoErrorCode, type DocumentVector, type EmbeddingIndex, type EmbeddingResult, type Experiment, type ExperimentConclusion, type ExperimentGroup, type FileOpenEvent, type FilteredFile, type GroupMetrics, type HopDetail, type ImportSpec, type IndexCacheStats, type LearnerBoost, type LearnerBoostInput, type LearnerModel, type LogEntry, type LogLevel, type Logger, type MethodCall, type MethodDefinition, type MultiHopConfig, type MultiHopResult, type MultiRepoResult, type PatternStats, type QueryIntent, type RerankInput, type RerankResult, type RerankedFile, type SecretFinding, type SecretType, type SelectionInput, type SemanticExpansion, type SemanticMatch, type SemanticScore, type SiblingMatch, type SiblingRepo, type SignalWeight, type SignificanceResult, type StructuralTokens, type SupportedLanguage, type SynonymExpansion, type TelemetryModel, type TelemetrySession, type TfIdfIndex, type TunedWeights, type WeightTunerModel, analyzeProject, assignGroup, attributeToSignal, auditProject, augmentContentWithStructure, bfsBidirectional, boostByCallGraph, boostByGitCoChange, boostByImports, boostByLayer, boostByPath, buildAdjacencyList, buildCallGraph, buildCoChangeMatrix, buildCorpusEmbeddings, buildIndex, buildIndexCached, buildNeuralEmbeddingIndex, buildProjectGraph, buildTfIdfEmbeddingIndex, buildWeightedQuery, calculateCoverage, chunkFile, classifyFileKind, countTokensChars4, countTokensTiktoken, createExperiment, createFreshModel, createLogger, createProject, detectLanguage, detectStack, discoverSiblingRepos, embedQuery, reciprocalRankFusion as embeddingRRF, estimateComplexity, estimateFileTokens, estimateTokens, expandLayers, expandQuery, expandQueryWithPMI, expandTerm, extractPattern, extractStructuralTokens, freeEncoder, getActiveExperiment, getCacheInfo, getConcludedExperiments, getExpansionDetails, getGitRecency, getLearnerBoosts, getLearnerStats, getOptimizedWeights, getPruneLevelForRisk, getStructuralSummary, getSynonymStats, getTelemetryBoosts, invalidateCache, isCtoError, isOnnxAvailable, loadExperiments, loadLearner, loadTelemetry, loadWeightTuner, multiHopQuery, optimizeBudget, parseAllPolyglotImports, parseImports, parseQueryIntent, parseSiblingPaths, pruneFile, pruneFiles, query, queryByEmbedding, querySiblingRepos, reciprocalRankFusion$1 as reciprocalRankFusion, recordFeedback, recordFileOpen, recordOutcome, recordSelection, recordSession, renderExperimentSummary, renderFileChunks, renderMultiRepoSummary, renderTelemetrySummary, renderWeightStatus, rerank, retrieveChunks, runContextPipeline, sanitizeContent, saveExperiments, saveLearner, saveTelemetry, saveWeightTuner, scanContentForSecrets, scanFileForSecrets, scanProjectForSecrets, scoreAllFiles, scoreChunks, scoreFile, selectContext, setJsonLogging, setLogLevel, similarity, testSignificance, tokenize, walkProject, wrapError };
+export { type ActionType, type ArchLayer, type AssignmentResult, type CallGraphResult, type ChunkKind, type ChunkRetrievalResult, type CoChangeEntry, type CoChangeMatrix, type CodeChunk, type ContextPipelineInput, type ContextPipelineResult, type CorpusEmbeddings, CtoError, type CtoErrorCode, type DocumentVector, type EmbeddingIndex, type EmbeddingResult, type Experiment, type ExperimentConclusion, type ExperimentGroup, type FileOpenEvent, type FilteredFile, type GroupMetrics, type HopDetail, type ImportSpec, type IndexCacheStats, type LearnerBoost, type LearnerBoostInput, type LearnerModel, type LogEntry, type LogLevel, type Logger, type MethodCall, type MethodDefinition, type MultiHopConfig, type MultiHopResult, type MultiRepoResult, type PatternStats, type QueryIntent, type RerankInput, type RerankResult, type RerankedFile, type SecretFinding, type SecretType, type SelectionInput, type SemanticExpansion, type SemanticMatch, type SemanticScore, type SiblingMatch, type SiblingRepo, type SignalWeight, type SignificanceResult, type StructuralTokens, type SupportedLanguage, type SynonymExpansion, type TelemetryModel, type TelemetrySession, type TfIdfIndex, type TunedWeights, type WeightTunerModel, analyzeProject, assignGroup, attributeToSignal, auditProject, augmentContentWithStructure, bfsBidirectional, boostByCallGraph, boostByGitCoChange, boostByImports, boostByLayer, boostByPath, buildAdjacencyList, buildCallGraph, buildCoChangeMatrix, buildCorpusEmbeddings, buildIndex, buildIndexCached, buildNeuralEmbeddingIndex, buildProjectGraph, buildTfIdfEmbeddingIndex, buildWeightedQuery, calculateCoverage, chunkFile, classifyFileKind, countTokensChars4, countTokensTiktoken, createExperiment, createFreshModel, createLogger, createProject, detectLanguage, detectStack, discoverSiblingRepos, embedQuery, reciprocalRankFusion as embeddingRRF, estimateComplexity, estimateFileTokens, estimateTokens, expandLayers, expandQuery, expandQueryWithPMI, expandTerm, extractPattern, extractStructuralTokens, freeEncoder, getActiveExperiment, getCacheInfo, getConcludedExperiments, getExpansionDetails, getGitRecency, getLearnerBoosts, getLearnerStats, getOptimizedWeights, getPruneLevelForRisk, getStemVariants, getStructuralSummary, getSynonymStats, getTelemetryBoosts, invalidateCache, isCtoError, isOnnxAvailable, loadExperiments, loadLearner, loadTelemetry, loadWeightTuner, multiHopQuery, optimizeBudget, parseAllPolyglotImports, parseImports, parseQueryIntent, parseSiblingPaths, pruneFile, pruneFiles, query, queryByEmbedding, querySiblingRepos, reciprocalRankFusion$1 as reciprocalRankFusion, recordFeedback, recordFileOpen, recordOutcome, recordSelection, recordSession, renderExperimentSummary, renderFileChunks, renderMultiRepoSummary, renderTelemetrySummary, renderWeightStatus, rerank, retrieveChunks, runContextPipeline, sanitizeContent, saveExperiments, saveLearner, saveTelemetry, saveWeightTuner, scanContentForSecrets, scanFileForSecrets, scanProjectForSecrets, scoreAllFiles, scoreChunks, scoreFile, selectContext, setJsonLogging, setLogLevel, similarity, stem, testSignificance, tokenize, walkProject, wrapError };
```
package/dist/engine/index.js
CHANGED

The first three hunks are the same embeddings, stemmer, and reranker changes as in package/dist/cli/index.js (the engine source is bundled into each entry point):

```diff
@@ -5754,61 +5754,88 @@ function expandLayers(layers) {
 
 // src/engine/embeddings.ts
 function buildTfIdfEmbeddingIndex(index) {
-  const
+  const origTerms = [...index.idf.keys()];
+  const allTerms = [];
+  const termSet = /* @__PURE__ */ new Set();
+  for (const t of origTerms) {
+    if (!termSet.has(t)) {
+      allTerms.push(t);
+      termSet.add(t);
+    }
+  }
+  for (const t of origTerms) {
+    for (const s of getStemVariants(t)) {
+      const stemKey = `\xA7${s}`;
+      if (!termSet.has(stemKey)) {
+        allTerms.push(stemKey);
+        termSet.add(stemKey);
+      }
+    }
+  }
   const termToIdx = new Map(allTerms.map((t, i) => [t, i]));
   const dimensions = allTerms.length;
   const docVectors = /* @__PURE__ */ new Map();
-  const
+  const docNonZero = /* @__PURE__ */ new Map();
   for (const [filePath, doc] of index.documents) {
     const vec = new Float32Array(dimensions);
-
+    const nonZero = [];
     for (const [term, tf] of doc.terms) {
-      const idx = termToIdx.get(term);
-      if (idx === void 0) continue;
       const idf = index.idf.get(term) ?? 0;
       const weight = tf * idf;
-
-
+      const idx = termToIdx.get(term);
+      if (idx !== void 0) {
+        vec[idx] += weight;
+        nonZero.push(idx);
+      }
+      for (const s of getStemVariants(term)) {
+        const stemIdx = termToIdx.get(`\xA7${s}`);
+        if (stemIdx !== void 0) {
+          vec[stemIdx] += weight * 0.5;
+          nonZero.push(stemIdx);
+        }
+      }
     }
+    let norm = 0;
+    for (const i of nonZero) norm += vec[i] * vec[i];
     norm = Math.sqrt(norm);
     if (norm > 0) {
-      for (
-        vec[i] /= norm;
-      }
+      for (const i of nonZero) vec[i] /= norm;
     }
     docVectors.set(filePath, vec);
-
+    docNonZero.set(filePath, [...new Set(nonZero)]);
   }
   function queryFn(text, topK) {
    const queryTerms = tokenizeForEmbedding(text);
-    const
+    const expandedCounts = /* @__PURE__ */ new Map();
    for (const t of queryTerms) {
-
+      expandedCounts.set(t, (expandedCounts.get(t) ?? 0) + 1);
+      for (const s of getStemVariants(t)) {
+        const stemKey = `\xA7${s}`;
+        expandedCounts.set(stemKey, (expandedCounts.get(stemKey) ?? 0) + 0.5);
+      }
    }
    const queryVec = new Float32Array(dimensions);
-
-    for (const [term, count] of
+    const queryNonZero = [];
+    for (const [term, count] of expandedCounts) {
      const idx = termToIdx.get(term);
      if (idx === void 0) continue;
-      const
-      const
-      queryVec[idx] =
-
+      const rawTerm = term.startsWith("\xA7") ? term.slice(1) : term;
+      const idf = index.idf.get(rawTerm) ?? 1;
+      queryVec[idx] = count * idf;
+      queryNonZero.push(idx);
    }
+    let queryNorm = 0;
+    for (const i of queryNonZero) queryNorm += queryVec[i] * queryVec[i];
    queryNorm = Math.sqrt(queryNorm);
    if (queryNorm > 0) {
-      for (
-        queryVec[i] /= queryNorm;
-      }
+      for (const i of queryNonZero) queryVec[i] /= queryNorm;
    }
    const results = [];
+    const queryIdxSet = new Set(queryNonZero);
    for (const [filePath, docVec] of docVectors) {
      let dot = 0;
-      for (const
-
-      if (idx !== void 0) {
-        dot += queryVec[idx] * docVec[idx];
-      }
+      for (const i of queryNonZero) {
+        if (docVec[i] !== 0) dot += queryVec[i] * docVec[i];
      }
      if (dot > 0) {
        results.push({ filePath, score: dot });
```
```diff
@@ -5837,6 +5864,54 @@ function reciprocalRankFusion2(bm25Results, embeddingResults, k = 60, bm25Weight
   }
   return [...scores.entries()].map(([filePath, score]) => ({ filePath, score })).sort((a, b) => b.score - a.score);
 }
+function stem2(word) {
+  if (word.length < 4) return word;
+  const rules = [
+    ["ization", 4],
+    ["isation", 4],
+    ["ation", 4],
+    ["ition", 4],
+    ["tion", 3],
+    ["sion", 3],
+    ["ment", 3],
+    ["ness", 3],
+    ["able", 3],
+    ["ible", 3],
+    ["ive", 3],
+    ["ing", 3],
+    ["ity", 3],
+    ["ous", 3],
+    ["ful", 3],
+    ["ate", 3],
+    ["ize", 3],
+    ["ise", 3],
+    ["ure", 3],
+    ["ent", 3],
+    ["ant", 3],
+    ["al", 3],
+    ["er", 3],
+    ["or", 3],
+    ["ed", 3],
+    ["ly", 3],
+    ["es", 3],
+    ["s", 3]
+  ];
+  for (const [suffix, minRemaining] of rules) {
+    if (word.endsWith(suffix) && word.length - suffix.length >= minRemaining) {
+      return word.slice(0, word.length - suffix.length);
+    }
+  }
+  return word;
+}
+function getStemVariants(word) {
+  const variants = /* @__PURE__ */ new Set();
+  variants.add(word);
+  const stripped = stem2(word);
+  if (stripped !== word && stripped.length >= 3) variants.add(stripped);
+  if (word.length >= 6) variants.add(word.slice(0, 5));
+  if (word.endsWith("e") && word.length >= 5) variants.add(word.slice(0, -1));
+  return [...variants];
+}
 function tokenizeForEmbedding(text) {
   return text.toLowerCase().replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]/g, " ").split(/\s+/).filter((t) => t.length >= 2);
 }
```
```diff
@@ -6013,7 +6088,7 @@ async function runContextPipeline(input) {
   const rerankerApproved = new Set(rerankResult.files.map((rf) => rf.filePath));
   const rerankedMatches = boostedMatches.map((m) => ({
     filePath: m.filePath,
-    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score,
+    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score * 0.25,
     matchedTerms: [...m.matchedTerms]
   }));
   for (const m of rerankedMatches) {
```
```diff
@@ -6633,10 +6708,31 @@ function chunkJava(content, filePath) {
         tokens: estimateTokens2(lines.slice(i, classEnd + 1).join("\n"))
       });
     }
-    const
+    const ctorMatch = !classMatch && line.match(/^(?:public|private|protected)\s+([A-Z]\w+)\s*\(/);
+    if (ctorMatch && !line.match(/\s+\w+\s+\w+\s*\(/)) {
+      const name = ctorMatch[1];
+      let ctorStart = i;
+      while (ctorStart > 0 && lines[ctorStart - 1].trim().startsWith("@")) ctorStart--;
+      const ctorEnd = findBraceEnd(lines, i);
+      const className = findEnclosingClass(lines, i);
+      chunks.push({
+        filePath,
+        startLine: ctorStart + 1,
+        endLine: ctorEnd + 1,
+        content: lines.slice(ctorStart, ctorEnd + 1).join("\n"),
+        kind: "method",
+        name,
+        className,
+        score: 0,
+        tokens: estimateTokens2(lines.slice(ctorStart, ctorEnd + 1).join("\n"))
+      });
+      i = ctorEnd + 1;
+      continue;
+    }
+    const methodMatch = line.match(/^(?:@\w+[\s(].*)*(?:public|private|protected|static|final|synchronized|abstract|\s)+\s+[\w<>\[\],\s?]+\s+(\w+)\s*\(/);
     if (methodMatch && !classMatch) {
       const name = methodMatch[1];
-      if (!["if", "for", "while", "switch", "catch", "return"].includes(name)) {
+      if (!["if", "for", "while", "switch", "catch", "return", "class", "interface", "enum"].includes(name)) {
         let methodStart = i;
         while (methodStart > 0 && lines[methodStart - 1].trim().startsWith("@")) {
           methodStart--;
```
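New in this hunk: `chunkJava` now emits constructors as `method` chunks. The first pattern requires an access modifier followed directly by a capitalized identifier and `(`; the second match guards against lines that also carry a return type. A quick illustration against hypothetical Java lines:

```ts
// Sketch with hypothetical inputs, using the two patterns from the hunk above.
const ctorRe = /^(?:public|private|protected)\s+([A-Z]\w+)\s*\(/;
const hasReturnTypeRe = /\s+\w+\s+\w+\s*\(/;

"public PaymentService(int retries) {".match(ctorRe);
// -> matches, group 1 = "PaymentService"; hasReturnTypeRe does not match,
//    so this line is chunked as a constructor.

"public PaymentService build(Order o) {".match(ctorRe);
// -> null: "build" breaks the modifier-then-Name-then-"(" shape, so the
//    line falls through to the method regex that follows in chunkJava.
```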
```diff
@@ -6860,8 +6956,46 @@ function chunkGo(content, filePath) {
 function findBraceEnd(lines, start) {
   let depth = 0;
   let foundOpen = false;
+  let inSingleLineComment = false;
+  let inMultiLineComment = false;
+  let inString = false;
   for (let i = start; i < lines.length; i++) {
-
+    const line = lines[i];
+    inSingleLineComment = false;
+    for (let j = 0; j < line.length; j++) {
+      const ch = line[j];
+      const next = j < line.length - 1 ? line[j + 1] : "";
+      const prev = j > 0 ? line[j - 1] : "";
+      if (inString && ch === "\\") {
+        j++;
+        continue;
+      }
+      if (!inSingleLineComment && !inMultiLineComment) {
+        if ((ch === '"' || ch === "'" || ch === "`") && !inString) {
+          inString = ch;
+          continue;
+        }
+        if (inString && ch === inString) {
+          inString = false;
+          continue;
+        }
+      }
+      if (inString) continue;
+      if (!inMultiLineComment && ch === "/" && next === "/") {
+        inSingleLineComment = true;
+        break;
+      }
+      if (!inSingleLineComment && ch === "/" && next === "*") {
+        inMultiLineComment = true;
+        j++;
+        continue;
+      }
+      if (inMultiLineComment && ch === "*" && next === "/") {
+        inMultiLineComment = false;
+        j++;
+        continue;
+      }
+      if (inSingleLineComment || inMultiLineComment) continue;
       if (ch === "{") {
         depth++;
         foundOpen = true;
```
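`findBraceEnd` previously counted every brace character, so a brace inside a string literal or comment could close a chunk early. The new scanner tracks string and comment state character by character. A hypothetical case the old version got wrong:

```ts
// Hypothetical input: the old scanner saw the '}' inside the string on the
// second line and closed the "function" at index 1; the string/comment-aware
// scanner skips it (and the '}' in the block comment) and finds the real
// closing brace at index 4.
const lines = [
  "public int depth() {",
  '  String brace = "}";',
  "  /* } inside a block comment */",
  "  return 0;",
  "}",
];
```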
```diff
@@ -6884,12 +7018,138 @@ function findEnclosingClass(lines, methodLine) {
 function estimateTokens2(content) {
   return Math.ceil(content.length / 4);
 }
+function chunkRust(content, filePath) {
+  const lines = content.split("\n");
+  const chunks = [];
+  let i = 0;
+  while (i < lines.length) {
+    const line = lines[i].trim();
+    if (line.startsWith("use ") || line.startsWith("mod ")) {
+      const blockStart = i;
+      while (i < lines.length && (lines[i].trim().startsWith("use ") || lines[i].trim().startsWith("mod ") || lines[i].trim() === "")) i++;
+      if (i > blockStart) {
+        chunks.push({
+          filePath,
+          startLine: blockStart + 1,
+          endLine: i,
+          content: lines.slice(blockStart, i).join("\n"),
+          kind: "import",
+          name: "imports",
+          score: 0,
+          tokens: estimateTokens2(lines.slice(blockStart, i).join("\n"))
+        });
+      }
+      continue;
+    }
+    const fnMatch = line.match(/^(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/);
+    const implMatch = line.match(/^(?:pub\s+)?impl(?:<[^>]*>)?\s+(?:(\w+)\s+for\s+)?(\w+)/);
+    const typeMatch = line.match(/^(?:pub\s+)?(?:struct|enum|trait)\s+(\w+)/);
+    if (fnMatch) {
+      let fnStart = i;
+      while (fnStart > 0 && lines[fnStart - 1].trim().startsWith("#[")) fnStart--;
+      const end = findBraceEnd(lines, i);
+      chunks.push({
+        filePath,
+        startLine: fnStart + 1,
+        endLine: end + 1,
+        content: lines.slice(fnStart, end + 1).join("\n"),
+        kind: "function",
+        name: fnMatch[1],
+        score: 0,
+        tokens: estimateTokens2(lines.slice(fnStart, end + 1).join("\n"))
+      });
+      i = end + 1;
+      continue;
+    }
+    if (implMatch || typeMatch) {
+      const name = typeMatch ? typeMatch[1] : implMatch[2] ?? implMatch[1] ?? "impl";
+      const end = findBraceEnd(lines, i);
+      const kind = typeMatch ? line.includes("trait") ? "interface" : "class" : "class";
+      chunks.push({
+        filePath,
+        startLine: i + 1,
+        endLine: end + 1,
+        content: lines.slice(i, end + 1).join("\n"),
+        kind,
+        name,
+        score: 0,
+        tokens: estimateTokens2(lines.slice(i, end + 1).join("\n"))
+      });
+    }
+    i++;
+  }
+  return chunks;
+}
+function chunkKotlin(content, filePath) {
+  const lines = content.split("\n");
+  const chunks = [];
+  let i = 0;
+  const importStart = lines.findIndex((l) => l.trim().startsWith("import "));
+  if (importStart >= 0) {
+    let importEnd = importStart;
+    while (importEnd < lines.length && (lines[importEnd].trim().startsWith("import ") || lines[importEnd].trim() === "")) importEnd++;
+    if (importEnd > importStart) {
+      chunks.push({
+        filePath,
+        startLine: importStart + 1,
+        endLine: importEnd,
+        content: lines.slice(importStart, importEnd).join("\n"),
+        kind: "import",
+        name: "imports",
+        score: 0,
+        tokens: estimateTokens2(lines.slice(importStart, importEnd).join("\n"))
+      });
+    }
+  }
+  i = 0;
+  while (i < lines.length) {
+    const line = lines[i].trim();
+    const funMatch = line.match(/^(?:(?:private|public|internal|protected|override|suspend|inline)\s+)*fun\s+(?:<[^>]*>\s*)?(\w+)\s*\(/);
+    const classMatch = line.match(/^(?:(?:data|sealed|abstract|open|private|public|internal)\s+)*(?:class|interface|object|enum\s+class)\s+(\w+)/);
+    if (classMatch) {
+      const end = findBraceEnd(lines, i);
+      chunks.push({
+        filePath,
+        startLine: i + 1,
+        endLine: end + 1,
+        content: lines.slice(i, end + 1).join("\n"),
+        kind: line.includes("interface") ? "interface" : "class",
+        name: classMatch[1],
+        score: 0,
+        tokens: estimateTokens2(lines.slice(i, end + 1).join("\n"))
+      });
+    }
+    if (funMatch && !classMatch) {
+      let funStart = i;
+      while (funStart > 0 && lines[funStart - 1].trim().startsWith("@")) funStart--;
+      const end = findBraceEnd(lines, i);
+      const className = findEnclosingClass(lines, i);
+      chunks.push({
+        filePath,
+        startLine: funStart + 1,
+        endLine: end + 1,
+        content: lines.slice(funStart, end + 1).join("\n"),
+        kind: className ? "method" : "function",
+        name: funMatch[1],
+        className,
+        score: 0,
+        tokens: estimateTokens2(lines.slice(funStart, end + 1).join("\n"))
+      });
+      i = end + 1;
+      continue;
+    }
+    i++;
+  }
+  return chunks;
+}
 function getLanguage2(filePath) {
   const ext = filePath.split(".").pop()?.toLowerCase() ?? "";
   if (ext === "java") return "java";
   if (["ts", "tsx", "js", "jsx", "mts", "mjs"].includes(ext)) return "ts";
   if (ext === "py") return "python";
   if (ext === "go") return "go";
+  if (ext === "rs") return "rust";
+  if (["kt", "kts"].includes(ext)) return "kotlin";
   return null;
 }
 function chunkFile(content, filePath) {
```
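v8.1 extends chunking beyond Java/TS/Python/Go with `chunkRust` and `chunkKotlin`, routed by extension in `getLanguage2`. A sketch of the Rust path through the public `chunkFile` export (the import specifier and input are illustrative, not from the package docs):

```ts
// Sketch: the import specifier is an assumption based on the dist layout.
import { chunkFile } from "cto-ai-cli/dist/engine/index.js";

const rs = [
  "use std::fmt;",
  "",
  "pub struct Price { cents: u64 }",
  "",
  "pub fn format_price(p: &Price) -> String {",
  '  format!("{} cents", p.cents)',
  "}",
].join("\n");

for (const c of chunkFile(rs, "src/price.rs")) {
  console.log(c.kind, c.name, `${c.startLine}-${c.endLine}`);
}
// Expected, per the chunkRust rules above:
//   import   imports       1-2
//   class    Price         3-3
//   function format_price  5-7
```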
```diff
@@ -6904,23 +7164,41 @@ function chunkFile(content, filePath) {
       return chunkPython(content, filePath);
     case "go":
       return chunkGo(content, filePath);
+    case "rust":
+      return chunkRust(content, filePath);
+    case "kotlin":
+      return chunkKotlin(content, filePath);
   }
 }
 function scoreChunks(chunks, task) {
   const queryTerms = tokenize(task);
   const queryTermSet = new Set(queryTerms);
+  const queryStems = /* @__PURE__ */ new Map();
+  for (const qt of queryTermSet) queryStems.set(qt, stem2(qt));
+  const queryStemSet = new Set(queryStems.values());
   for (const chunk of chunks) {
     const chunkTerms = tokenize(chunk.content);
     const chunkTermSet = new Set(chunkTerms);
-
+    const chunkStemSet = new Set([...chunkTermSet].map(stem2));
+    let exactOverlap = 0;
     for (const qt of queryTermSet) {
-      if (chunkTermSet.has(qt))
+      if (chunkTermSet.has(qt)) exactOverlap++;
     }
-
+    let stemOverlap = 0;
+    for (const qs of queryStemSet) {
+      if (chunkStemSet.has(qs)) stemOverlap++;
+    }
+    const stemOnlyMatches = Math.max(0, stemOverlap - exactOverlap);
+    const effectiveOverlap = exactOverlap + stemOnlyMatches * 0.5;
+    const termCoverage = queryTermSet.size > 0 ? effectiveOverlap / queryTermSet.size : 0;
     let nameBonus = 0;
     const nameTerms = tokenize(chunk.name + (chunk.className ? " " + chunk.className : ""));
     for (const nt of nameTerms) {
-      if (queryTermSet.has(nt))
+      if (queryTermSet.has(nt)) {
+        nameBonus += 0.3;
+      } else if (queryStemSet.has(stem2(nt))) {
+        nameBonus += 0.15;
+      }
     }
     const kindBonus = chunk.kind === "method" || chunk.kind === "function" ? 0.1 : chunk.kind === "class" || chunk.kind === "interface" ? 0.05 : 0;
     const sizePenalty = chunk.tokens > 500 ? 0.9 : chunk.tokens > 1e3 ? 0.7 : 1;
```
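The chunk scorer now credits stem-level matches at half weight. Worked example with a hypothetical query and chunk, assuming `tokenize` yields these terms: for the task "invalidate cache", query terms are {invalidate, cache} with stems {invalid, cache}. A chunk containing "invalidation" and "cache" gets exactOverlap = 1 and stemOverlap = 2, so stemOnlyMatches = 1 and effectiveOverlap = 1 + 0.5 = 1.5, giving termCoverage = 1.5 / 2 = 0.75 where pure exact-term coverage would give 0.5.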
```diff
@@ -7113,6 +7391,7 @@ export {
   getLearnerStats,
   getOptimizedWeights,
   getPruneLevelForRisk,
+  getStemVariants,
   getStructuralSummary,
   getSynonymStats,
   getTelemetryBoosts,
@@ -7163,6 +7442,7 @@ export {
   setJsonLogging,
   setLogLevel,
   similarity,
+  stem2 as stem,
   testSignificance,
   tokenize,
   walkProject,
```
package/dist/mcp/index.js
CHANGED

The MCP entry point receives the same three bundled engine changes (embeddings, stemmer, reranker) at its own offsets:

```diff
@@ -5112,61 +5112,88 @@ function buildWeightedQuery(intent) {
 
 // src/engine/embeddings.ts
 function buildTfIdfEmbeddingIndex(index) {
-  const
+  const origTerms = [...index.idf.keys()];
+  const allTerms = [];
+  const termSet = /* @__PURE__ */ new Set();
+  for (const t of origTerms) {
+    if (!termSet.has(t)) {
+      allTerms.push(t);
+      termSet.add(t);
+    }
+  }
+  for (const t of origTerms) {
+    for (const s of getStemVariants(t)) {
+      const stemKey = `\xA7${s}`;
+      if (!termSet.has(stemKey)) {
+        allTerms.push(stemKey);
+        termSet.add(stemKey);
+      }
+    }
+  }
   const termToIdx = new Map(allTerms.map((t, i) => [t, i]));
   const dimensions = allTerms.length;
   const docVectors = /* @__PURE__ */ new Map();
-  const
+  const docNonZero = /* @__PURE__ */ new Map();
   for (const [filePath, doc] of index.documents) {
     const vec = new Float32Array(dimensions);
-
+    const nonZero = [];
     for (const [term, tf] of doc.terms) {
-      const idx = termToIdx.get(term);
-      if (idx === void 0) continue;
       const idf = index.idf.get(term) ?? 0;
       const weight = tf * idf;
-
-
+      const idx = termToIdx.get(term);
+      if (idx !== void 0) {
+        vec[idx] += weight;
+        nonZero.push(idx);
+      }
+      for (const s of getStemVariants(term)) {
+        const stemIdx = termToIdx.get(`\xA7${s}`);
+        if (stemIdx !== void 0) {
+          vec[stemIdx] += weight * 0.5;
+          nonZero.push(stemIdx);
+        }
+      }
     }
+    let norm = 0;
+    for (const i of nonZero) norm += vec[i] * vec[i];
     norm = Math.sqrt(norm);
     if (norm > 0) {
-      for (
-        vec[i] /= norm;
-      }
+      for (const i of nonZero) vec[i] /= norm;
     }
     docVectors.set(filePath, vec);
-
+    docNonZero.set(filePath, [...new Set(nonZero)]);
   }
   function queryFn(text, topK) {
    const queryTerms = tokenizeForEmbedding(text);
-    const
+    const expandedCounts = /* @__PURE__ */ new Map();
    for (const t of queryTerms) {
-
+      expandedCounts.set(t, (expandedCounts.get(t) ?? 0) + 1);
+      for (const s of getStemVariants(t)) {
+        const stemKey = `\xA7${s}`;
+        expandedCounts.set(stemKey, (expandedCounts.get(stemKey) ?? 0) + 0.5);
+      }
    }
    const queryVec = new Float32Array(dimensions);
-
-    for (const [term, count] of
+    const queryNonZero = [];
+    for (const [term, count] of expandedCounts) {
      const idx = termToIdx.get(term);
      if (idx === void 0) continue;
-      const
-      const
-      queryVec[idx] =
-
+      const rawTerm = term.startsWith("\xA7") ? term.slice(1) : term;
+      const idf = index.idf.get(rawTerm) ?? 1;
+      queryVec[idx] = count * idf;
+      queryNonZero.push(idx);
    }
+    let queryNorm = 0;
+    for (const i of queryNonZero) queryNorm += queryVec[i] * queryVec[i];
    queryNorm = Math.sqrt(queryNorm);
    if (queryNorm > 0) {
-      for (
-        queryVec[i] /= queryNorm;
-      }
+      for (const i of queryNonZero) queryVec[i] /= queryNorm;
    }
    const results = [];
+    const queryIdxSet = new Set(queryNonZero);
    for (const [filePath, docVec] of docVectors) {
      let dot = 0;
-      for (const
-
-      if (idx !== void 0) {
-        dot += queryVec[idx] * docVec[idx];
-      }
+      for (const i of queryNonZero) {
+        if (docVec[i] !== 0) dot += queryVec[i] * docVec[i];
      }
      if (dot > 0) {
        results.push({ filePath, score: dot });
```

```diff
@@ -5195,6 +5222,54 @@ function reciprocalRankFusion(bm25Results, embeddingResults, k = 60, bm25Weight
   }
   return [...scores.entries()].map(([filePath, score]) => ({ filePath, score })).sort((a, b) => b.score - a.score);
 }
+function stem2(word) {
+  if (word.length < 4) return word;
+  const rules = [
+    ["ization", 4],
+    ["isation", 4],
+    ["ation", 4],
+    ["ition", 4],
+    ["tion", 3],
+    ["sion", 3],
+    ["ment", 3],
+    ["ness", 3],
+    ["able", 3],
+    ["ible", 3],
+    ["ive", 3],
+    ["ing", 3],
+    ["ity", 3],
+    ["ous", 3],
+    ["ful", 3],
+    ["ate", 3],
+    ["ize", 3],
+    ["ise", 3],
+    ["ure", 3],
+    ["ent", 3],
+    ["ant", 3],
+    ["al", 3],
+    ["er", 3],
+    ["or", 3],
+    ["ed", 3],
+    ["ly", 3],
+    ["es", 3],
+    ["s", 3]
+  ];
+  for (const [suffix, minRemaining] of rules) {
+    if (word.endsWith(suffix) && word.length - suffix.length >= minRemaining) {
+      return word.slice(0, word.length - suffix.length);
+    }
+  }
+  return word;
+}
+function getStemVariants(word) {
+  const variants = /* @__PURE__ */ new Set();
+  variants.add(word);
+  const stripped = stem2(word);
+  if (stripped !== word && stripped.length >= 3) variants.add(stripped);
+  if (word.length >= 6) variants.add(word.slice(0, 5));
+  if (word.endsWith("e") && word.length >= 5) variants.add(word.slice(0, -1));
+  return [...variants];
+}
 function tokenizeForEmbedding(text) {
   return text.toLowerCase().replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]/g, " ").split(/\s+/).filter((t) => t.length >= 2);
 }
```

```diff
@@ -5354,7 +5429,7 @@ async function runContextPipeline(input) {
   const rerankerApproved = new Set(rerankResult.files.map((rf) => rf.filePath));
   const rerankedMatches = boostedMatches.map((m) => ({
     filePath: m.filePath,
-    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score,
+    score: rerankerApproved.has(m.filePath) ? m.score * 1.5 : m.score * 0.25,
     matchedTerms: [...m.matchedTerms]
   }));
   for (const m of rerankedMatches) {
```
package/package.json
CHANGED