cto-ai-cli 7.1.0 → 8.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +124 -56
- package/dist/cli/index.js +2018 -34
- package/dist/engine/index.d.ts +826 -3
- package/dist/engine/index.js +3078 -133
- package/dist/mcp/index.js +1978 -34
- package/package.json +1 -1
package/dist/engine/index.d.ts
CHANGED
|
@@ -43,7 +43,7 @@ interface ProjectGraph {
|
|
|
43
43
|
interface GraphEdge {
|
|
44
44
|
from: string;
|
|
45
45
|
to: string;
|
|
46
|
-
type: 'import' | 'export' | 're-export';
|
|
46
|
+
type: 'import' | 'export' | 're-export' | 'call';
|
|
47
47
|
}
|
|
48
48
|
interface HubNode {
|
|
49
49
|
relativePath: string;
|
|
@@ -272,6 +272,7 @@ declare function selectContext(input: SelectionInput): Promise<ContextSelection>
|
|
|
272
272
|
interface TfIdfIndex {
|
|
273
273
|
documents: Map<string, DocumentVector>;
|
|
274
274
|
idf: Map<string, number>;
|
|
275
|
+
docFreq: Map<string, number>;
|
|
275
276
|
avgDocLength: number;
|
|
276
277
|
totalDocs: number;
|
|
277
278
|
}
|
|
@@ -295,8 +296,10 @@ declare function buildIndex(files: {
|
|
|
295
296
|
/**
|
|
296
297
|
* Query the index with a task description.
|
|
297
298
|
* Returns files ranked by semantic relevance (BM25 scoring).
|
|
299
|
+
*
|
|
300
|
+
* @param expandSynonyms - If true, expands query terms with synonyms for better recall
|
|
298
301
|
*/
|
|
299
|
-
declare function query(index: TfIdfIndex, taskDescription: string, maxResults?: number): SemanticMatch[];
|
|
302
|
+
declare function query(index: TfIdfIndex, taskDescription: string, maxResults?: number, expandSynonyms?: boolean): SemanticMatch[];
|
|
300
303
|
/**
|
|
301
304
|
* Compute pairwise similarity between two documents in the index.
|
|
302
305
|
* Useful for finding related files (e.g., "what other files are similar to auth.ts?")
|
|
@@ -312,8 +315,53 @@ declare function tokenize(text: string): string[];
|
|
|
312
315
|
* Boost TF-IDF scores based on file path relevance to the task.
|
|
313
316
|
* This catches cases where the file content doesn't mention the task terms
|
|
314
317
|
* but the file path does (e.g., task "fix auth" → src/auth/middleware.ts).
|
|
318
|
+
*
|
|
319
|
+
* Scoring:
|
|
320
|
+
* - Directory segment match: 0.4 per term (structural relevance)
|
|
321
|
+
* - Filename match: 0.25 per term (weaker — many files share names)
|
|
322
|
+
* - Multiple directory matches add up: "cache" + "seller" in path = 0.4 + 0.4 = 0.8
|
|
315
323
|
*/
|
|
316
324
|
declare function boostByPath(matches: SemanticMatch[], allFiles: string[], taskDescription: string): SemanticMatch[];
|
|
325
|
+
declare function boostByLayer(matches: SemanticMatch[], allFiles: string[], taskDescription: string): SemanticMatch[];
|
|
326
|
+
/**
|
|
327
|
+
* Boost BM25 results by import chain proximity.
|
|
328
|
+
*
|
|
329
|
+
* Problem: In Java/TS projects, interfaces and type files have minimal content
|
|
330
|
+
* (8-20 lines). BM25 can't rank them because there aren't enough tokens.
|
|
331
|
+
* But they are IMPORTED by files that DO rank well.
|
|
332
|
+
*
|
|
333
|
+
* Solution: For each top-K BM25 result, find files it imports and files that
|
|
334
|
+
* import it. Give those a score boost proportional to the parent's score.
|
|
335
|
+
* Files imported by MULTIPLE top matches get proportionally more boost.
|
|
336
|
+
*
|
|
337
|
+
* @param matches - BM25 results (already scored)
|
|
338
|
+
* @param dependencies - Map of filePath → files it imports
|
|
339
|
+
* @param topK - How many top matches to expand (default: 10)
|
|
340
|
+
* @param boostFactor - How much of the parent's score to transfer (default: 0.4)
|
|
341
|
+
*/
|
|
342
|
+
declare function boostByImports(matches: SemanticMatch[], dependencies: Map<string, string[]>, topK?: number, boostFactor?: number): SemanticMatch[];
|
|
343
|
+
/**
|
|
344
|
+
* Reciprocal Rank Fusion — state-of-the-art multi-signal ranking.
|
|
345
|
+
* Used by Elasticsearch 8, Pinecone, Cohere.
|
|
346
|
+
*
|
|
347
|
+
* Problem: Additive boosting (BM25 + path + import) saturates at 1.0.
|
|
348
|
+
* When multiple boosts stack, all top files get score=1.0 and ranking
|
|
349
|
+
* becomes arbitrary. This is why irrelevant files appear in top-K.
|
|
350
|
+
*
|
|
351
|
+
* Solution: Rank files independently by each signal, then fuse rankings.
|
|
352
|
+
* RRF_score(d) = Σ weight_i / (k + rank_i(d))
|
|
353
|
+
* where k=60 (smoothing constant, standard in literature).
|
|
354
|
+
*
|
|
355
|
+
* Files that rank well across MULTIPLE signals naturally float to the top.
|
|
356
|
+
* No saturation, no arbitrary caps.
|
|
357
|
+
*
|
|
358
|
+
* @param bm25Matches - Raw BM25 results
|
|
359
|
+
* @param allFiles - All file paths in the project
|
|
360
|
+
* @param taskDescription - The query/task
|
|
361
|
+
* @param dependencies - Import graph
|
|
362
|
+
* @param k - RRF smoothing constant (default: 60, standard)
|
|
363
|
+
*/
|
|
364
|
+
declare function reciprocalRankFusion$1(bm25Matches: SemanticMatch[], allFiles: string[], taskDescription: string, dependencies: Map<string, string[]>, k?: number): SemanticMatch[];
|
|
317
365
|
|
|
318
366
|
/**
|
|
319
367
|
* Persistent TF-IDF Index Cache
|
|
@@ -391,6 +439,54 @@ declare function getCacheInfo(projectPath: string): {
|
|
|
391
439
|
builtAt: string | null;
|
|
392
440
|
};
|
|
393
441
|
|
|
442
|
+
/**
|
|
443
|
+
* Query Intent Parsing
|
|
444
|
+
*
|
|
445
|
+
* Before searching, parse the task description into a structured intent:
|
|
446
|
+
* - action: what the developer wants to do (fix, add, refactor, trace)
|
|
447
|
+
* - entities: domain objects mentioned (cache, user, order, chart)
|
|
448
|
+
* - operations: specific operations (delete, create, update, validate)
|
|
449
|
+
* - layers: architectural layers mentioned (controller, service, repository)
|
|
450
|
+
* - qualifiers: narrowing terms (on KVS, in admin, for seller)
|
|
451
|
+
*
|
|
452
|
+
* This structured intent enables:
|
|
453
|
+
* 1. Weighted search: entities get higher BM25 weight than qualifiers
|
|
454
|
+
* 2. Layer-aware expansion: if service layer mentioned, also search use cases
|
|
455
|
+
* 3. Better multi-hop: expand through specific layers, not randomly
|
|
456
|
+
* 4. Smarter chunk selection: prioritize chunks matching entities + operations
|
|
457
|
+
*/
|
|
458
|
+
interface QueryIntent {
|
|
459
|
+
original: string;
|
|
460
|
+
action: ActionType;
|
|
461
|
+
entities: string[];
|
|
462
|
+
operations: string[];
|
|
463
|
+
layers: ArchLayer[];
|
|
464
|
+
qualifiers: string[];
|
|
465
|
+
confidence: number;
|
|
466
|
+
}
|
|
467
|
+
type ActionType = 'fix' | 'add' | 'refactor' | 'trace' | 'test' | 'docs' | 'remove' | 'optimize' | 'unknown';
|
|
468
|
+
type ArchLayer = 'endpoint' | 'usecase' | 'service' | 'repository' | 'cache' | 'client' | 'model' | 'config' | 'queue' | 'middleware';
|
|
469
|
+
/**
|
|
470
|
+
* Parse a task description into a structured query intent.
|
|
471
|
+
*
|
|
472
|
+
* @param task - Raw task description from the developer
|
|
473
|
+
* @returns Structured QueryIntent with action, entities, operations, layers
|
|
474
|
+
*/
|
|
475
|
+
declare function parseQueryIntent(task: string): QueryIntent;
|
|
476
|
+
/**
|
|
477
|
+
* Build a weighted search query from the parsed intent.
|
|
478
|
+
* Entities get highest weight, operations next, qualifiers lowest.
|
|
479
|
+
*
|
|
480
|
+
* @param intent - Parsed query intent
|
|
481
|
+
* @returns Weighted query string with important terms repeated
|
|
482
|
+
*/
|
|
483
|
+
declare function buildWeightedQuery(intent: QueryIntent): string;
|
|
484
|
+
/**
|
|
485
|
+
* Get suggested architectural layers to search based on the intent.
|
|
486
|
+
* If the developer mentions "service", we should also look at use cases and repositories.
|
|
487
|
+
*/
|
|
488
|
+
declare function expandLayers(layers: ArchLayer[]): ArchLayer[];
|
|
489
|
+
|
|
394
490
|
/**
|
|
395
491
|
* Usage Learner — Gets smarter with every use.
|
|
396
492
|
*
|
|
@@ -570,6 +666,8 @@ interface ContextPipelineResult {
|
|
|
570
666
|
fileContentMap: Map<string, string>;
|
|
571
667
|
semanticMap: Map<string, SemanticMatch>;
|
|
572
668
|
learnerMap: Map<string, LearnerBoost>;
|
|
669
|
+
/** Parsed query intent for downstream chunk selection */
|
|
670
|
+
queryIntent: QueryIntent;
|
|
573
671
|
/** Cross-repo results (only present if siblingRepos were provided) */
|
|
574
672
|
multiRepo?: MultiRepoResult;
|
|
575
673
|
/** Index cache stats (how many files were cached vs rebuilt) */
|
|
@@ -592,6 +690,61 @@ declare function optimizeBudget(files: AnalyzedFile[], budget: number): Promise<
|
|
|
592
690
|
declare function pruneFile(file: AnalyzedFile, level: PruneLevel): Promise<PrunedContent>;
|
|
593
691
|
declare function pruneFiles(files: AnalyzedFile[], levelFn: (file: AnalyzedFile) => PruneLevel): Promise<PrunedContent[]>;
|
|
594
692
|
|
|
693
|
+
/**
|
|
694
|
+
* Synonym Expansion for Query Enhancement
|
|
695
|
+
*
|
|
696
|
+
* Zero dependencies. Zero ML. Pure lookup tables.
|
|
697
|
+
*
|
|
698
|
+
* Expands query terms with domain-specific synonyms to improve recall
|
|
699
|
+
* without requiring exact textual overlap. This bridges the gap between
|
|
700
|
+
* BM25's lexical matching and full semantic embeddings.
|
|
701
|
+
*
|
|
702
|
+
* Example:
|
|
703
|
+
* query("database") → ["database", "db", "repository", "orm", "sql", "prisma"]
|
|
704
|
+
* query("auth") → ["auth", "authentication", "login", "session", "jwt", "token"]
|
|
705
|
+
*
|
|
706
|
+
* Why this works:
|
|
707
|
+
* - Captures common abbreviations (db, auth, repo)
|
|
708
|
+
* - Captures technology-specific terms (prisma, jwt, redis)
|
|
709
|
+
* - Captures conceptual relationships (cache → redis, memcached)
|
|
710
|
+
* - Deterministic, predictable, maintainable
|
|
711
|
+
*
|
|
712
|
+
* Trade-offs vs embeddings:
|
|
713
|
+
* + Zero dependencies, zero latency, zero cost
|
|
714
|
+
* + Deterministic (same query → same expansion)
|
|
715
|
+
* + Easy to debug and extend
|
|
716
|
+
* - Limited to manually curated vocabulary
|
|
717
|
+
* - Doesn't capture novel relationships
|
|
718
|
+
* - Estimated +5-8% recall vs embeddings' +10-15%
|
|
719
|
+
*/
|
|
720
|
+
interface SynonymExpansion {
|
|
721
|
+
original: string;
|
|
722
|
+
expanded: string[];
|
|
723
|
+
}
|
|
724
|
+
/**
|
|
725
|
+
* Expand a single term with its synonyms.
|
|
726
|
+
* Returns the original term plus all related terms.
|
|
727
|
+
*/
|
|
728
|
+
declare function expandTerm(term: string): string[];
|
|
729
|
+
/**
|
|
730
|
+
* Expand all terms in a query.
|
|
731
|
+
* Deduplicates the result.
|
|
732
|
+
*/
|
|
733
|
+
declare function expandQuery(query: string): string[];
|
|
734
|
+
/**
|
|
735
|
+
* Get expansion details for debugging/telemetry.
|
|
736
|
+
*/
|
|
737
|
+
declare function getExpansionDetails(query: string): SynonymExpansion[];
|
|
738
|
+
/**
|
|
739
|
+
* Get statistics about the synonym dictionary.
|
|
740
|
+
*/
|
|
741
|
+
declare function getSynonymStats(): {
|
|
742
|
+
canonicalTerms: number;
|
|
743
|
+
totalSynonyms: number;
|
|
744
|
+
avgSynonymsPerTerm: number;
|
|
745
|
+
bidirectionalEntries: number;
|
|
746
|
+
};
|
|
747
|
+
|
|
595
748
|
/**
|
|
596
749
|
* Closed-Loop A/B Testing Engine
|
|
597
750
|
*
|
|
@@ -945,6 +1098,676 @@ declare function estimateTokens(content: string, sizeInBytes: number, method?: '
|
|
|
945
1098
|
declare function estimateFileTokens(filePath: string, method?: 'chars4' | 'tiktoken'): Promise<number>;
|
|
946
1099
|
declare function freeEncoder(): void;
|
|
947
1100
|
|
|
1101
|
+
/**
|
|
1102
|
+
* Corpus-Learned Semantic Expansion
|
|
1103
|
+
*
|
|
1104
|
+
* Zero dependencies. Pure math.
|
|
1105
|
+
*
|
|
1106
|
+
* Problem: BM25 is lexical — "fix authentication" won't match a file about
|
|
1107
|
+
* "OAuth2 token validator" because the terms don't overlap.
|
|
1108
|
+
*
|
|
1109
|
+
* Solution: Learn term associations from the codebase itself using
|
|
1110
|
+
* Pointwise Mutual Information (PMI). If "auth" and "oauth" frequently
|
|
1111
|
+
* co-occur in the same files, they're semantically related.
|
|
1112
|
+
*
|
|
1113
|
+
* How it works:
|
|
1114
|
+
* 1. Build a co-occurrence matrix from the TF-IDF index
|
|
1115
|
+
* 2. Compute PMI for each term pair: PMI(a,b) = log(P(a,b) / (P(a)·P(b)))
|
|
1116
|
+
* 3. For each query term, find the top-K associated terms
|
|
1117
|
+
* 4. Expand the query with these terms (weighted by PMI strength)
|
|
1118
|
+
*
|
|
1119
|
+
* This bridges vocabulary gaps like:
|
|
1120
|
+
* - "auth" → "oauth", "token", "jwt", "credential"
|
|
1121
|
+
* - "cache" → "redis", "ttl", "invalidat", "evict"
|
|
1122
|
+
* - "order" → "purchas", "cart", "checkout"
|
|
1123
|
+
*
|
|
1124
|
+
* Performance: O(|query| × |vocab|) per query — fast because we only
|
|
1125
|
+
* compute PMI for query terms, not the full matrix.
|
|
1126
|
+
*
|
|
1127
|
+
* Also provides dense document embeddings via Random Indexing (Kanerva 2000):
|
|
1128
|
+
* - Each term gets a random sparse vector
|
|
1129
|
+
* - Document = sum of TF-IDF-weighted term vectors
|
|
1130
|
+
* - Cosine similarity captures latent semantic relationships
|
|
1131
|
+
* - Proven to approximate SVD/LSA without matrix decomposition
|
|
1132
|
+
*/
|
|
1133
|
+
|
|
1134
|
+
interface SemanticExpansion {
|
|
1135
|
+
/** Original query terms */
|
|
1136
|
+
original: string[];
|
|
1137
|
+
/** Expanded terms with weights (includes originals at weight 1.0) */
|
|
1138
|
+
expanded: Map<string, number>;
|
|
1139
|
+
/** Which expansions were added and why */
|
|
1140
|
+
expansions: {
|
|
1141
|
+
term: string;
|
|
1142
|
+
source: string;
|
|
1143
|
+
pmi: number;
|
|
1144
|
+
weight: number;
|
|
1145
|
+
}[];
|
|
1146
|
+
}
|
|
1147
|
+
interface CorpusEmbeddings {
|
|
1148
|
+
/** Document embeddings: filePath → dense vector */
|
|
1149
|
+
documents: Map<string, Float64Array>;
|
|
1150
|
+
/** Embedding dimension */
|
|
1151
|
+
dimension: number;
|
|
1152
|
+
/** Term → random index vector (sparse representation) */
|
|
1153
|
+
termVectors: Map<string, {
|
|
1154
|
+
indices: number[];
|
|
1155
|
+
signs: number[];
|
|
1156
|
+
}>;
|
|
1157
|
+
}
|
|
1158
|
+
/**
|
|
1159
|
+
* Expand query terms using corpus-learned PMI associations.
|
|
1160
|
+
*
|
|
1161
|
+
* For each query term, finds terms that co-occur with it significantly
|
|
1162
|
+
* more than chance would predict. These are added to the query with
|
|
1163
|
+
* reduced weight.
|
|
1164
|
+
*
|
|
1165
|
+
* @param index - TF-IDF index (provides term frequencies and doc frequencies)
|
|
1166
|
+
* @param queryTerms - Tokenized query terms
|
|
1167
|
+
* @param topK - Max expansions per query term (default: 3)
|
|
1168
|
+
* @param minPmi - Minimum PMI score to be considered associated (default: 1.0)
|
|
1169
|
+
* @param expansionWeight - Weight multiplier for expanded terms (default: 0.5)
|
|
1170
|
+
*/
|
|
1171
|
+
declare function expandQueryWithPMI(index: TfIdfIndex, queryTerms: string[], topK?: number, minPmi?: number, expansionWeight?: number): SemanticExpansion;
|
|
1172
|
+
/**
|
|
1173
|
+
* Build dense document embeddings using Random Indexing.
|
|
1174
|
+
*
|
|
1175
|
+
* Random Indexing (Kanerva 2000, Sahlgren 2005) is a lightweight alternative
|
|
1176
|
+
* to LSA/SVD that builds document embeddings incrementally:
|
|
1177
|
+
*
|
|
1178
|
+
* 1. Assign each vocabulary term a random sparse vector (index vector)
|
|
1179
|
+
* with mostly zeros and a few +1/-1 entries
|
|
1180
|
+
* 2. A document's embedding = sum of TF-IDF-weighted index vectors of its terms
|
|
1181
|
+
* 3. Query embedding = sum of index vectors of query terms
|
|
1182
|
+
* 4. Similarity = cosine(query_embedding, doc_embedding)
|
|
1183
|
+
*
|
|
1184
|
+
* Proven to approximate SVD/LSA (Achlioptas 2003, Johnson-Lindenstrauss lemma).
|
|
1185
|
+
* O(docs × terms × nnz) where nnz is the sparsity of index vectors (~6).
|
|
1186
|
+
*
|
|
1187
|
+
* @param index - TF-IDF index
|
|
1188
|
+
* @param dimension - Embedding dimension (default: 128)
|
|
1189
|
+
* @param nnz - Non-zero entries per index vector (default: 6)
|
|
1190
|
+
* @param seed - Random seed for reproducibility (default: 42)
|
|
1191
|
+
*/
|
|
1192
|
+
declare function buildCorpusEmbeddings(index: TfIdfIndex, dimension?: number, nnz?: number, seed?: number): CorpusEmbeddings;
|
|
1193
|
+
/**
|
|
1194
|
+
* Compute embedding for a query string.
|
|
1195
|
+
*/
|
|
1196
|
+
declare function embedQuery(query: string, embeddings: CorpusEmbeddings): Float64Array;
|
|
1197
|
+
/**
|
|
1198
|
+
* Rank all documents by embedding similarity to a query.
|
|
1199
|
+
* Returns sorted list of (filePath, similarity) pairs.
|
|
1200
|
+
*/
|
|
1201
|
+
declare function queryByEmbedding(queryVec: Float64Array, embeddings: CorpusEmbeddings, maxResults?: number): {
|
|
1202
|
+
filePath: string;
|
|
1203
|
+
similarity: number;
|
|
1204
|
+
}[];
|
|
1205
|
+
|
|
1206
|
+
/**
|
|
1207
|
+
* AST-Aware Tokenizer for Java, Python, Go
|
|
1208
|
+
*
|
|
1209
|
+
* Zero dependencies. Regex-based structural extraction.
|
|
1210
|
+
*
|
|
1211
|
+
* Problem: The generic tokenizer splits on whitespace and camelCase but doesn't
|
|
1212
|
+
* understand language constructs. A Java file with `@Repository` annotation and
|
|
1213
|
+
* `implements CacheService` has critical structural information that BM25 misses.
|
|
1214
|
+
*
|
|
1215
|
+
* Solution: Extract high-value structural tokens from source code:
|
|
1216
|
+
* - Class/interface/enum names (weighted 3×)
|
|
1217
|
+
* - Method/function names (weighted 2×)
|
|
1218
|
+
* - Annotations (@Repository, @Service, @Controller) as layer indicators
|
|
1219
|
+
* - Inheritance (extends/implements) as relationship signals
|
|
1220
|
+
* - Package/module declarations as structural context
|
|
1221
|
+
*
|
|
1222
|
+
* These tokens are prepended to the regular content, boosting their TF-IDF weight.
|
|
1223
|
+
* The result: files with structurally relevant names rank higher even if their
|
|
1224
|
+
* content is minimal (e.g., Java interfaces).
|
|
1225
|
+
*/
|
|
1226
|
+
interface StructuralTokens {
|
|
1227
|
+
/** Class/interface/enum names found */
|
|
1228
|
+
classNames: string[];
|
|
1229
|
+
/** Method/function names found */
|
|
1230
|
+
methodNames: string[];
|
|
1231
|
+
/** Annotations found (without @) */
|
|
1232
|
+
annotations: string[];
|
|
1233
|
+
/** Parent classes/interfaces (extends/implements) */
|
|
1234
|
+
parents: string[];
|
|
1235
|
+
/** Package/module name */
|
|
1236
|
+
packageName: string | null;
|
|
1237
|
+
/** Detected language */
|
|
1238
|
+
language: 'java' | 'python' | 'go' | 'typescript' | 'unknown';
|
|
1239
|
+
}
|
|
1240
|
+
/**
|
|
1241
|
+
* Extract structural tokens from source code.
|
|
1242
|
+
* Language is detected from file extension or content patterns.
|
|
1243
|
+
*/
|
|
1244
|
+
declare function extractStructuralTokens(content: string, filePath: string): StructuralTokens;
|
|
1245
|
+
/**
|
|
1246
|
+
* Augment file content with structural tokens for better BM25 indexing.
|
|
1247
|
+
*
|
|
1248
|
+
* Prepends high-value structural information to the content:
|
|
1249
|
+
* - Class names repeated 3× (boosted weight)
|
|
1250
|
+
* - Method names repeated 2×
|
|
1251
|
+
* - Annotation-derived layer terms
|
|
1252
|
+
* - Parent class/interface names
|
|
1253
|
+
*
|
|
1254
|
+
* This causes BM25 to rank files higher when the query matches their
|
|
1255
|
+
* structural identity, not just their content.
|
|
1256
|
+
*/
|
|
1257
|
+
declare function augmentContentWithStructure(content: string, filePath: string): string;
|
|
1258
|
+
/**
|
|
1259
|
+
* Get structural summary for a file (for debugging/telemetry).
|
|
1260
|
+
*/
|
|
1261
|
+
declare function getStructuralSummary(content: string, filePath: string): string;
|
|
1262
|
+
|
|
1263
|
+
/**
|
|
1264
|
+
* Feedback-Driven Weight Tuner
|
|
1265
|
+
*
|
|
1266
|
+
* Zero dependencies. Bayesian optimization of signal weights.
|
|
1267
|
+
*
|
|
1268
|
+
* Problem: RRF and boost weights are static (BM25=0.40, path=0.25, import=0.20,
|
|
1269
|
+
* className=0.15). Different codebases have different characteristics — a Java
|
|
1270
|
+
* monolith benefits more from import boost, a microservice from path boost.
|
|
1271
|
+
*
|
|
1272
|
+
* Solution: Record which files the developer actually uses after context selection.
|
|
1273
|
+
* Use this feedback to optimize weights per project using Thompson Sampling:
|
|
1274
|
+
*
|
|
1275
|
+
* 1. Each weight has a Beta distribution prior: Beta(α, β)
|
|
1276
|
+
* 2. When a file ranked high by signal X is accepted → α_X++
|
|
1277
|
+
* 3. When a file ranked high by signal X is rejected → β_X++
|
|
1278
|
+
* 4. Sample from each Beta to get optimistic weights
|
|
1279
|
+
* 5. Normalize to sum to 1.0
|
|
1280
|
+
*
|
|
1281
|
+
* This is the same algorithm used by recommendation systems at Netflix, Spotify.
|
|
1282
|
+
* Converges to optimal weights in ~20-50 selections.
|
|
1283
|
+
*
|
|
1284
|
+
* Storage: .cto/weight-tuner.json — <2KB
|
|
1285
|
+
* Integrates with existing A/B testing infrastructure.
|
|
1286
|
+
*/
|
|
1287
|
+
interface SignalWeight {
|
|
1288
|
+
/** Signal name */
|
|
1289
|
+
name: string;
|
|
1290
|
+
/** Default weight (fallback) */
|
|
1291
|
+
defaultWeight: number;
|
|
1292
|
+
/** Bayesian prior: alpha (success count) */
|
|
1293
|
+
alpha: number;
|
|
1294
|
+
/** Bayesian prior: beta (failure count) */
|
|
1295
|
+
beta: number;
|
|
1296
|
+
}
|
|
1297
|
+
interface WeightTunerModel {
|
|
1298
|
+
version: 1;
|
|
1299
|
+
updatedAt: string;
|
|
1300
|
+
signals: SignalWeight[];
|
|
1301
|
+
totalFeedback: number;
|
|
1302
|
+
/** History of recent weight snapshots for trend analysis */
|
|
1303
|
+
history: {
|
|
1304
|
+
timestamp: string;
|
|
1305
|
+
weights: Record<string, number>;
|
|
1306
|
+
}[];
|
|
1307
|
+
}
|
|
1308
|
+
interface TunedWeights {
|
|
1309
|
+
/** Optimized weights (sum to 1.0) */
|
|
1310
|
+
weights: Record<string, number>;
|
|
1311
|
+
/** Confidence: 0 = pure default, 1 = well-tuned (>50 observations) */
|
|
1312
|
+
confidence: number;
|
|
1313
|
+
/** Whether we're using learned weights or defaults */
|
|
1314
|
+
source: 'learned' | 'default';
|
|
1315
|
+
}
|
|
1316
|
+
/**
|
|
1317
|
+
* Load the weight tuner model from disk.
|
|
1318
|
+
* Returns fresh model with default priors if none exists.
|
|
1319
|
+
*/
|
|
1320
|
+
declare function loadWeightTuner(projectPath: string): WeightTunerModel;
|
|
1321
|
+
/**
|
|
1322
|
+
* Save the weight tuner model to disk.
|
|
1323
|
+
*/
|
|
1324
|
+
declare function saveWeightTuner(projectPath: string, model: WeightTunerModel): void;
|
|
1325
|
+
/**
|
|
1326
|
+
* Create a fresh model with uniform priors.
|
|
1327
|
+
* Alpha=1, Beta=1 is the non-informative prior (uniform distribution).
|
|
1328
|
+
*/
|
|
1329
|
+
declare function createFreshModel(): WeightTunerModel;
|
|
1330
|
+
/**
|
|
1331
|
+
* Record feedback: which signal contributed to each accepted/rejected file.
|
|
1332
|
+
*
|
|
1333
|
+
* @param model - Current tuner model
|
|
1334
|
+
* @param feedback - Array of { signal, accepted } entries: the signal name and whether the file was accepted
|
|
1335
|
+
*/
|
|
1336
|
+
declare function recordFeedback(model: WeightTunerModel, feedback: {
|
|
1337
|
+
signal: string;
|
|
1338
|
+
accepted: boolean;
|
|
1339
|
+
}[]): WeightTunerModel;
|
|
1340
|
+
/**
|
|
1341
|
+
* Get optimized weights using Thompson Sampling.
|
|
1342
|
+
*
|
|
1343
|
+
* For each signal, sample from its Beta(α, β) distribution.
|
|
1344
|
+
* The mean of Beta(α, β) is α/(α+β), which converges to the
|
|
1345
|
+
* true acceptance rate as we collect more feedback.
|
|
1346
|
+
*
|
|
1347
|
+
* With few observations, falls back to default weights.
|
|
1348
|
+
*/
|
|
1349
|
+
declare function getOptimizedWeights(model: WeightTunerModel): TunedWeights;
|
|
1350
|
+
/**
|
|
1351
|
+
* Determine which signal contributed most to a file's ranking.
|
|
1352
|
+
* Used to attribute feedback to the right signal.
|
|
1353
|
+
*
|
|
1354
|
+
* The file being evaluated is implicit: callers pass only that file's per-signal ranks (there is no filePath parameter).
|
|
1355
|
+
* @param signalRanks - Map of signal name → rank of this file in that signal's ranking
|
|
1356
|
+
* @returns The signal name that ranked this file highest
|
|
1357
|
+
*/
|
|
1358
|
+
declare function attributeToSignal(signalRanks: Record<string, number>): string;
|
|
1359
|
+
/**
|
|
1360
|
+
* Render weight tuner status for CLI output.
|
|
1361
|
+
*/
|
|
1362
|
+
declare function renderWeightStatus(model: WeightTunerModel): string;
|
|
1363
|
+
|
|
1364
|
+
/**
|
|
1365
|
+
* Cross-File Call Graph Analysis
|
|
1366
|
+
*
|
|
1367
|
+
* Traces method/function calls across files to build execution paths.
|
|
1368
|
+
* Goes beyond import edges: if endpoint A calls service B.method() which
|
|
1369
|
+
* calls repository C.query(), we produce edges A→B and B→C with type 'call'.
|
|
1370
|
+
*
|
|
1371
|
+
* This is the signal that turns "fix cache retrieval" from matching random
|
|
1372
|
+
* files that mention "cache" into tracing the actual execution path:
|
|
1373
|
+
* controller → use case → cache repository → cache implementation.
|
|
1374
|
+
*
|
|
1375
|
+
* Approach: lightweight regex-based static analysis per language.
|
|
1376
|
+
* No AST parser dependency — works on raw file content.
|
|
1377
|
+
*
|
|
1378
|
+
* Supported: Java, TypeScript/JavaScript, Python, Go
|
|
1379
|
+
*/
|
|
1380
|
+
|
|
1381
|
+
interface MethodDefinition {
|
|
1382
|
+
name: string;
|
|
1383
|
+
className?: string;
|
|
1384
|
+
filePath: string;
|
|
1385
|
+
isExported: boolean;
|
|
1386
|
+
}
|
|
1387
|
+
interface MethodCall {
|
|
1388
|
+
callerFile: string;
|
|
1389
|
+
receiverName: string;
|
|
1390
|
+
methodName: string;
|
|
1391
|
+
}
|
|
1392
|
+
interface CallGraphResult {
|
|
1393
|
+
definitions: MethodDefinition[];
|
|
1394
|
+
calls: MethodCall[];
|
|
1395
|
+
edges: GraphEdge[];
|
|
1396
|
+
}
|
|
1397
|
+
/**
|
|
1398
|
+
* Build a cross-file call graph from file contents.
|
|
1399
|
+
*
|
|
1400
|
+
* Returns method definitions, method calls, and resolved call edges.
|
|
1401
|
+
* Call edges connect the file making the call to the file defining the method.
|
|
1402
|
+
*
|
|
1403
|
+
* @param files - Array of {relativePath, content} pairs
|
|
1404
|
+
* @returns CallGraphResult with definitions, calls, and resolved edges
|
|
1405
|
+
*/
|
|
1406
|
+
declare function buildCallGraph(files: {
|
|
1407
|
+
relativePath: string;
|
|
1408
|
+
content: string;
|
|
1409
|
+
}[]): CallGraphResult;
|
|
1410
|
+
/**
|
|
1411
|
+
* Boost BM25 results using call graph edges.
|
|
1412
|
+
*
|
|
1413
|
+
* When a file ranks well in BM25, files it calls or is called by
|
|
1414
|
+
* are likely relevant to the same task. This traces execution paths
|
|
1415
|
+
* that import-only analysis misses.
|
|
1416
|
+
*
|
|
1417
|
+
* @param matches - Current ranked matches
|
|
1418
|
+
* @param callEdges - Call graph edges (type 'call')
|
|
1419
|
+
* @param topK - How many top matches to expand (default: 10)
|
|
1420
|
+
* @param boostFactor - Score boost for called/calling files (default: 0.3)
|
|
1421
|
+
*/
|
|
1422
|
+
declare function boostByCallGraph(matches: {
|
|
1423
|
+
filePath: string;
|
|
1424
|
+
score: number;
|
|
1425
|
+
matchedTerms: string[];
|
|
1426
|
+
}[], callEdges: GraphEdge[], topK?: number, boostFactor?: number): {
|
|
1427
|
+
filePath: string;
|
|
1428
|
+
score: number;
|
|
1429
|
+
matchedTerms: string[];
|
|
1430
|
+
}[];
|
|
1431
|
+
|
|
1432
|
+
/**
|
|
1433
|
+
* Git-Aware Relevance
|
|
1434
|
+
*
|
|
1435
|
+
* Files that are frequently modified together are likely related.
|
|
1436
|
+
* This is the signal that no competitor has — it captures implicit
|
|
1437
|
+
* coupling that import analysis and call graphs miss.
|
|
1438
|
+
*
|
|
1439
|
+
* Examples:
|
|
1440
|
+
* - Controller + its DTO changed together 90% of the time
|
|
1441
|
+
* - Service + its test changed together 80% of the time
|
|
1442
|
+
* - Config + migration changed together in 3 out of 4 commits
|
|
1443
|
+
*
|
|
1444
|
+
* Approach:
|
|
1445
|
+
* 1. Run `git log --name-only` to extract co-change history
|
|
1446
|
+
* 2. Build a co-change matrix: file pairs → co-commit count
|
|
1447
|
+
* 3. Normalize to Jaccard similarity: co(A,B) / (commits(A) + commits(B) - co(A,B))
|
|
1448
|
+
* 4. When file A is selected, boost files with high co-change similarity
|
|
1449
|
+
*
|
|
1450
|
+
* Performance: O(C × F²) where C = commits, F = avg files per commit.
|
|
1451
|
+
* Capped at 500 recent commits to keep it fast.
|
|
1452
|
+
*/
|
|
1453
|
+
interface CoChangeEntry {
|
|
1454
|
+
fileA: string;
|
|
1455
|
+
fileB: string;
|
|
1456
|
+
coCommits: number;
|
|
1457
|
+
similarity: number;
|
|
1458
|
+
}
|
|
1459
|
+
interface CoChangeMatrix {
|
|
1460
|
+
entries: Map<string, CoChangeEntry[]>;
|
|
1461
|
+
fileCommitCounts: Map<string, number>;
|
|
1462
|
+
totalCommits: number;
|
|
1463
|
+
}
|
|
1464
|
+
/**
|
|
1465
|
+
* Build a co-change matrix from git history.
|
|
1466
|
+
*
|
|
1467
|
+
* @param projectPath - Absolute path to the git repository
|
|
1468
|
+
* @param maxCommits - Max commits to analyze (default: 500)
|
|
1469
|
+
* @param minCoChanges - Minimum co-changes to include a pair (default: 2)
|
|
1470
|
+
* @returns CoChangeMatrix with file pair similarities
|
|
1471
|
+
*/
|
|
1472
|
+
declare function buildCoChangeMatrix(projectPath: string, maxCommits?: number, minCoChanges?: number): CoChangeMatrix;
|
|
1473
|
+
/**
|
|
1474
|
+
* Boost BM25 results using git co-change history.
|
|
1475
|
+
*
|
|
1476
|
+
* When file A ranks well and file B was frequently co-changed with A,
|
|
1477
|
+
* B gets a score boost proportional to A's score × co-change similarity.
|
|
1478
|
+
*
|
|
1479
|
+
* @param matches - Current ranked matches
|
|
1480
|
+
* @param coChangeMatrix - Pre-built co-change matrix
|
|
1481
|
+
* @param topK - How many top matches to expand (default: 10)
|
|
1482
|
+
* @param boostFactor - Max boost multiplier (default: 0.25)
|
|
1483
|
+
* @param minSimilarity - Min Jaccard similarity to apply boost (default: 0.15)
|
|
1484
|
+
*/
|
|
1485
|
+
declare function boostByGitCoChange(matches: {
|
|
1486
|
+
filePath: string;
|
|
1487
|
+
score: number;
|
|
1488
|
+
matchedTerms: string[];
|
|
1489
|
+
}[], coChangeMatrix: CoChangeMatrix, topK?: number, boostFactor?: number, minSimilarity?: number): {
|
|
1490
|
+
filePath: string;
|
|
1491
|
+
score: number;
|
|
1492
|
+
matchedTerms: string[];
|
|
1493
|
+
}[];
|
|
1494
|
+
/**
|
|
1495
|
+
* Get recently modified files from git log.
|
|
1496
|
+
* Files modified more recently are more likely relevant to active work.
|
|
1497
|
+
*
|
|
1498
|
+
* @param projectPath - Absolute path to the git repository
|
|
1499
|
+
* @param days - How many days back to look (default: 30)
|
|
1500
|
+
* @returns Map of filePath → recency score (1.0 = today, decays with age)
|
|
1501
|
+
*/
|
|
1502
|
+
declare function getGitRecency(projectPath: string, days?: number): Map<string, number>;
|
|
1503
|
+
|
|
1504
|
+
/**
|
|
1505
|
+
* Multi-Hop Reasoning for Enterprise Queries
|
|
1506
|
+
*
|
|
1507
|
+
* Problem: "fix the seller info cache invalidation on KVS delete"
|
|
1508
|
+
* Required chain:
|
|
1509
|
+
* 1. Find delete KVS endpoint (BM25 matches "delete", "KVS")
|
|
1510
|
+
* 2. Find what it calls (use case → dependency graph)
|
|
1511
|
+
* 3. Find what the use case invalidates (cache repo → call graph)
|
|
1512
|
+
* 4. Find the cache implementation
|
|
1513
|
+
*
|
|
1514
|
+
* Current system finds steps 1 and 3 independently.
|
|
1515
|
+
* Multi-hop traces the chain and finds ALL 4.
|
|
1516
|
+
*
|
|
1517
|
+
* Algorithm: Iterative BM25 with dependency/call expansion
|
|
1518
|
+
* Hop 0: BM25(original query) → top-K files
|
|
1519
|
+
* Hop 1: For each top file, find deps + callees → re-query with expanded terms
|
|
1520
|
+
* Hop 2: Repeat with score decay
|
|
1521
|
+
* Aggregate: Combine scores across hops with exponential decay
|
|
1522
|
+
*
|
|
1523
|
+
* Max 3 hops. Each hop expands through both import deps AND call graph edges.
|
|
1524
|
+
*/
|
|
1525
|
+
|
|
1526
|
+
/**
 * Tuning options for multiHopQuery, which accepts them as Partial<MultiHopConfig>.
 * Field meanings are read off the section header and the field names —
 * NOTE(review): confirm against the implementation:
 *  - maxHops: hop limit (the section header states "Max 3 hops").
 *  - topKPerHop: presumably how many top-ranked files seed each hop.
 *  - decayFactor: presumably the per-hop exponential score decay.
 *  - minScoreThreshold: presumably a score cutoff; its exact use is not visible here.
 */
interface MultiHopConfig {
|
|
1527
|
+
maxHops: number;
|
|
1528
|
+
topKPerHop: number;
|
|
1529
|
+
decayFactor: number;
|
|
1530
|
+
minScoreThreshold: number;
|
|
1531
|
+
}
|
|
1532
|
+
interface MultiHopResult {
|
|
1533
|
+
matches: SemanticMatch[];
|
|
1534
|
+
hops: HopDetail[];
|
|
1535
|
+
totalFilesExplored: number;
|
|
1536
|
+
}
|
|
1537
|
+
interface HopDetail {
|
|
1538
|
+
hop: number;
|
|
1539
|
+
seedFiles: string[];
|
|
1540
|
+
newFiles: string[];
|
|
1541
|
+
expandedTerms: string[];
|
|
1542
|
+
}
|
|
1543
|
+
/**
|
|
1544
|
+
* Execute a multi-hop reasoning query.
|
|
1545
|
+
*
|
|
1546
|
+
* Starting from BM25 results, iteratively expands through the dependency
|
|
1547
|
+
* and call graphs, extracting terms from discovered files to broaden
|
|
1548
|
+
* the search while maintaining relevance through score decay.
|
|
1549
|
+
*
|
|
1550
|
+
* @param index - TF-IDF index for BM25 queries
|
|
1551
|
+
* @param task - Original task description
|
|
1552
|
+
* @param deps - Import dependency map (file → files it imports)
|
|
1553
|
+
* @param callEdges - Call graph edges (from → to with type 'call')
|
|
1554
|
+
* @param fileContents - Map of filePath → file content (for term extraction)
|
|
1555
|
+
* @param config - Multi-hop configuration
|
|
1556
|
+
*/
|
|
1557
|
+
declare function multiHopQuery(index: TfIdfIndex, task: string, deps: Map<string, string[]>, callEdges: {
|
|
1558
|
+
from: string;
|
|
1559
|
+
to: string;
|
|
1560
|
+
}[], fileContents: Map<string, string>, config?: Partial<MultiHopConfig>): MultiHopResult;
|
|
1561
|
+
|
|
1562
|
+
/**
|
|
1563
|
+
* IDE Telemetry — Incremental Learning from File Opens
|
|
1564
|
+
*
|
|
1565
|
+
* Tracks which files the developer actually opens after receiving context.
|
|
1566
|
+
* If CTO suggests 15 files and the developer only uses 5, those 5 should
|
|
1567
|
+
* be weighted higher next time for similar tasks.
|
|
1568
|
+
*
|
|
1569
|
+
* Storage: .cto/telemetry.json — lightweight, per-project.
|
|
1570
|
+
*
|
|
1571
|
+
* Integration points:
|
|
1572
|
+
* - VS Code extension: file-open events → recordFileOpen()
|
|
1573
|
+
* - LSP bridge: cto/telemetry method
|
|
1574
|
+
* - Context pipeline: getTelemetryBoosts() → selector
|
|
1575
|
+
*
|
|
1576
|
+
* Privacy: Stores relative file paths, timestamps, and the task description
|
|
1577
|
+
* of each recorded session (TelemetrySession.taskDescription). No file contents.
|
|
1578
|
+
*/
|
|
1579
|
+
interface FileOpenEvent {
|
|
1580
|
+
filePath: string;
|
|
1581
|
+
timestamp: number;
|
|
1582
|
+
taskContext?: string;
|
|
1583
|
+
}
|
|
1584
|
+
/**
 * One recorded session. Its fields match the arguments of recordSession
 * (taskDescription, suggestedFiles, openedFiles) plus a timestamp.
 * NOTE(review): the timestamp unit (presumably epoch milliseconds) is not
 * visible from this declaration — confirm.
 */
interface TelemetrySession {
|
|
1585
|
+
taskDescription: string;
|
|
1586
|
+
suggestedFiles: string[];
|
|
1587
|
+
openedFiles: string[];
|
|
1588
|
+
timestamp: number;
|
|
1589
|
+
}
|
|
1590
|
+
interface TelemetryModel {
|
|
1591
|
+
version: number;
|
|
1592
|
+
sessions: TelemetrySession[];
|
|
1593
|
+
fileOpenCounts: Record<string, number>;
|
|
1594
|
+
fileTaskCounts: Record<string, Record<string, number>>;
|
|
1595
|
+
lastUpdated: number;
|
|
1596
|
+
}
|
|
1597
|
+
/**
 * Load the telemetry model for the project at `projectPath`.
 * NOTE(review): the section header names `.cto/telemetry.json` as the storage
 * location; what is returned when nothing has been stored yet is not visible here.
 */
declare function loadTelemetry(projectPath: string): TelemetryModel;
|
|
1598
|
+
/**
 * Persist `model` as the telemetry data of the project at `projectPath`.
 * NOTE(review): presumably writes `.cto/telemetry.json` (the storage named in
 * the section header) — confirm against the implementation.
 */
declare function saveTelemetry(projectPath: string, model: TelemetryModel): void;
|
|
1599
|
+
/**
|
|
1600
|
+
* Record that the user opened a file after receiving context suggestions.
|
|
1601
|
+
*/
|
|
1602
|
+
declare function recordFileOpen(model: TelemetryModel, filePath: string, taskContext?: string): TelemetryModel;
|
|
1603
|
+
/**
|
|
1604
|
+
* Record a complete session: what CTO suggested vs what the user used.
|
|
1605
|
+
*/
|
|
1606
|
+
declare function recordSession(model: TelemetryModel, taskDescription: string, suggestedFiles: string[], openedFiles: string[]): TelemetryModel;
|
|
1607
|
+
/**
|
|
1608
|
+
* Get telemetry-based boosts for file ranking.
|
|
1609
|
+
*
|
|
1610
|
+
* Files the user frequently opens for similar tasks get a positive boost.
|
|
1611
|
+
* Files that CTO suggests but the user never opens get a negative signal.
|
|
1612
|
+
*
|
|
1613
|
+
* @param model - Telemetry model
|
|
1614
|
+
* @param taskType - Current task type (debug, feature, refactor, etc.)
|
|
1615
|
+
* @param candidateFiles - Files to compute boosts for
|
|
1616
|
+
* @returns Map of filePath → boost (-1.0 to +1.0)
|
|
1617
|
+
*/
|
|
1618
|
+
declare function getTelemetryBoosts(model: TelemetryModel, taskType: string, candidateFiles: string[]): Map<string, number>;
|
|
1619
|
+
/**
|
|
1620
|
+
* Render a summary of telemetry data for debugging/display.
|
|
1621
|
+
*/
|
|
1622
|
+
declare function renderTelemetrySummary(model: TelemetryModel): string;
|
|
1623
|
+
|
|
1624
|
+
/**
|
|
1625
|
+
* Embedding-Based Retrieval
|
|
1626
|
+
*
|
|
1627
|
+
* Dense vector search to complement BM25 lexical matching.
|
|
1628
|
+
* Catches semantic similarity that BM25 misses:
|
|
1629
|
+
* - "authentication" matches "login" (no lexical overlap)
|
|
1630
|
+
* - "cache invalidation" matches "clear stored data"
|
|
1631
|
+
*
|
|
1632
|
+
* Two backends:
|
|
1633
|
+
* 1. TF-IDF Cosine (built-in, zero deps, always available)
|
|
1634
|
+
* — builds document vectors from TF-IDF weights, queries via cosine similarity
|
|
1635
|
+
* — accuracy: ~85% of neural embeddings for code search
|
|
1636
|
+
*
|
|
1637
|
+
* 2. ONNX Neural (optional, requires onnxruntime-node + model file)
|
|
1638
|
+
* — all-MiniLM-L6-v2 (23MB), 384-dim embeddings
|
|
1639
|
+
* — accuracy: best-in-class for semantic code search
|
|
1640
|
+
*
|
|
1641
|
+
* Integration: produces a ranked list of (filePath, score) that gets
|
|
1642
|
+
* merged with BM25 results via RRF in the context pipeline.
|
|
1643
|
+
*/
|
|
1644
|
+
|
|
1645
|
+
interface EmbeddingResult {
|
|
1646
|
+
filePath: string;
|
|
1647
|
+
score: number;
|
|
1648
|
+
}
|
|
1649
|
+
/**
 * Queryable embedding index. Returned by buildTfIdfEmbeddingIndex and, when it
 * does not resolve to null, by buildNeuralEmbeddingIndex.
 * `backend` names which of the two backends from the section header built it.
 * `query` maps a text to EmbeddingResult entries — presumably at most `topK`,
 * ranked by score (NOTE(review): confirm ordering and truncation).
 */
interface EmbeddingIndex {
|
|
1650
|
+
backend: 'tfidf-cosine' | 'onnx-minilm';
|
|
1651
|
+
dimensions: number;
|
|
1652
|
+
documentCount: number;
|
|
1653
|
+
query: (text: string, topK: number) => EmbeddingResult[];
|
|
1654
|
+
}
|
|
1655
|
+
/**
|
|
1656
|
+
* Build a dense embedding index from TF-IDF vectors.
|
|
1657
|
+
*
|
|
1658
|
+
* Each document becomes a vector in IDF-weighted term space.
|
|
1659
|
+
* Queries are vectorized the same way and matched via cosine similarity.
|
|
1660
|
+
*
|
|
1661
|
+
* This is surprisingly effective for code search because:
|
|
1662
|
+
* - Code has strong term distributions (class names, method names)
|
|
1663
|
+
* - IDF weighting naturally emphasizes discriminative terms
|
|
1664
|
+
* - Cosine similarity handles different document lengths well
|
|
1665
|
+
*
|
|
1666
|
+
* Performance: O(V × D) per query, where V = vocabulary size and D = document count.
|
|
1667
|
+
* For 1000 files × 5000 unique terms = 5M ops. Fast enough for CLI.
|
|
1668
|
+
*/
|
|
1669
|
+
declare function buildTfIdfEmbeddingIndex(index: TfIdfIndex): EmbeddingIndex;
|
|
1670
|
+
/**
|
|
1671
|
+
* Merge BM25 results with embedding results using Reciprocal Rank Fusion.
|
|
1672
|
+
*
|
|
1673
|
+
* RRF(d) = Σ 1/(k + rank_i(d)) for each ranking i (each term presumably scaled by bm25Weight / embeddingWeight — confirm)
|
|
1674
|
+
*
|
|
1675
|
+
* This is the standard way to combine lexical and semantic search.
|
|
1676
|
+
* k=60 is the standard constant from the RRF paper (Cormack et al., 2009).
|
|
1677
|
+
*
|
|
1678
|
+
* @param bm25Results - BM25 ranked results (filePath, score)
|
|
1679
|
+
* @param embeddingResults - Embedding ranked results (filePath, score)
|
|
1680
|
+
* @param k - RRF constant (default: 60)
|
|
1681
|
+
* @param bm25Weight - Weight for BM25 signal (default: 0.6)
|
|
1682
|
+
* @param embeddingWeight - Weight for embedding signal (default: 0.4)
|
|
1683
|
+
*/
|
|
1684
|
+
declare function reciprocalRankFusion(bm25Results: {
|
|
1685
|
+
filePath: string;
|
|
1686
|
+
score: number;
|
|
1687
|
+
}[], embeddingResults: EmbeddingResult[], k?: number, bm25Weight?: number, embeddingWeight?: number): {
|
|
1688
|
+
filePath: string;
|
|
1689
|
+
score: number;
|
|
1690
|
+
}[];
|
|
1691
|
+
/**
|
|
1692
|
+
* Check if ONNX Runtime is available for neural embeddings.
|
|
1693
|
+
*/
|
|
1694
|
+
declare function isOnnxAvailable(): Promise<boolean>;
|
|
1695
|
+
/**
|
|
1696
|
+
* Build a neural embedding index using ONNX Runtime.
|
|
1697
|
+
* Requires: npm install onnxruntime-node
|
|
1698
|
+
* Model: all-MiniLM-L6-v2 (download separately to .cto/models/)
|
|
1699
|
+
*
|
|
1700
|
+
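* Falls back to TF-IDF cosine if ONNX is not available (NOTE(review): the declared return type is EmbeddingIndex | null, so the fallback may be left to the caller — confirm).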
|
|
1701
|
+
*/
|
|
1702
|
+
declare function buildNeuralEmbeddingIndex(_files: {
|
|
1703
|
+
relativePath: string;
|
|
1704
|
+
content: string;
|
|
1705
|
+
}[], modelPath?: string): Promise<EmbeddingIndex | null>;
|
|
1706
|
+
|
|
1707
|
+
/**
|
|
1708
|
+
* Chunk-Level Retrieval
|
|
1709
|
+
*
|
|
1710
|
+
* Instead of including entire files, extract semantic chunks
|
|
1711
|
+
* (functions, methods, classes) and score each chunk independently.
|
|
1712
|
+
*
|
|
1713
|
+
* This is the single biggest efficiency win for context selection:
|
|
1714
|
+
* - A 2000-line file with 1 relevant method → include 50 lines, not 2000
|
|
1715
|
+
* - Token budget goes 10-40x further
|
|
1716
|
+
* - More files can have their relevant parts included
|
|
1717
|
+
*
|
|
1718
|
+
* Chunk types:
|
|
1719
|
+
* - Function/method definition (with body)
|
|
1720
|
+
* - Class/interface declaration (with key members)
|
|
1721
|
+
* - Import block
|
|
1722
|
+
* - Top-level constant/variable block
|
|
1723
|
+
*
|
|
1724
|
+
* Scoring: BM25 term overlap + structural bonus (method name matches query)
|
|
1725
|
+
*/
|
|
1726
|
+
interface CodeChunk {
|
|
1727
|
+
filePath: string;
|
|
1728
|
+
startLine: number;
|
|
1729
|
+
endLine: number;
|
|
1730
|
+
content: string;
|
|
1731
|
+
kind: ChunkKind;
|
|
1732
|
+
name: string;
|
|
1733
|
+
className?: string;
|
|
1734
|
+
score: number;
|
|
1735
|
+
tokens: number;
|
|
1736
|
+
}
|
|
1737
|
+
/**
 * Tag for what a CodeChunk represents (the type of CodeChunk.kind).
 * The section header describes function/method, class/interface, import and
 * constant/variable chunks; 'type' and 'block' are not described there.
 */
type ChunkKind = 'function' | 'method' | 'class' | 'interface' | 'import' | 'constant' | 'type' | 'block';
|
|
1738
|
+
interface ChunkRetrievalResult {
|
|
1739
|
+
chunks: CodeChunk[];
|
|
1740
|
+
fileChunks: Map<string, CodeChunk[]>;
|
|
1741
|
+
totalChunks: number;
|
|
1742
|
+
totalTokensUsed: number;
|
|
1743
|
+
}
|
|
1744
|
+
/**
|
|
1745
|
+
* Extract semantic chunks from a file.
|
|
1746
|
+
*/
|
|
1747
|
+
declare function chunkFile(content: string, filePath: string): CodeChunk[];
|
|
1748
|
+
/**
|
|
1749
|
+
* Score chunks against a query.
|
|
1750
|
+
* Uses BM25 term overlap + structural bonuses.
|
|
1751
|
+
*/
|
|
1752
|
+
declare function scoreChunks(chunks: CodeChunk[], task: string): CodeChunk[];
|
|
1753
|
+
/**
|
|
1754
|
+
* Retrieve the most relevant chunks across multiple files.
|
|
1755
|
+
*
|
|
1756
|
+
* @param files - Array of {relativePath, content} pairs
|
|
1757
|
+
* @param task - Task description to match against
|
|
1758
|
+
* @param tokenBudget - Max tokens to include (default: 30000)
|
|
1759
|
+
* @param minScore - Minimum chunk score to include (default: 0.1)
|
|
1760
|
+
*/
|
|
1761
|
+
declare function retrieveChunks(files: {
|
|
1762
|
+
relativePath: string;
|
|
1763
|
+
content: string;
|
|
1764
|
+
}[], task: string, tokenBudget?: number, minScore?: number): ChunkRetrievalResult;
|
|
1765
|
+
/**
|
|
1766
|
+
* Render chunks for a single file as markdown.
|
|
1767
|
+
* Shows relevant chunks with line numbers, connected by "..." for gaps.
|
|
1768
|
+
*/
|
|
1769
|
+
declare function renderFileChunks(filePath: string, chunks: CodeChunk[], ext: string): string;
|
|
1770
|
+
|
|
948
1771
|
type LogLevel = 'debug' | 'info' | 'warn' | 'error';
|
|
949
1772
|
interface LogEntry {
|
|
950
1773
|
level: LogLevel;
|
|
@@ -1006,4 +1829,4 @@ interface AuditOptions {
|
|
|
1006
1829
|
}
|
|
1007
1830
|
declare function auditProject(projectPath: string, filePaths: string[], options?: AuditOptions): Promise<AuditResult>;
|
|
1008
1831
|
|
|
1009
|
-
export { type AssignmentResult, type ContextPipelineInput, type ContextPipelineResult, CtoError, type CtoErrorCode, type DocumentVector, type Experiment, type ExperimentConclusion, type ExperimentGroup, type FilteredFile, type GroupMetrics, type ImportSpec, type IndexCacheStats, type LearnerBoost, type LearnerBoostInput, type LearnerModel, type LogEntry, type LogLevel, type Logger, type MultiRepoResult, type PatternStats, type RerankInput, type RerankResult, type RerankedFile, type SecretFinding, type SecretType, type SelectionInput, type SemanticMatch, type SemanticScore, type SiblingMatch, type SiblingRepo, type SignificanceResult, type SupportedLanguage, type TfIdfIndex, analyzeProject, assignGroup, auditProject, bfsBidirectional, boostByPath, buildAdjacencyList, buildIndex, buildIndexCached, buildProjectGraph, calculateCoverage, classifyFileKind, countTokensChars4, countTokensTiktoken, createExperiment, createLogger, createProject, detectLanguage, detectStack, discoverSiblingRepos, estimateComplexity, estimateFileTokens, estimateTokens, extractPattern, freeEncoder, getActiveExperiment, getCacheInfo, getConcludedExperiments, getLearnerBoosts, getLearnerStats, getPruneLevelForRisk, invalidateCache, isCtoError, loadExperiments, loadLearner, optimizeBudget, parseAllPolyglotImports, parseImports, parseSiblingPaths, pruneFile, pruneFiles, query, querySiblingRepos, recordOutcome, recordSelection, renderExperimentSummary, renderMultiRepoSummary, rerank, runContextPipeline, sanitizeContent, saveExperiments, saveLearner, scanContentForSecrets, scanFileForSecrets, scanProjectForSecrets, scoreAllFiles, scoreFile, selectContext, setJsonLogging, setLogLevel, similarity, testSignificance, tokenize, walkProject, wrapError };
|
|
1832
|
+
export { type ActionType, type ArchLayer, type AssignmentResult, type CallGraphResult, type ChunkKind, type ChunkRetrievalResult, type CoChangeEntry, type CoChangeMatrix, type CodeChunk, type ContextPipelineInput, type ContextPipelineResult, type CorpusEmbeddings, CtoError, type CtoErrorCode, type DocumentVector, type EmbeddingIndex, type EmbeddingResult, type Experiment, type ExperimentConclusion, type ExperimentGroup, type FileOpenEvent, type FilteredFile, type GroupMetrics, type HopDetail, type ImportSpec, type IndexCacheStats, type LearnerBoost, type LearnerBoostInput, type LearnerModel, type LogEntry, type LogLevel, type Logger, type MethodCall, type MethodDefinition, type MultiHopConfig, type MultiHopResult, type MultiRepoResult, type PatternStats, type QueryIntent, type RerankInput, type RerankResult, type RerankedFile, type SecretFinding, type SecretType, type SelectionInput, type SemanticExpansion, type SemanticMatch, type SemanticScore, type SiblingMatch, type SiblingRepo, type SignalWeight, type SignificanceResult, type StructuralTokens, type SupportedLanguage, type SynonymExpansion, type TelemetryModel, type TelemetrySession, type TfIdfIndex, type TunedWeights, type WeightTunerModel, analyzeProject, assignGroup, attributeToSignal, auditProject, augmentContentWithStructure, bfsBidirectional, boostByCallGraph, boostByGitCoChange, boostByImports, boostByLayer, boostByPath, buildAdjacencyList, buildCallGraph, buildCoChangeMatrix, buildCorpusEmbeddings, buildIndex, buildIndexCached, buildNeuralEmbeddingIndex, buildProjectGraph, buildTfIdfEmbeddingIndex, buildWeightedQuery, calculateCoverage, chunkFile, classifyFileKind, countTokensChars4, countTokensTiktoken, createExperiment, createFreshModel, createLogger, createProject, detectLanguage, detectStack, discoverSiblingRepos, embedQuery, reciprocalRankFusion as embeddingRRF, estimateComplexity, estimateFileTokens, estimateTokens, expandLayers, expandQuery, expandQueryWithPMI, expandTerm, extractPattern, 
extractStructuralTokens, freeEncoder, getActiveExperiment, getCacheInfo, getConcludedExperiments, getExpansionDetails, getGitRecency, getLearnerBoosts, getLearnerStats, getOptimizedWeights, getPruneLevelForRisk, getStructuralSummary, getSynonymStats, getTelemetryBoosts, invalidateCache, isCtoError, isOnnxAvailable, loadExperiments, loadLearner, loadTelemetry, loadWeightTuner, multiHopQuery, optimizeBudget, parseAllPolyglotImports, parseImports, parseQueryIntent, parseSiblingPaths, pruneFile, pruneFiles, query, queryByEmbedding, querySiblingRepos, reciprocalRankFusion$1 as reciprocalRankFusion, recordFeedback, recordFileOpen, recordOutcome, recordSelection, recordSession, renderExperimentSummary, renderFileChunks, renderMultiRepoSummary, renderTelemetrySummary, renderWeightStatus, rerank, retrieveChunks, runContextPipeline, sanitizeContent, saveExperiments, saveLearner, saveTelemetry, saveWeightTuner, scanContentForSecrets, scanFileForSecrets, scanProjectForSecrets, scoreAllFiles, scoreChunks, scoreFile, selectContext, setJsonLogging, setLogLevel, similarity, testSignificance, tokenize, walkProject, wrapError };
|