@tai-io/codesearch 2026.313.1614
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.d.ts +3 -0
- package/dist/build-info.js +4 -0
- package/dist/config.d.ts +62 -0
- package/dist/config.js +52 -0
- package/dist/core/cleanup.d.ts +8 -0
- package/dist/core/cleanup.js +41 -0
- package/dist/core/doc-indexer.d.ts +13 -0
- package/dist/core/doc-indexer.js +76 -0
- package/dist/core/doc-searcher.d.ts +13 -0
- package/dist/core/doc-searcher.js +65 -0
- package/dist/core/file-category.d.ts +7 -0
- package/dist/core/file-category.js +75 -0
- package/dist/core/indexer.d.ts +18 -0
- package/dist/core/indexer.js +177 -0
- package/dist/core/preview.d.ts +13 -0
- package/dist/core/preview.js +58 -0
- package/dist/core/repo-map.d.ts +33 -0
- package/dist/core/repo-map.js +144 -0
- package/dist/core/searcher.d.ts +12 -0
- package/dist/core/searcher.js +97 -0
- package/dist/core/sync.d.ts +15 -0
- package/dist/core/sync.js +212 -0
- package/dist/core/targeted-indexer.d.ts +19 -0
- package/dist/core/targeted-indexer.js +127 -0
- package/dist/embedding/factory.d.ts +4 -0
- package/dist/embedding/factory.js +24 -0
- package/dist/embedding/openai.d.ts +33 -0
- package/dist/embedding/openai.js +234 -0
- package/dist/embedding/truncate.d.ts +6 -0
- package/dist/embedding/truncate.js +14 -0
- package/dist/embedding/types.d.ts +18 -0
- package/dist/embedding/types.js +2 -0
- package/dist/errors.d.ts +17 -0
- package/dist/errors.js +21 -0
- package/dist/format.d.ts +18 -0
- package/dist/format.js +151 -0
- package/dist/hooks/cli-router.d.ts +7 -0
- package/dist/hooks/cli-router.js +47 -0
- package/dist/hooks/hook-output.d.ts +56 -0
- package/dist/hooks/hook-output.js +21 -0
- package/dist/hooks/post-tool-use.d.ts +13 -0
- package/dist/hooks/post-tool-use.js +123 -0
- package/dist/hooks/stop-hook.d.ts +11 -0
- package/dist/hooks/stop-hook.js +137 -0
- package/dist/hooks/targeted-runner.d.ts +11 -0
- package/dist/hooks/targeted-runner.js +58 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +138 -0
- package/dist/paths.d.ts +11 -0
- package/dist/paths.js +54 -0
- package/dist/setup-message.d.ts +4 -0
- package/dist/setup-message.js +48 -0
- package/dist/splitter/ast.d.ts +13 -0
- package/dist/splitter/ast.js +231 -0
- package/dist/splitter/line.d.ts +10 -0
- package/dist/splitter/line.js +103 -0
- package/dist/splitter/symbol-extract.d.ts +16 -0
- package/dist/splitter/symbol-extract.js +61 -0
- package/dist/splitter/types.d.ts +16 -0
- package/dist/splitter/types.js +2 -0
- package/dist/state/doc-metadata.d.ts +18 -0
- package/dist/state/doc-metadata.js +59 -0
- package/dist/state/registry.d.ts +7 -0
- package/dist/state/registry.js +46 -0
- package/dist/state/snapshot.d.ts +26 -0
- package/dist/state/snapshot.js +100 -0
- package/dist/tool-schemas.d.ts +215 -0
- package/dist/tool-schemas.js +269 -0
- package/dist/tools.d.ts +58 -0
- package/dist/tools.js +245 -0
- package/dist/vectordb/rrf.d.ts +32 -0
- package/dist/vectordb/rrf.js +88 -0
- package/dist/vectordb/sqlite.d.ts +34 -0
- package/dist/vectordb/sqlite.js +624 -0
- package/dist/vectordb/types.d.ts +63 -0
- package/dist/vectordb/types.js +2 -0
- package/messages.yaml +69 -0
- package/package.json +79 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { randomUUID } from 'node:crypto';
|
|
4
|
+
import { AstSplitter } from '../splitter/ast.js';
|
|
5
|
+
import { LineSplitter } from '../splitter/line.js';
|
|
6
|
+
import { extensionToLanguage, buildSnapshot } from './sync.js';
|
|
7
|
+
import { getConfig } from '../config.js';
|
|
8
|
+
import { normalizePath, pathToCollectionName } from '../paths.js';
|
|
9
|
+
import { classifyFileCategory } from './file-category.js';
|
|
10
|
+
import { loadSnapshot, saveSnapshot } from '../vectordb/sqlite.js';
|
|
11
|
+
/**
 * Re-index a specific set of files within a project.
 * For each file: delete stale vectors, re-split, re-embed, re-insert, update snapshot.
 *
 * @param rootPath Absolute path to the project root
 * @param relativePaths Relative paths (from rootPath) of files to re-index
 * @param embedding Embedding provider
 * @param vectordb Vector DB provider
 * @returns Summary counts ({processedFiles, totalChunks, skippedFiles, durationMs})
 */
export async function indexFiles(rootPath, relativePaths, embedding, vectordb) {
    const start = Date.now();
    const normalizedRoot = normalizePath(rootPath);
    const collectionName = pathToCollectionName(normalizedRoot);
    const config = getConfig();
    // Skip if the collection doesn't exist (codebase never indexed)
    if (!(await vectordb.hasCollection(collectionName))) {
        process.stderr.write(`[targeted-indexer] No collection for ${normalizedRoot} — codebase not indexed, skipping.\n`);
        return {
            processedFiles: 0,
            totalChunks: 0,
            skippedFiles: relativePaths.length,
            durationMs: Date.now() - start,
        };
    }
    const astSplitter = new AstSplitter();
    const lineSplitter = new LineSplitter();
    const allChunks = [];
    const processedPaths = [];
    const deletedPaths = [];
    let skippedFiles = 0;
    // Step 1: delete stale vectors and split files
    const concurrency = config.indexingConcurrency;
    for (let i = 0; i < relativePaths.length; i += concurrency) {
        const batch = relativePaths.slice(i, i + concurrency);
        const batchResults = await Promise.all(batch.map(async (relPath) => {
            // Always remove stale vectors first
            await vectordb.deleteByPath(collectionName, relPath);
            const fullPath = path.join(normalizedRoot, relPath);
            let code;
            try {
                // Fix: use the non-blocking read so files in the same batch are
                // actually processed concurrently — readFileSync blocks the event
                // loop and serializes the whole Promise.all batch.
                code = await fs.promises.readFile(fullPath, 'utf-8');
            }
            catch (err) {
                if (err.code === 'ENOENT') {
                    // File was deleted — vectors removed, skip re-embedding
                    return { relPath, chunks: [], deleted: true };
                }
                throw err;
            }
            // Empty/whitespace-only files yield no chunks but are still "processed"
            if (code.trim().length === 0)
                return { relPath, chunks: [], deleted: false };
            const ext = path.extname(relPath);
            const language = extensionToLanguage(ext);
            // Prefer AST-aware chunking; fall back to line-based splitting
            let chunks = astSplitter.split(code, language, relPath);
            if (chunks.length === 0) {
                chunks = lineSplitter.split(code, language, relPath);
            }
            return { relPath, chunks, deleted: false };
        }));
        for (const { relPath, chunks, deleted } of batchResults) {
            if (deleted) {
                deletedPaths.push(relPath);
                skippedFiles++;
            }
            else {
                allChunks.push(...chunks);
                processedPaths.push(relPath);
            }
        }
    }
    // Step 2: embed and insert chunks
    const batchSize = config.embeddingBatchSize;
    let totalChunks = 0;
    for (let i = 0; i < allChunks.length; i += batchSize) {
        const batch = allChunks.slice(i, i + batchSize);
        const texts = batch.map((c) => c.content);
        const vectors = await embedding.embedBatch(texts);
        const documents = batch.map((chunk, j) => ({
            id: randomUUID(),
            content: chunk.content,
            vector: vectors[j],
            relativePath: chunk.filePath,
            startLine: chunk.startLine,
            endLine: chunk.endLine,
            fileExtension: path.extname(chunk.filePath),
            language: chunk.language,
            fileCategory: classifyFileCategory(chunk.filePath),
            symbolName: chunk.symbolName,
            symbolKind: chunk.symbolKind,
            symbolSignature: chunk.symbolSignature,
            parentSymbol: chunk.parentSymbol,
        }));
        await vectordb.insert(collectionName, documents);
        totalChunks += batch.length;
    }
    // Step 3: update snapshot — refresh hashes for processed files, remove deleted
    const snapshot = loadSnapshot(normalizedRoot);
    if (snapshot) {
        const freshSnapshot = buildSnapshot(normalizedRoot, processedPaths);
        for (const relPath of processedPaths) {
            if (freshSnapshot[relPath]) {
                snapshot[relPath] = freshSnapshot[relPath];
            }
        }
        for (const relPath of deletedPaths) {
            Reflect.deleteProperty(snapshot, relPath);
        }
        saveSnapshot(normalizedRoot, snapshot);
    }
    return {
        processedFiles: processedPaths.length,
        totalChunks,
        skippedFiles,
        durationMs: Date.now() - start,
    };
}
|
|
127
|
+
//# sourceMappingURL=targeted-indexer.js.map
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { OpenAIEmbedding } from './openai.js';
|
|
2
|
+
/**
 * Build the embedding provider selected by `config.embeddingProvider`.
 * All three providers speak the OpenAI embeddings protocol; 'ollama' and
 * 'local' simply point the client at a different base URL, using the
 * provider name as a placeholder API key when none is configured.
 * @throws {Error} when the provider value is unrecognized.
 */
export function createEmbedding(config) {
    const provider = config.embeddingProvider;
    switch (provider) {
        case 'openai':
            return new OpenAIEmbedding();
        case 'ollama':
        case 'local': {
            const isOllama = provider === 'ollama';
            return new OpenAIEmbedding({
                // '||' (not '??') is deliberate: an empty-string key also falls back.
                apiKey: config.openaiApiKey || provider,
                baseUrl: isOllama ? config.ollamaBaseUrl : config.openaiBaseUrl,
                model: config.embeddingModel,
            });
        }
        default:
            throw new Error(`Unknown embedding provider: ${provider}`);
    }
}
|
|
24
|
+
//# sourceMappingURL=factory.js.map
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { type Embedding, type EmbeddingVector, type TokenEstimate } from './types.js';
|
|
2
|
+
/** Short, stable content hash of `text`, used as the embedding cache key. */
export declare function contentHash(text: string): string;
/** Constructor overrides for OpenAIEmbedding; unset fields fall back to config. */
export interface OpenAIEmbeddingOptions {
    apiKey?: string;
    baseUrl?: string;
    model?: string;
}
/** Embedding provider backed by an OpenAI-compatible /embeddings endpoint. */
export declare class OpenAIEmbedding implements Embedding {
    private client;
    private model;
    private _dimension;
    private initialized;
    private memoryCache;
    private cacheDir;
    constructor(options?: OpenAIEmbeddingOptions);
    /** Vector dimension detected by initialize(). */
    get dimension(): number;
    /**
     * Validate the API key and detect embedding dimension by embedding a test string.
     * Must be called once before any other operations.
     */
    initialize(): Promise<void>;
    private ensureInitialized;
    /** Embed a single text (convenience wrapper over embedBatch). */
    embed(text: string): Promise<EmbeddingVector>;
    /** Embed many texts, consulting the memory and disk caches first. */
    embedBatch(texts: string[]): Promise<EmbeddingVector[]>;
    /** Rough char-based token/cost estimate; makes no API call. */
    estimateTokens(texts: string[]): TokenEstimate;
    private setMemoryCache;
    private callWithRetry;
    private callApi;
    private getDiskCachePath;
    private readDiskCache;
    private writeDiskCache;
}
|
|
33
|
+
//# sourceMappingURL=openai.d.ts.map
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import fsp from 'node:fs/promises';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import OpenAI from 'openai';
|
|
5
|
+
import { EmbeddingError } from '../errors.js';
|
|
6
|
+
import { getConfig } from '../config.js';
|
|
7
|
+
import { getCacheDir } from '../paths.js';
|
|
8
|
+
import { truncateToSafeLength } from './truncate.js';
|
|
9
|
+
// Delays (ms) before each retry attempt
const RETRY_DELAYS = [1000, 4000, 16000]; // exponential backoff
// HTTP statuses worth retrying: rate limit plus transient server errors
const RETRYABLE_STATUS = new Set([429, 500, 502, 503]);
// Upper bound on in-memory embedding cache entries (oldest evicted first)
const MAX_MEMORY_CACHE_SIZE = 10_000;
// Cap on how long a Retry-After header may delay a retry
const MAX_RETRY_AFTER_MS = 60_000;
|
|
13
|
+
/**
 * Short, stable content hash used as the embedding cache key.
 * @param {string} text
 * @returns {string} First 16 hex characters of the SHA-256 digest.
 */
export function contentHash(text) {
    const digest = createHash('sha256').update(text).digest('hex');
    return digest.slice(0, 16);
}
|
|
16
|
+
/**
 * Embedding provider backed by an OpenAI-compatible /embeddings endpoint,
 * with a two-tier (memory + disk) cache keyed by content hash and a
 * retrying, batch-shrinking API caller.
 */
export class OpenAIEmbedding {
    client;
    model;
    // Detected by initialize(); 0 until then.
    _dimension = 0;
    initialized = false;
    // contentHash -> vector; bounded by MAX_MEMORY_CACHE_SIZE (oldest evicted).
    memoryCache = new Map();
    cacheDir;
    constructor(options) {
        const config = getConfig();
        // Explicit options win; otherwise fall back to global config.
        const apiKey = options?.apiKey ?? config.openaiApiKey;
        const baseUrl = options?.baseUrl ?? config.openaiBaseUrl;
        this.client = new OpenAI({
            apiKey,
            // Only override baseURL when one was actually provided.
            ...(baseUrl && { baseURL: baseUrl }),
        });
        this.model = options?.model ?? config.embeddingModel;
        this.cacheDir = path.join(getCacheDir(), 'embeddings');
    }
    get dimension() {
        return this._dimension;
    }
    /**
     * Validate the API key and detect embedding dimension by embedding a test string.
     * Must be called once before any other operations.
     */
    async initialize() {
        try {
            const result = await this.callApi(['dimension probe']);
            this._dimension = result[0].length;
            this.initialized = true;
            console.log(`Embedding model "${this.model}" validated. Dimension: ${this._dimension}`);
        }
        catch (err) {
            throw new EmbeddingError(`Failed to initialize embedding provider. Check your API key, base URL, and model name. ` +
                `Model: "${this.model}"`, err);
        }
    }
    // Guard: embed/embedBatch require initialize() to have run first.
    ensureInitialized() {
        if (!this.initialized) {
            throw new EmbeddingError('Embedding provider not initialized. Call initialize() before embed/embedBatch.');
        }
    }
    // Single-text convenience wrapper over embedBatch.
    async embed(text) {
        this.ensureInitialized();
        const results = await this.embedBatch([text]);
        return results[0];
    }
    /**
     * Embed texts in order: empty texts get zero vectors, cached texts are
     * served from memory/disk, and only the remainder hits the API.
     */
    async embedBatch(texts) {
        this.ensureInitialized();
        if (texts.length === 0)
            return [];
        // Track which indices have empty/whitespace-only text so we return zero vectors for them
        const emptyIndices = new Set();
        for (let i = 0; i < texts.length; i++) {
            if (texts[i].trim().length === 0) {
                emptyIndices.add(i);
            }
        }
        // null marks "not yet resolved"; every slot must be filled by the end.
        const results = Array.from({
            length: texts.length,
        }).fill(null);
        // Fill empty-text slots with zero vectors immediately
        for (const i of emptyIndices) {
            results[i] = new Array(this._dimension).fill(0);
        }
        // Check caches for non-empty texts
        for (let i = 0; i < texts.length; i++) {
            if (emptyIndices.has(i))
                continue;
            const hash = contentHash(texts[i]);
            const memHit = this.memoryCache.get(hash);
            if (memHit) {
                results[i] = memHit;
                continue;
            }
            const diskHit = await this.readDiskCache(hash);
            if (diskHit) {
                // Promote disk hits into the memory cache for next time.
                this.setMemoryCache(hash, diskHit);
                results[i] = diskHit;
            }
        }
        // Collect cache misses, preserving their original positions.
        const uncachedIndices = [];
        const uncachedTexts = [];
        for (let i = 0; i < texts.length; i++) {
            if (results[i] === null) {
                uncachedIndices.push(i);
                uncachedTexts.push(texts[i]);
            }
        }
        if (uncachedTexts.length === 0) {
            return results;
        }
        const batchSize = getConfig().embeddingBatchSize;
        const freshEmbeddings = [];
        for (let offset = 0; offset < uncachedTexts.length; offset += batchSize) {
            const batch = uncachedTexts.slice(offset, offset + batchSize);
            const batchResult = await this.callWithRetry(batch);
            freshEmbeddings.push(...batchResult);
        }
        // Write fresh vectors back to both caches and into their result slots.
        for (let i = 0; i < uncachedIndices.length; i++) {
            const idx = uncachedIndices[i];
            const hash = contentHash(texts[idx]);
            const vec = freshEmbeddings[i];
            this.setMemoryCache(hash, vec);
            this.writeDiskCache(hash, vec);
            results[idx] = vec;
        }
        // Sanity check: no slot may remain unresolved.
        if (results.some((r) => r === null)) {
            throw new EmbeddingError('Missing embeddings: some texts did not receive vectors after cache lookup and API call.');
        }
        return results;
    }
    // Rough estimate (~4 chars/token); makes no API call.
    estimateTokens(texts) {
        const totalChars = texts.reduce((sum, t) => sum + t.length, 0);
        const estimatedTokens = Math.ceil(totalChars / 4);
        // Per-million-token pricing for known OpenAI models; local models are free
        const COST_PER_MILLION = {
            'text-embedding-3-small': 0.02,
            'text-embedding-3-large': 0.13,
            'text-embedding-ada-002': 0.1,
        };
        const rate = COST_PER_MILLION[this.model] ?? 0;
        const estimatedCostUsd = (estimatedTokens / 1_000_000) * rate;
        return { totalChars, estimatedTokens, estimatedCostUsd };
    }
    // Bounded insert: evict the oldest-inserted entry once the cap is reached.
    setMemoryCache(hash, vec) {
        if (this.memoryCache.size >= MAX_MEMORY_CACHE_SIZE && !this.memoryCache.has(hash)) {
            // Delete the oldest entry (first key from the iterator)
            const oldest = this.memoryCache.keys().next().value;
            if (oldest !== undefined) {
                this.memoryCache.delete(oldest);
            }
        }
        this.memoryCache.set(hash, vec);
    }
    /**
     * Call the API over `texts` with retries. Progress already made before a
     * failure is kept (startOffset), and the sub-batch size is halved on 429s.
     */
    async callWithRetry(texts) {
        let currentBatchSize = texts.length;
        const allResults = [];
        let startOffset = 0;
        for (let attempt = 0; attempt < RETRY_DELAYS.length + 1; attempt++) {
            try {
                for (let offset = startOffset; offset < texts.length; offset += currentBatchSize) {
                    const batch = texts.slice(offset, offset + currentBatchSize);
                    const result = await this.callApi(batch);
                    allResults.push(...result);
                    // Record progress so a retry resumes after the last success.
                    startOffset = offset + currentBatchSize;
                }
                return allResults;
            }
            catch (err) {
                const status = err.status;
                const isRetryable = status !== undefined && RETRYABLE_STATUS.has(status);
                if (!isRetryable || attempt >= RETRY_DELAYS.length) {
                    throw new EmbeddingError(`Embedding API call failed after ${attempt + 1} attempt(s). Status: ${status ?? 'unknown'}`, err);
                }
                let delay = RETRY_DELAYS[attempt];
                if (status === 429) {
                    // Honor the server's Retry-After (seconds), capped at MAX_RETRY_AFTER_MS.
                    const retryAfter = err.headers?.['retry-after'];
                    if (retryAfter) {
                        const parsed = parseInt(retryAfter, 10);
                        if (!isNaN(parsed))
                            delay = Math.min(parsed * 1000, MAX_RETRY_AFTER_MS);
                    }
                    // Halve batch size on rate limit to avoid repeated throttling
                    currentBatchSize = Math.max(1, Math.floor(currentBatchSize / 2));
                    console.warn(`Rate limited. Retrying in ${delay}ms with batch size ${currentBatchSize}.`);
                }
                else {
                    console.warn(`Embedding API error (status ${status}). Retrying in ${delay}ms...`);
                }
                await sleep(delay);
            }
        }
        throw new EmbeddingError('Unexpected: exhausted retries');
    }
    // Raw API call; re-sorts by index since response order is not guaranteed.
    async callApi(texts) {
        const response = await this.client.embeddings.create({
            model: this.model,
            input: texts.map(truncateToSafeLength),
        });
        const sorted = response.data.sort((a, b) => a.index - b.index);
        return sorted.map((d) => d.embedding);
    }
    getDiskCachePath(hash) {
        // Shard into subdirectories to avoid too many files in one dir
        const shard = hash.slice(0, 2);
        return path.join(this.cacheDir, this.model.replace(/[^a-zA-Z0-9-]/g, '_'), shard, `${hash}.json`);
    }
    // Returns the cached vector, or null on miss / corrupted entry.
    async readDiskCache(hash) {
        const filepath = this.getDiskCachePath(hash);
        try {
            const data = await fsp.readFile(filepath, 'utf-8');
            return JSON.parse(data);
        }
        catch (err) {
            const code = err.code;
            if (code === 'ENOENT') {
                // File doesn't exist -- normal cache miss
                return null;
            }
            // Parse error or other I/O problem: warn and remove corrupted file
            console.warn(`Corrupted embedding cache file ${filepath}, deleting.`);
            fsp.unlink(filepath).catch(() => { });
            return null;
        }
    }
    writeDiskCache(hash, vector) {
        const filepath = this.getDiskCachePath(hash);
        // Fire-and-forget async write
        fsp
            .mkdir(path.dirname(filepath), { recursive: true })
            .then(() => fsp.writeFile(filepath, JSON.stringify(vector)))
            .catch(() => { });
    }
}
|
|
231
|
+
// Promise-based delay used between retry attempts.
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
|
234
|
+
//# sourceMappingURL=openai.js.map
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// Safety margin below 8,191 token limit (~4 chars/token for code)
const MAX_EMBED_CHARS = 6000;
/**
 * Truncate text to stay within the embedding model's token limit.
 * Cuts at the last newline boundary to avoid splitting mid-line.
 * @param {string} text
 * @returns {string} The input unchanged when short enough, otherwise a prefix
 *   of at most MAX_EMBED_CHARS characters, trimmed back to the last newline.
 */
export function truncateToSafeLength(text) {
    if (text.length <= MAX_EMBED_CHARS) {
        return text;
    }
    const clipped = text.slice(0, MAX_EMBED_CHARS);
    const cut = clipped.lastIndexOf('\n');
    // A newline at index 0 would leave an empty string — keep the raw clip then.
    if (cut > 0) {
        return clipped.slice(0, cut);
    }
    return clipped;
}
|
|
14
|
+
//# sourceMappingURL=truncate.js.map
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/** A dense embedding vector as returned by the provider. */
export type EmbeddingVector = number[];
/** Rough size/cost estimate for a set of texts (computed without an API call). */
export interface TokenEstimate {
    totalChars: number;
    estimatedTokens: number;
    estimatedCostUsd: number;
}
/** Contract implemented by all embedding providers. */
export interface Embedding {
    /**
     * Validate provider connectivity and detect embedding dimension.
     * Must be called once before any embed/embedBatch operations.
     */
    initialize(): Promise<void>;
    embed(text: string): Promise<EmbeddingVector>;
    embedBatch(texts: string[]): Promise<EmbeddingVector[]>;
    estimateTokens(texts: string[]): TokenEstimate;
    readonly dimension: number;
}
|
|
18
|
+
//# sourceMappingURL=types.d.ts.map
|
package/dist/errors.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/** Base class for all codesearch errors; optionally wraps an underlying cause. */
export declare class CodesearchError extends Error {
    readonly cause?: unknown | undefined;
    constructor(message: string, cause?: unknown | undefined);
}
/** Invalid or missing configuration. */
export declare class ConfigError extends CodesearchError {
}
/** Embedding provider failure (initialization, API call, missing vectors). */
export declare class EmbeddingError extends CodesearchError {
}
/** Vector database failure. */
export declare class VectorDBError extends CodesearchError {
}
/** Failure while indexing a codebase. */
export declare class IndexingError extends CodesearchError {
}
/** Failure while searching. */
export declare class SearchError extends CodesearchError {
}
/** Failure during bootstrap/setup. */
export declare class BootstrapError extends CodesearchError {
}
|
|
17
|
+
//# sourceMappingURL=errors.d.ts.map
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Base class for all codesearch errors. `name` is derived from the concrete
 * constructor so subclasses report themselves correctly, and an optional
 * `cause` preserves the underlying error for diagnostics.
 */
export class CodesearchError extends Error {
    cause;
    constructor(message, cause) {
        super(message);
        this.name = this.constructor.name;
        this.cause = cause;
    }
}
/** Invalid or missing configuration. */
export class ConfigError extends CodesearchError {
}
/** Embedding provider failure (initialization, API call, missing vectors). */
export class EmbeddingError extends CodesearchError {
}
/** Vector database failure. */
export class VectorDBError extends CodesearchError {
}
/** Failure while indexing a codebase. */
export class IndexingError extends CodesearchError {
}
/** Failure while searching. */
export class SearchError extends CodesearchError {
}
/** Failure during bootstrap/setup. */
export class BootstrapError extends CodesearchError {
}
|
|
21
|
+
//# sourceMappingURL=errors.js.map
|
package/dist/format.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { CleanupResult } from './core/cleanup.js';
|
|
2
|
+
import type { PreviewResult, IndexResult } from './core/indexer.js';
|
|
3
|
+
import type { DocIndexResult } from './core/doc-indexer.js';
|
|
4
|
+
import type { DocSearchResult } from './core/doc-searcher.js';
|
|
5
|
+
import type { CodebaseState } from './state/snapshot.js';
|
|
6
|
+
/** Wrap plain text in the MCP-style text-content result shape. */
export declare function textResult(text: string): {
    content: {
        type: "text";
        text: string;
    }[];
};
/** Human-readable report for a cleanup run; dryRun switches to preview wording. */
export declare function formatCleanupResult(result: CleanupResult, normalizedPath: string, dryRun: boolean): string;
/** Summary of an indexing run, including a capped parse-failure listing. */
export declare function formatIndexResult(result: IndexResult, normalizedPath: string): string;
/** Pre-index preview: counts by extension, top directories, token/cost estimate. */
export declare function formatPreview(preview: PreviewResult, rootPath: string): string;
/** Listing of indexed codebases with status, counts, and progress. */
export declare function formatListIndexed(states: CodebaseState[]): string;
/** Confirmation message after caching library documentation. */
export declare function formatDocIndexResult(result: DocIndexResult): string;
/** Markdown-formatted hits from a cached-documentation search. */
export declare function formatDocSearchResults(results: DocSearchResult[], query: string): string;
|
|
18
|
+
//# sourceMappingURL=format.d.ts.map
|
package/dist/format.js
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { listProjects } from './state/registry.js';
|
|
2
|
+
/**
 * Wrap a plain string in the MCP-style text-content result shape.
 * @param {string} text
 * @returns {{content: {type: string, text: string}[]}}
 */
export function textResult(text) {
    const item = { type: 'text', text };
    return { content: [item] };
}
|
|
5
|
+
/**
 * Render a human-readable summary of a cleanup run.
 * @param {CleanupResult} result   Cleanup outcome (removed files, counts, timing).
 * @param {string} normalizedPath  Project root the cleanup ran against.
 * @param {boolean} dryRun         True when nothing was actually removed.
 * @returns {string} Multi-line report.
 */
export function formatCleanupResult(result, normalizedPath, dryRun) {
    const fileList = result.removedFiles.map((f) => ` - ${f}`);
    if (dryRun) {
        // Preview mode: say what would happen, not what happened.
        if (result.removedFiles.length === 0) {
            return `Dry run for ${normalizedPath}: no orphaned vectors found.`;
        }
        const header = `Dry run for ${normalizedPath}: ${result.removedFiles.length} file(s) would be cleaned:`;
        return [header, '', ...fileList].join('\n');
    }
    if (result.totalRemoved === 0) {
        return `Cleanup complete for ${normalizedPath}: no orphaned vectors found.`;
    }
    const report = [
        `Cleanup complete for ${normalizedPath}`,
        '',
        ` Removed: ${result.totalRemoved} file(s)`,
        ` Duration: ${(result.durationMs / 1000).toFixed(1)}s`,
        '',
        'Cleaned files:',
        ...fileList,
    ];
    return report.join('\n');
}
|
|
35
|
+
/**
 * Render a human-readable summary of an indexing run, listing up to 10
 * parse failures explicitly and summarizing the rest.
 * @param {IndexResult} result
 * @param {string} normalizedPath Project root that was indexed.
 * @returns {string} Multi-line report.
 */
export function formatIndexResult(result, normalizedPath) {
    const out = [
        `Indexing complete for ${normalizedPath}`,
        '',
        ` Total files: ${result.totalFiles}`,
        ` Total chunks: ${result.totalChunks}`,
        ` Added: ${result.addedFiles}`,
        ` Modified: ${result.modifiedFiles}`,
        ` Removed: ${result.removedFiles}`,
        ` Skipped: ${result.skippedFiles}`,
        ` Parse failures: ${result.parseFailures.length}`,
        ` Tokens: ~${(result.estimatedTokens / 1000).toFixed(0)}K`,
        ` Cost: $${result.estimatedCostUsd.toFixed(4)}`,
        ` Duration: ${(result.durationMs / 1000).toFixed(1)}s`,
    ];
    const failures = result.parseFailures;
    if (failures.length > 0) {
        // Cap the detailed listing at 10 files to keep the report short.
        out.push('', 'Parse failures:');
        out.push(...failures.slice(0, 10).map((file) => `- ${file}`));
        if (failures.length > 10) {
            out.push(`- ... and ${failures.length - 10} more`);
        }
    }
    return out.join('\n');
}
|
|
63
|
+
/**
 * Render a pre-indexing preview: per-extension counts (most common first),
 * top directories, and a token/cost estimate with any warnings.
 * @param {PreviewResult} preview
 * @param {string} rootPath
 * @returns {string} Multi-line report.
 */
export function formatPreview(preview, rootPath) {
    const out = [`Preview for ${rootPath}:`, ''];
    const byExt = Object.entries(preview.byExtension).sort((a, b) => b[1] - a[1]);
    if (byExt.length > 0) {
        // Pad extensions to the widest one so counts line up.
        const widest = Math.max(...byExt.map(([ext]) => ext.length));
        for (const [ext, count] of byExt) {
            out.push(` ${ext.padEnd(widest)} ${count.toLocaleString()}`);
        }
    }
    out.push(`Total: ${preview.totalFiles.toLocaleString()} files`, '');
    if (preview.topDirectories.length > 0) {
        out.push('Top directories:');
        for (const { dir, count } of preview.topDirectories) {
            out.push(` ${dir}/: ${count.toLocaleString()} files`);
        }
        out.push('');
    }
    // Report in millions above 1M tokens, thousands otherwise.
    const tokenStr = preview.estimatedTokens >= 1_000_000
        ? `~${(preview.estimatedTokens / 1_000_000).toFixed(1)}M`
        : `~${(preview.estimatedTokens / 1000).toFixed(0)}K`;
    out.push(`Estimated: ${tokenStr} tokens (~$${preview.estimatedCostUsd.toFixed(4)})`, '');
    out.push('Warnings:');
    if (preview.warnings.length === 0) {
        out.push('- None');
    }
    else {
        out.push(...preview.warnings.map((w) => `- ${w}`));
    }
    return out.join('\n');
}
|
|
95
|
+
/**
 * Render the list of indexed codebases, labeling each path with its
 * registered project name when one exists.
 * @param {CodebaseState[]} states
 * @returns {string} Multi-line listing.
 */
export function formatListIndexed(states) {
    // Invert the registry (name -> path) into path -> name for labeling.
    const registry = listProjects();
    const pathToProject = new Map(Object.entries(registry).map(([name, p]) => [p, name]));
    const out = [`Indexed Codebases (${states.length})`, ''];
    for (const s of states) {
        const projectName = pathToProject.get(s.path);
        out.push(projectName ? `${s.path} (project: ${projectName})` : s.path);
        out.push(` Status: ${s.status}`);
        // Truthiness checks deliberately hide zero/absent counts.
        if (s.totalFiles) {
            out.push(` Files: ${s.totalFiles}`);
        }
        if (s.totalChunks) {
            out.push(` Chunks: ${s.totalChunks}`);
        }
        if (s.lastIndexed) {
            out.push(` Last indexed: ${s.lastIndexed}`);
        }
        if (s.status === 'indexing' && s.progress !== undefined) {
            out.push(` Progress: ${s.progress}% — ${s.progressMessage ?? ''}`);
        }
        if (s.error) {
            out.push(` Error: ${s.error}`);
        }
        out.push('');
    }
    return out.join('\n');
}
|
|
119
|
+
/**
 * Render a confirmation message after documentation has been cached,
 * including a hint for how to search it.
 * @param {DocIndexResult} result
 * @returns {string} Multi-line report.
 */
export function formatDocIndexResult(result) {
    const out = [];
    out.push(`Documentation cached: ${result.library}/${result.topic}`, '');
    out.push(` Source: ${result.source}`);
    out.push(` Chunks: ${result.totalChunks}`);
    out.push(` Tokens: ~${(result.estimatedTokens / 1000).toFixed(0)}K`);
    out.push(` Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    out.push(` Collection: ${result.collectionName}`);
    out.push('', `Use \`lookup(query="...", library="${result.library}")\` to search this documentation.`);
    return out.join('\n');
}
|
|
133
|
+
/**
 * Render cached-documentation search hits as markdown sections, tagging
 * entries whose cached source is stale.
 * @param {DocSearchResult[]} results
 * @param {string} query The query the results answer.
 * @returns {string} Markdown report, or a not-found message.
 */
export function formatDocSearchResults(results, query) {
    if (results.length === 0) {
        return `No cached documentation found for "${query}".`;
    }
    const out = [`Found ${results.length} result(s) for "${query}" in cached docs:\n`];
    results.forEach((r, i) => {
        const staleTag = r.stale ? ' **[STALE]**' : '';
        out.push(`### Result ${i + 1}`);
        out.push(`**Library:** ${r.library}/${r.topic} | **Source:** ${r.source}${staleTag}`);
        out.push(`**Score:** ${r.score.toFixed(4)}`);
        out.push('```markdown', r.content, '```', '');
    });
    return out.join('\n');
}
|
|
151
|
+
//# sourceMappingURL=format.js.map
|