claude-eidetic 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config.d.ts +87 -0
- package/dist/config.js +65 -0
- package/dist/core/indexer.d.ts +18 -0
- package/dist/core/indexer.js +169 -0
- package/dist/core/preview.d.ts +14 -0
- package/dist/core/preview.js +61 -0
- package/dist/core/searcher.d.ts +24 -0
- package/dist/core/searcher.js +101 -0
- package/dist/core/snapshot-io.d.ts +6 -0
- package/dist/core/snapshot-io.js +39 -0
- package/dist/core/sync.d.ts +35 -0
- package/dist/core/sync.js +188 -0
- package/dist/embedding/factory.d.ts +17 -0
- package/dist/embedding/factory.js +41 -0
- package/dist/embedding/openai.d.ts +45 -0
- package/dist/embedding/openai.js +243 -0
- package/dist/embedding/truncate.d.ts +6 -0
- package/dist/embedding/truncate.js +14 -0
- package/dist/embedding/types.d.ts +18 -0
- package/dist/embedding/types.js +2 -0
- package/dist/errors.d.ts +17 -0
- package/dist/errors.js +21 -0
- package/dist/format.d.ts +12 -0
- package/dist/format.js +97 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +109 -0
- package/dist/infra/qdrant-bootstrap.d.ts +2 -0
- package/dist/infra/qdrant-bootstrap.js +94 -0
- package/dist/paths.d.ts +11 -0
- package/dist/paths.js +41 -0
- package/dist/splitter/ast.d.ts +13 -0
- package/dist/splitter/ast.js +169 -0
- package/dist/splitter/line.d.ts +14 -0
- package/dist/splitter/line.js +109 -0
- package/dist/splitter/types.d.ts +11 -0
- package/dist/splitter/types.js +2 -0
- package/dist/state/registry.d.ts +8 -0
- package/dist/state/registry.js +33 -0
- package/dist/state/snapshot.d.ts +26 -0
- package/dist/state/snapshot.js +101 -0
- package/dist/tool-schemas.d.ts +135 -0
- package/dist/tool-schemas.js +162 -0
- package/dist/tools.d.ts +40 -0
- package/dist/tools.js +169 -0
- package/dist/vectordb/milvus.d.ts +33 -0
- package/dist/vectordb/milvus.js +328 -0
- package/dist/vectordb/qdrant.d.ts +51 -0
- package/dist/vectordb/qdrant.js +241 -0
- package/dist/vectordb/types.d.ts +35 -0
- package/dist/vectordb/types.js +2 -0
- package/package.json +62 -0
package/dist/format.js
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { listProjects } from './state/registry.js';
|
|
2
|
+
/**
 * Wrap a plain string in the MCP tool-result envelope
 * ({ content: [{ type: 'text', text }] }).
 */
export function textResult(text) {
    const content = [{ type: 'text', text }];
    return { content };
}
|
|
5
|
+
/**
 * Render an indexing result as a markdown metrics table, followed by up to
 * ten parse-failure file names (plus an "... and N more" overflow line).
 */
export function formatIndexResult(result, normalizedPath) {
    // Label/value pairs for the metrics table, in display order.
    const rows = [
        ['Total files', `${result.totalFiles}`],
        ['Total chunks', `${result.totalChunks}`],
        ['Added files', `${result.addedFiles}`],
        ['Modified files', `${result.modifiedFiles}`],
        ['Removed files', `${result.removedFiles}`],
        ['Skipped (unchanged)', `${result.skippedFiles}`],
        ['Parse failures', `${result.parseFailures.length}`],
        ['Estimated tokens', `~${(result.estimatedTokens / 1000).toFixed(0)}K`],
        ['Estimated cost', `$${result.estimatedCostUsd.toFixed(4)}`],
        ['Duration', `${(result.durationMs / 1000).toFixed(1)}s`],
    ];
    const out = [
        `Indexing complete for ${normalizedPath}`,
        '',
        `| Metric | Value |`,
        `|--------|-------|`,
        ...rows.map(([metric, value]) => `| ${metric} | ${value} |`),
    ];
    const failures = result.parseFailures;
    if (failures.length > 0) {
        out.push('', '**Parse Failures:**');
        for (const file of failures.slice(0, 10)) {
            out.push(`- ${file}`);
        }
        if (failures.length > 10) {
            out.push(`- ... and ${failures.length - 10} more`);
        }
    }
    return out.join('\n');
}
|
|
35
|
+
/**
 * Render a dry-run indexing preview: per-extension file counts (descending),
 * total files, top directories, token/cost estimate, and warnings.
 */
export function formatPreview(preview, rootPath) {
    const out = [`Preview for ${rootPath}:`, ''];
    // Extension table, most common extension first.
    const extensions = Object.entries(preview.byExtension).sort(([, a], [, b]) => b - a);
    if (extensions.length > 0) {
        out.push('| Extension | Files |', '|-----------|-------|');
        out.push(...extensions.map(([ext, count]) => `| ${ext} | ${count.toLocaleString()} |`));
    }
    out.push(`Total: ${preview.totalFiles.toLocaleString()} files`, '');
    // Directories holding the most files.
    if (preview.topDirectories.length > 0) {
        out.push('Top directories:');
        out.push(...preview.topDirectories.map(({ dir, count }) => `  ${dir}/: ${count.toLocaleString()} files`));
        out.push('');
    }
    // Token estimate switches to "M" units at one million tokens.
    const tokenStr = preview.estimatedTokens >= 1_000_000
        ? `~${(preview.estimatedTokens / 1_000_000).toFixed(1)}M`
        : `~${(preview.estimatedTokens / 1000).toFixed(0)}K`;
    out.push(`Estimated: ${tokenStr} tokens (~$${preview.estimatedCostUsd.toFixed(4)})`, '');
    out.push('Warnings:');
    if (preview.warnings.length === 0) {
        out.push('- None');
    }
    else {
        out.push(...preview.warnings.map((w) => `- ${w}`));
    }
    return out.join('\n');
}
|
|
73
|
+
/**
 * Render indexed-codebase states as markdown sections, labeling each path
 * with its registered project name when the registry has one.
 */
export function formatListIndexed(states) {
    const registry = listProjects();
    // Invert the registry (name -> path) so each state's path maps to a name.
    const pathToProject = new Map(Object.entries(registry).map(([name, p]) => [p, name]));
    const out = [`## Indexed Codebases (${states.length})\n`];
    for (const s of states) {
        const projectName = pathToProject.get(s.path);
        const heading = projectName ? `${s.path} (project: \`${projectName}\`)` : s.path;
        out.push(`### ${heading}`, `- **Status:** ${s.status}`);
        if (s.totalFiles) {
            out.push(`- **Files:** ${s.totalFiles}`);
        }
        if (s.totalChunks) {
            out.push(`- **Chunks:** ${s.totalChunks}`);
        }
        if (s.lastIndexed) {
            out.push(`- **Last indexed:** ${s.lastIndexed}`);
        }
        // Progress line only while actively indexing and a percentage is known.
        if (s.status === 'indexing' && s.progress !== undefined) {
            out.push(`- **Progress:** ${s.progress}% — ${s.progressMessage ?? ''}`);
        }
        if (s.error) {
            out.push(`- **Error:** ${s.error}`);
        }
        out.push('');
    }
    return out.join('\n');
}
|
|
97
|
+
//# sourceMappingURL=format.js.map
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// CRITICAL: anything printed via console.log/console.warn must land on
// stderr — stdout is reserved for MCP protocol frames.
// NOTE(review): static ES-module imports are hoisted, so the imported modules
// below evaluate *before* these assignments run — verify none of them writes
// to stdout during module load.
const writeStderrLine = (prefix, args) => {
    process.stderr.write(prefix + args.join(' ') + '\n');
};
console.log = (...args) => writeStderrLine('[LOG] ', args);
console.warn = (...args) => writeStderrLine('[WARN] ', args);
|
|
10
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
11
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
12
|
+
import { ListToolsRequestSchema, CallToolRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
13
|
+
import { loadConfig } from './config.js';
|
|
14
|
+
import { createEmbedding } from './embedding/factory.js';
|
|
15
|
+
import { QdrantVectorDB } from './vectordb/qdrant.js';
|
|
16
|
+
import { bootstrapQdrant } from './infra/qdrant-bootstrap.js';
|
|
17
|
+
import { StateManager, cleanupOrphanedSnapshots } from './state/snapshot.js';
|
|
18
|
+
import { ToolHandlers } from './tools.js';
|
|
19
|
+
import { TOOL_DEFINITIONS } from './tool-schemas.js';
|
|
20
|
+
const WORKFLOW_GUIDANCE = `# Eidetic Code Search Workflow
|
|
21
|
+
|
|
22
|
+
**Before searching:** Ensure the codebase is indexed.
|
|
23
|
+
- \`list_indexed\` → see what's already indexed
|
|
24
|
+
- \`index_codebase(path="...", dryRun=true)\` → preview before indexing
|
|
25
|
+
- \`index_codebase(path="...")\` → index (incremental, only re-embeds changed files)
|
|
26
|
+
|
|
27
|
+
**Searching efficiently:**
|
|
28
|
+
- \`search_code(query="...")\` → returns compact table by default (~20 tokens/result)
|
|
29
|
+
- Review the table, then use Read tool to fetch full code for interesting results
|
|
30
|
+
- Add \`compact=false\` only when you need all code snippets immediately
|
|
31
|
+
- Use \`extensionFilter\` to narrow by file type
|
|
32
|
+
- Use \`project\` param instead of \`path\` for convenience
|
|
33
|
+
- Start with specific queries, broaden if no results
|
|
34
|
+
|
|
35
|
+
**After first index:**
|
|
36
|
+
- Re-indexing is incremental (only changed files re-embedded)
|
|
37
|
+
- Use \`project\` param instead of \`path\` for convenience
|
|
38
|
+
- Use \`get_indexing_status\` to check progress during long indexes
|
|
39
|
+
|
|
40
|
+
**Cross-project search:**
|
|
41
|
+
- Index multiple projects, each with its own path
|
|
42
|
+
- Search across any indexed project regardless of current working directory`;
|
|
43
|
+
/**
 * Boot the MCP server: load config, initialize the embedder, select and
 * bootstrap a vector DB (Milvus or Qdrant), clean orphaned snapshots, then
 * serve tool calls over stdio.
 */
async function main() {
    // Config drives both the embedder and the vector-DB branch below.
    const config = loadConfig();
    console.log(`Config loaded. Provider: ${config.vectordbProvider}, Model: ${config.embeddingModel}`);
    const embedding = createEmbedding(config);
    await embedding.initialize();
    let vectordb;
    if (config.vectordbProvider === 'milvus') {
        // Milvus driver is imported lazily so Qdrant users never pay for it.
        const { MilvusVectorDB } = await import('./vectordb/milvus.js');
        vectordb = new MilvusVectorDB();
        console.log(`Using Milvus at ${config.milvusAddress}`);
    }
    else {
        // Default path: reuse a reachable Qdrant or auto-provision via Docker.
        const qdrantUrl = await bootstrapQdrant();
        vectordb = new QdrantVectorDB(qdrantUrl);
        console.log(`Using Qdrant at ${qdrantUrl}`);
    }
    // Remove snapshot files whose backing collections are gone.
    const cleaned = await cleanupOrphanedSnapshots(vectordb);
    if (cleaned > 0) {
        console.log(`Cleaned ${cleaned} orphaned snapshot(s).`);
    }
    const state = new StateManager();
    const handlers = new ToolHandlers(embedding, vectordb, state);
    const server = new Server({ name: 'claude-eidetic', version: '0.1.0' }, { capabilities: { tools: {} } });
    server.setRequestHandler(ListToolsRequestSchema, async () => ({
        tools: [...TOOL_DEFINITIONS],
    }));
    // Dispatch tool calls by name. Unknown tools return a structured
    // isError result instead of throwing, so the client gets a response.
    server.setRequestHandler(CallToolRequestSchema, async (request) => {
        const { name, arguments: args } = request.params;
        switch (name) {
            case 'index_codebase':
                return handlers.handleIndexCodebase(args ?? {});
            case 'search_code':
                return handlers.handleSearchCode(args ?? {});
            case 'clear_index':
                return handlers.handleClearIndex(args ?? {});
            case 'get_indexing_status':
                return handlers.handleGetIndexingStatus(args ?? {});
            case 'list_indexed':
                return handlers.handleListIndexed();
            case '__IMPORTANT':
                // Synthetic tool: returns the static workflow guide above.
                return {
                    content: [{ type: 'text', text: WORKFLOW_GUIDANCE }],
                };
            default:
                return {
                    content: [{ type: 'text', text: `Unknown tool: ${name}` }],
                    isError: true,
                };
        }
    });
    // From here on, stdout carries only MCP frames (console.log is redirected).
    const transport = new StdioServerTransport();
    await server.connect(transport);
    console.log('Claude Eidetic MCP server started on stdio.');
}
|
|
97
|
+
// Exit cleanly on either standard termination signal; the MCP client
// owns this process's lifecycle.
for (const signal of ['SIGINT', 'SIGTERM']) {
    process.on(signal, () => {
        console.error(`Received ${signal}, shutting down...`);
        process.exit(0);
    });
}
// Any unhandled startup failure is fatal: report it and exit non-zero.
main().catch((err) => {
    console.error('Fatal error:', err);
    process.exit(1);
});
|
|
109
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { execFileSync } from 'node:child_process';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { BootstrapError } from '../errors.js';
|
|
4
|
+
import { getConfig } from '../config.js';
|
|
5
|
+
import { getDataDir } from '../paths.js';
|
|
6
|
+
// Name of the Docker container this server creates/starts for Qdrant.
const CONTAINER_NAME = 'eidetic-qdrant';
// Max time to wait for /healthz after (re)starting the container, and the
// polling interval used while waiting.
const HEALTH_TIMEOUT_MS = 30_000;
const HEALTH_POLL_MS = 500;
|
|
9
|
+
/**
 * Ensure a reachable Qdrant instance and return its URL.
 * Order: (1) use an already-healthy instance at config.qdrantUrl;
 * (2) otherwise start or create the dedicated Docker container and wait for
 * its health endpoint. Throws BootstrapError when neither path works.
 */
export async function bootstrapQdrant() {
    const config = getConfig();
    const url = config.qdrantUrl;
    // Fast path: something (local or remote) is already serving at the URL.
    if (await isQdrantHealthy(url)) {
        console.log(`Qdrant reachable at ${url}`);
        return url;
    }
    console.log(`Qdrant not reachable at ${url}. Attempting Docker auto-provision...`);
    if (!isDockerAvailable()) {
        throw new BootstrapError(`Qdrant not reachable at ${url} and Docker not found.\n` +
            `Either: (a) install Docker and retry, or (b) set QDRANT_URL to your Qdrant instance.`);
    }
    const containerState = getContainerState();
    if (containerState === 'running') {
        // Container is up but wasn't healthy above — poll for health below.
        console.log(`Container "${CONTAINER_NAME}" is running. Waiting for health...`);
    }
    else if (containerState === 'stopped') {
        console.log(`Container "${CONTAINER_NAME}" exists but stopped. Starting...`);
        execFileSync('docker', ['start', CONTAINER_NAME], { stdio: 'pipe' });
    }
    else {
        // No container yet: create one with persistent storage under the data
        // dir. Forward slashes keep the -v mount path valid on Windows too.
        const dataDir = path.join(getDataDir(), 'qdrant-data').replace(/\\/g, '/');
        console.log(`Creating new Qdrant container "${CONTAINER_NAME}"...`);
        execFileSync('docker', [
            'run', '-d',
            '--name', CONTAINER_NAME,
            '--restart', 'unless-stopped',
            '-p', '6333:6333',
            '-p', '6334:6334',
            '-v', `${dataDir}:/qdrant/storage`,
            'qdrant/qdrant',
        ], { stdio: 'pipe' });
    }
    const healthy = await waitForHealth(url, HEALTH_TIMEOUT_MS);
    if (!healthy) {
        throw new BootstrapError(`Qdrant container started but failed health check after ${HEALTH_TIMEOUT_MS / 1000}s. ` +
            `Check: docker logs ${CONTAINER_NAME}`);
    }
    console.log(`Qdrant auto-provisioned and healthy at ${url}`);
    return url;
}
|
|
50
|
+
// A 2xx from /healthz within 3 seconds counts as healthy; any network
// error, non-2xx status, or timeout counts as unhealthy.
async function isQdrantHealthy(url) {
    const healthUrl = `${url}/healthz`;
    return fetch(healthUrl, { signal: AbortSignal.timeout(3000) })
        .then((resp) => resp.ok)
        .catch(() => false);
}
|
|
59
|
+
// `docker info` exits non-zero (or docker isn't on PATH, or the daemon
// doesn't answer within 10s) exactly when Docker is unusable.
function isDockerAvailable() {
    try {
        execFileSync('docker', ['info'], { stdio: 'pipe', timeout: 10_000 });
    }
    catch {
        return false;
    }
    return true;
}
|
|
68
|
+
// Ask Docker for the state of the exact container name.
// Returns 'running', 'stopped' (exists but not running), or 'none'
// (no such container, or the docker CLI itself failed).
function getContainerState() {
    try {
        const state = execFileSync('docker', [
            'ps', '-a',
            '--filter', `name=^/${CONTAINER_NAME}$`,
            '--format', '{{.State}}',
        ], { encoding: 'utf-8', stdio: 'pipe' }).trim();
        if (state === '') {
            return 'none';
        }
        return state === 'running' ? 'running' : 'stopped';
    }
    catch {
        return 'none';
    }
}
|
|
85
|
+
// Poll the health endpoint every HEALTH_POLL_MS until it answers or the
// deadline passes. Resolves true on first healthy response, false on timeout.
async function waitForHealth(url, timeoutMs) {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
        if (await isQdrantHealthy(url)) {
            return true;
        }
        await new Promise((resolve) => setTimeout(resolve, HEALTH_POLL_MS));
    }
    return false;
}
|
|
94
|
+
//# sourceMappingURL=qdrant-bootstrap.js.map
|
package/dist/paths.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Normalize a path to forward slashes, resolve to absolute, remove trailing slash.
|
|
3
|
+
* This is the single source of truth for path handling — called at every boundary.
|
|
4
|
+
*/
|
|
5
|
+
export declare function normalizePath(inputPath: string): string;
|
|
6
|
+
export declare function getDataDir(): string;
|
|
7
|
+
export declare function getSnapshotDir(): string;
|
|
8
|
+
export declare function getCacheDir(): string;
|
|
9
|
+
export declare function getRegistryPath(): string;
|
|
10
|
+
export declare function pathToCollectionName(absolutePath: string): string;
|
|
11
|
+
//# sourceMappingURL=paths.d.ts.map
|
package/dist/paths.js
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import os from 'node:os';
|
|
3
|
+
import { getConfig } from './config.js';
|
|
4
|
+
/**
 * Normalize a path to forward slashes, resolve to absolute, remove trailing slash.
 * This is the single source of truth for path handling — called at every boundary.
 */
export function normalizePath(inputPath) {
    let resolved = inputPath;
    // Expand a leading "~" only when it actually refers to the current user's
    // home directory ("~" alone or "~/..."). Previously any leading "~" was
    // expanded, so a literal name like "~backup" (or "~otheruser/x") was
    // silently rewritten into the home directory.
    if (resolved === '~' || resolved.startsWith('~/')) {
        resolved = path.join(os.homedir(), resolved.slice(1));
    }
    resolved = path.resolve(resolved);
    // Forward slashes on every platform keep collection names and registry
    // keys stable across OSes.
    resolved = resolved.replace(/\\/g, '/');
    // Drop a single trailing slash, but never reduce the root "/" to "".
    if (resolved.length > 1 && resolved.endsWith('/')) {
        resolved = resolved.slice(0, -1);
    }
    return resolved;
}
|
|
20
|
+
/** Absolute, normalized root directory for all eidetic state on disk. */
export function getDataDir() {
    const { eideticDataDir } = getConfig();
    return normalizePath(eideticDataDir);
}
|
|
23
|
+
/** Directory holding per-project index snapshots. */
export function getSnapshotDir() {
    return getDataDir() + '/snapshots';
}
/** Directory holding cache artifacts. */
export function getCacheDir() {
    return getDataDir() + '/cache';
}
/** Location of the project-name registry file. */
export function getRegistryPath() {
    return getDataDir() + '/registry.json';
}
|
|
32
|
+
/**
 * Derive a stable vector-DB collection name from an absolute path:
 * lowercase, every run of non-alphanumerics squashed to one "_", edges
 * trimmed, prefixed with "eidetic_".
 */
export function pathToCollectionName(absolutePath) {
    const slug = normalizePath(absolutePath)
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, '_')
        .replace(/^_+|_+$/g, '');
    return `eidetic_${slug}`;
}
|
|
41
|
+
//# sourceMappingURL=paths.js.map
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Splitter, CodeChunk } from './types.js';
|
|
2
|
+
export declare class AstSplitter implements Splitter {
|
|
3
|
+
private parser;
|
|
4
|
+
private currentLang;
|
|
5
|
+
private static langCache;
|
|
6
|
+
private static resolveLanguage;
|
|
7
|
+
split(code: string, language: string, filePath: string): CodeChunk[];
|
|
8
|
+
static isSupported(language: string): boolean;
|
|
9
|
+
private extractChunks;
|
|
10
|
+
private refineChunks;
|
|
11
|
+
private splitLargeChunk;
|
|
12
|
+
}
|
|
13
|
+
//# sourceMappingURL=ast.d.ts.map
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { createRequire } from 'node:module';
|
|
2
|
+
// tree-sitter and language parsers are native CommonJS modules
|
|
3
|
+
const require = createRequire(import.meta.url);
|
|
4
|
+
const Parser = require('tree-sitter');
|
|
5
|
+
// Lazy-load language parsers to avoid startup cost for unused languages
|
|
6
|
+
// Lazily-required tree-sitter grammar modules, keyed by language name or
// alias. Each value is a factory so unused grammars are never loaded.
const languageParsers = {
    javascript: () => require('tree-sitter-javascript'),
    js: () => require('tree-sitter-javascript'),
    typescript: () => require('tree-sitter-typescript').typescript,
    ts: () => require('tree-sitter-typescript').typescript,
    tsx: () => require('tree-sitter-typescript').tsx,
    python: () => require('tree-sitter-python'),
    py: () => require('tree-sitter-python'),
    go: () => require('tree-sitter-go'),
    java: () => require('tree-sitter-java'),
    rust: () => require('tree-sitter-rust'),
    rs: () => require('tree-sitter-rust'),
    cpp: () => require('tree-sitter-cpp'),
    'c++': () => require('tree-sitter-cpp'),
    // NOTE(review): plain C reuses the C++ grammar — confirm tree-sitter-c
    // wasn't intended here.
    c: () => require('tree-sitter-cpp'),
    csharp: () => require('tree-sitter-c-sharp'),
    cs: () => require('tree-sitter-c-sharp'),
};
// AST node types that represent logical code units per language
// (keys are canonical names; aliases resolve through LANG_CANONICAL below).
const SPLITTABLE_TYPES = {
    javascript: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement'],
    typescript: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement', 'interface_declaration', 'type_alias_declaration'],
    tsx: ['function_declaration', 'arrow_function', 'class_declaration', 'method_definition', 'export_statement', 'interface_declaration', 'type_alias_declaration'],
    python: ['function_definition', 'class_definition', 'decorated_definition', 'async_function_definition'],
    java: ['method_declaration', 'class_declaration', 'interface_declaration', 'constructor_declaration'],
    cpp: ['function_definition', 'class_specifier', 'namespace_definition', 'declaration'],
    go: ['function_declaration', 'method_declaration', 'type_declaration', 'var_declaration', 'const_declaration'],
    rust: ['function_item', 'impl_item', 'struct_item', 'enum_item', 'trait_item', 'mod_item'],
    csharp: ['method_declaration', 'class_declaration', 'interface_declaration', 'struct_declaration', 'enum_declaration'],
};
// Map aliases to canonical language names for node type lookup
const LANG_CANONICAL = {
    js: 'javascript', ts: 'typescript', py: 'python',
    rs: 'rust', 'c++': 'cpp', c: 'cpp', cs: 'csharp',
};
// Upper bound on chunk size in characters; larger AST chunks get re-split.
const MAX_CHUNK_CHARS = 2500;
|
|
42
|
+
/**
 * Splits source code into chunks aligned with AST node boundaries using
 * tree-sitter. Returns [] whenever parsing isn't possible, so callers can
 * fall back to the line-based splitter.
 */
export class AstSplitter {
    parser = new Parser();
    // Canonical name of the language the parser is currently configured for.
    currentLang = '';
    // Shared across all AstSplitter instances — one cache per process
    static langCache = new Map();
    // Load (and cache) the grammar module for a language, or return null when
    // no grammar is registered or the native module fails to load.
    static resolveLanguage(lang) {
        // Resolve alias to canonical name first — prevents duplicate cache entries
        const canonical = LANG_CANONICAL[lang] ?? lang;
        const cached = AstSplitter.langCache.get(canonical);
        if (cached)
            return cached;
        const factory = languageParsers[canonical] ?? languageParsers[lang];
        if (!factory)
            return null;
        try {
            const mod = factory();
            AstSplitter.langCache.set(canonical, mod);
            return mod;
        }
        catch (err) {
            // Native grammar may be missing/broken on this platform; degrade gracefully.
            console.warn(`Failed to load tree-sitter parser for "${lang}": ${err}`);
            return null;
        }
    }
    // Parse `code` and emit one chunk per splittable AST node. Returns []
    // (caller falls back to the line splitter) when the language is
    // unsupported, parsing throws, or no splittable nodes are found.
    split(code, language, filePath) {
        const lang = language.toLowerCase();
        const canonical = LANG_CANONICAL[lang] ?? lang;
        const langModule = AstSplitter.resolveLanguage(lang);
        if (!langModule) {
            return []; // Caller should fall back to line splitter
        }
        try {
            // Skip setLanguage() if parser is already configured for this language
            if (canonical !== this.currentLang) {
                this.parser.setLanguage(langModule);
                this.currentLang = canonical;
            }
            const tree = this.parser.parse(code);
            if (!tree.rootNode)
                return [];
            const nodeTypes = SPLITTABLE_TYPES[canonical] ?? [];
            const rawChunks = this.extractChunks(tree.rootNode, code, nodeTypes, language, filePath);
            // If no meaningful chunks found, return empty (caller will use line splitter)
            if (rawChunks.length === 0)
                return [];
            return this.refineChunks(rawChunks);
        }
        catch (err) {
            console.warn(`AST parse failed for "${filePath}" (${language}): ${err}`);
            return []; // Caller should fall back to line splitter
        }
    }
    // True when a grammar (by name or alias) is registered for the language.
    static isSupported(language) {
        return language.toLowerCase() in languageParsers;
    }
    // Depth-first walk collecting the source text of every node whose type is
    // in `splittableTypes`. Line numbers are 1-based (tree-sitter rows are 0-based).
    // NOTE(review): the walk keeps descending into matched nodes, so nested
    // definitions (e.g. a method inside a matched class, or a declaration
    // inside an export_statement) are emitted both on their own and inside
    // the enclosing chunk — confirm this overlap is intended.
    extractChunks(node, code, splittableTypes, language, filePath) {
        const chunks = [];
        const traverse = (current) => {
            if (splittableTypes.includes(current.type)) {
                const text = code.slice(current.startIndex, current.endIndex);
                if (text.trim().length > 0) {
                    chunks.push({
                        content: text,
                        startLine: current.startPosition.row + 1,
                        endLine: current.endPosition.row + 1,
                        language,
                        filePath,
                    });
                }
            }
            for (const child of current.children) {
                traverse(child);
            }
        };
        traverse(node);
        return chunks;
    }
    // Pass small chunks through unchanged; re-split chunks above MAX_CHUNK_CHARS.
    refineChunks(chunks) {
        const result = [];
        for (const chunk of chunks) {
            if (chunk.content.length <= MAX_CHUNK_CHARS) {
                result.push(chunk);
            }
            else {
                result.push(...this.splitLargeChunk(chunk));
            }
        }
        return result;
    }
    // Greedily pack whole lines into sub-chunks of at most MAX_CHUNK_CHARS.
    // NOTE(review): unlike LineSplitter.splitLargeChunk, a single line longer
    // than MAX_CHUNK_CHARS is not hard-split here, so one sub-chunk can still
    // exceed the limit.
    splitLargeChunk(chunk) {
        const lines = chunk.content.split('\n');
        const subChunks = [];
        let current = '';
        let startLine = chunk.startLine;
        let lineCount = 0;
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            // Re-attach the newline split() removed, except after the last line.
            const addition = i < lines.length - 1 ? line + '\n' : line;
            if (current.length + addition.length > MAX_CHUNK_CHARS && current.length > 0) {
                subChunks.push({
                    content: current,
                    startLine,
                    endLine: startLine + lineCount - 1,
                    language: chunk.language,
                    filePath: chunk.filePath,
                });
                current = addition;
                startLine = chunk.startLine + i;
                lineCount = 1;
            }
            else {
                current += addition;
                lineCount++;
            }
        }
        // Emit the remainder unless it's only whitespace.
        if (current.trim().length > 0) {
            subChunks.push({
                content: current,
                startLine,
                endLine: startLine + lineCount - 1,
                language: chunk.language,
                filePath: chunk.filePath,
            });
        }
        return subChunks;
    }
}
|
|
169
|
+
//# sourceMappingURL=ast.js.map
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Splitter, CodeChunk } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Simple line-based splitter. Used as fallback when tree-sitter
|
|
4
|
+
* doesn't support the language or fails to parse.
|
|
5
|
+
*/
|
|
6
|
+
export declare class LineSplitter implements Splitter {
|
|
7
|
+
private chunkLines;
|
|
8
|
+
private overlapLines;
|
|
9
|
+
constructor(chunkLines?: number, overlapLines?: number);
|
|
10
|
+
split(code: string, language: string, filePath: string): CodeChunk[];
|
|
11
|
+
private refineChunks;
|
|
12
|
+
private splitLargeChunk;
|
|
13
|
+
}
|
|
14
|
+
//# sourceMappingURL=line.d.ts.map
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
// Tuning knobs for the fallback splitter: window size in lines, overlap
// between consecutive windows, and the hard per-chunk character cap.
const DEFAULT_CHUNK_LINES = 60;
const OVERLAP_LINES = 5;
const MAX_CHUNK_CHARS = 2500;
/**
 * Simple line-based splitter. Used as fallback when tree-sitter
 * doesn't support the language or fails to parse.
 */
export class LineSplitter {
    chunkLines;
    overlapLines;
    constructor(chunkLines = DEFAULT_CHUNK_LINES, overlapLines = OVERLAP_LINES) {
        this.chunkLines = chunkLines;
        // Cap the overlap below the window size so every step advances.
        this.overlapLines = Math.min(overlapLines, chunkLines - 1);
    }
    /**
     * Slice `code` into overlapping windows of whole lines, skipping
     * whitespace-only windows, then re-split anything over the char cap.
     */
    split(code, language, filePath) {
        const allLines = code.split('\n');
        if (allLines.length === 0) {
            return [];
        }
        const windows = [];
        for (let from = 0; from < allLines.length;) {
            const to = Math.min(from + this.chunkLines, allLines.length);
            const body = allLines.slice(from, to).join('\n');
            if (body.trim() !== '') {
                windows.push({
                    content: body,
                    startLine: from + 1,
                    endLine: to,
                    language,
                    filePath,
                });
            }
            // Step back by the overlap, but always advance at least one line.
            from = Math.max(from + 1, to - this.overlapLines);
        }
        return this.refineChunks(windows);
    }
    // Oversized windows are re-split by characters; the rest pass through.
    refineChunks(chunks) {
        return chunks.flatMap((chunk) => chunk.content.length <= MAX_CHUNK_CHARS
            ? [chunk]
            : this.splitLargeChunk(chunk));
    }
    // Pack whole lines into pieces of at most MAX_CHUNK_CHARS; a single line
    // longer than the cap is hard-split by characters.
    splitLargeChunk(chunk) {
        const rows = chunk.content.split('\n');
        const pieces = [];
        let buffer = '';
        let bufferStart = chunk.startLine;
        let bufferRows = 0;
        // Push the buffered text (unless whitespace-only) and clear the buffer.
        const emitBuffer = () => {
            if (buffer.trim() !== '') {
                pieces.push({
                    content: buffer,
                    startLine: bufferStart,
                    endLine: bufferStart + bufferRows - 1,
                    language: chunk.language,
                    filePath: chunk.filePath,
                });
            }
            buffer = '';
            bufferRows = 0;
        };
        rows.forEach((row, i) => {
            // Re-attach the newline split() removed, except after the last row.
            const piece = i === rows.length - 1 ? row : row + '\n';
            // Close the buffer when adding this row would overflow it.
            if (buffer !== '' && buffer.length + piece.length > MAX_CHUNK_CHARS) {
                emitBuffer();
                bufferStart = chunk.startLine + i;
            }
            if (piece.length > MAX_CHUNK_CHARS) {
                // One row alone exceeds the cap: flush anything buffered, then
                // hard-split the row itself into fixed-size character slices.
                if (buffer !== '') {
                    emitBuffer();
                    bufferStart = chunk.startLine + i;
                }
                const rowLine = chunk.startLine + i;
                for (let at = 0; at < piece.length; at += MAX_CHUNK_CHARS) {
                    const fragment = piece.slice(at, at + MAX_CHUNK_CHARS);
                    if (fragment.trim() !== '') {
                        pieces.push({
                            content: fragment,
                            startLine: rowLine,
                            endLine: rowLine,
                            language: chunk.language,
                            filePath: chunk.filePath,
                        });
                    }
                }
                bufferStart = chunk.startLine + i + 1;
            }
            else {
                buffer += piece;
                bufferRows++;
            }
        });
        emitBuffer();
        return pieces;
    }
}
|
|
109
|
+
//# sourceMappingURL=line.js.map
|