codexa 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +916 -0
- package/bin/codexa.js +2 -0
- package/dist/agent.js +64 -0
- package/dist/chunker.js +50 -0
- package/dist/cli.js +95 -0
- package/dist/config.js +97 -0
- package/dist/db.js +140 -0
- package/dist/embeddings/index.js +148 -0
- package/dist/ingest.js +69 -0
- package/dist/models/index.js +130 -0
- package/dist/retriever.js +22 -0
- package/dist/types.js +2 -0
- package/dist/utils/logger.js +13 -0
- package/package.json +79 -0
package/bin/codexa.js
ADDED
package/dist/agent.js
ADDED
@@ -0,0 +1,64 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.askQuestion = askQuestion;
+const node_path_1 = __importDefault(require("node:path"));
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const retriever_1 = require("./retriever");
+const models_1 = require("./models");
+const SYSTEM_PROMPT = `
+You are RepoSage.
+You answer questions about a codebase using ONLY the provided code snippets.
+
+Rules:
+- Use the CODE_SNIPPET sections only.
+- Do NOT hallucinate missing files.
+- If the context does not contain enough information, say:
+  "The provided context does not contain that information."
+- Keep answers short, direct, and technical.
+`;
+async function askQuestion(cwd, config, options) {
+    const { question, session = 'default' } = options;
+    const history = await loadHistory(config.historyDir, session);
+    const matches = await (0, retriever_1.retrieveContext)(question, config);
+    if (matches.length === 0) {
+        throw new Error('No chunks found. Run `agent ingest` first.');
+    }
+    const context = (0, retriever_1.formatContext)(matches);
+    const messages = [
+        { role: 'system', content: SYSTEM_PROMPT },
+        ...history,
+        {
+            role: 'user',
+            content: `CONTEXT:\n${context}\n\nQUESTION: ${question}\nANSWER:`,
+        },
+    ];
+    const llm = (0, models_1.createLLMClient)(config);
+    let finalAnswer = '';
+    const answer = await llm.generate(messages, {
+        stream: options.stream,
+        onToken: (token) => {
+            finalAnswer += token;
+            if (options.onToken)
+                options.onToken(token);
+        },
+    });
+    if (!options.stream) {
+        finalAnswer = answer;
+    }
+    return finalAnswer;
+}
+async function loadHistory(dir, id) {
+    await fs_extra_1.default.ensureDir(dir);
+    const file = node_path_1.default.join(dir, `${id}.json`);
+    if (!(await fs_extra_1.default.pathExists(file)))
+        return [];
+    return fs_extra_1.default.readJson(file);
+}
+// async function saveHistory(dir: string, id: string, messages: any[]) {
+//   await fs.ensureDir(dir);
+//   const file = path.join(dir, `${id}.json`);
+//   await fs.writeJson(file, messages, { spaces: 2 });
+// }
package/dist/chunker.js
ADDED
@@ -0,0 +1,50 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.chunkFile = chunkFile;
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const node_readline_1 = __importDefault(require("node:readline"));
+async function chunkFile(filePath, maxChunkSize, overlap) {
+    const chunks = [];
+    const fileStream = fs_extra_1.default.createReadStream(filePath, { encoding: 'utf8' });
+    const rl = node_readline_1.default.createInterface({
+        input: fileStream,
+        crlfDelay: Infinity,
+    });
+    const buffer = [];
+    let currentSize = 0;
+    let startLine = 1;
+    let lineNumber = 0;
+    for await (const line of rl) {
+        lineNumber += 1;
+        buffer.push(line);
+        currentSize += line.length;
+        const shouldFlush = currentSize >= maxChunkSize;
+        if (shouldFlush) {
+            const content = buffer.join('\n');
+            chunks.push({
+                filePath,
+                startLine,
+                endLine: lineNumber,
+                content,
+            });
+            const overlapStart = Math.max(0, buffer.length - overlap);
+            const overlapBuffer = buffer.slice(overlapStart);
+            startLine = lineNumber - overlapBuffer.length + 1;
+            buffer.length = 0;
+            buffer.push(...overlapBuffer);
+            currentSize = buffer.reduce((acc, curr) => acc + curr.length, 0);
+        }
+    }
+    if (buffer.length > 0) {
+        chunks.push({
+            filePath,
+            startLine,
+            endLine: lineNumber,
+            content: buffer.join('\n'),
+        });
+    }
+    return chunks;
+}
package/dist/cli.js
ADDED
@@ -0,0 +1,95 @@
+#!/usr/bin/env node
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const commander_1 = require("commander");
+const ora_1 = __importDefault(require("ora"));
+const config_1 = require("./config");
+const ingest_1 = require("./ingest");
+const agent_1 = require("./agent");
+const logger_1 = require("./utils/logger");
+const program = new commander_1.Command();
+program
+    .name('codexa')
+    .description('Ask questions about any local repository from the command line.')
+    .version('0.1.0');
+program
+    .command('init')
+    .description('Create a local .codexarc.json with sensible defaults.')
+    .action(async () => {
+    const cwd = process.cwd();
+    await (0, config_1.ensureConfig)(cwd);
+    logger_1.log.success('Created .codexarc.json. Update it with your provider keys if needed.');
+});
+program
+    .command('ingest')
+    .option('-f, --force', 'clear the previous index before ingesting', false)
+    .description('Chunk the current repository and store embeddings locally.')
+    .action(async (options) => {
+    const cwd = process.cwd();
+    const config = await (0, config_1.ensureConfig)(cwd);
+    try {
+        await (0, ingest_1.ingestRepository)({ cwd, config, force: options.force });
+    }
+    catch (error) {
+        console.log('Ingestion failed.');
+        handleError(error);
+    }
+});
+program
+    .command('ask')
+    .description('Ask a natural-language question about the current repo.')
+    .argument('<question...>', 'Question to ask about the codebase.')
+    .option('-s, --session <name>', 'session identifier to keep conversation context', 'default')
+    .option('--no-stream', 'disable streaming output')
+    .action(async (question, options) => {
+    const cwd = process.cwd();
+    const config = await (0, config_1.loadConfig)(cwd);
+    const prompt = question.join(' ');
+    // Commander behavior:
+    //   default: stream = true
+    //   --no-stream => stream = false
+    const stream = options.stream !== false;
+    const spinner = (0, ora_1.default)('Extracting Response...').start();
+    try {
+        const answer = await (0, agent_1.askQuestion)(cwd, config, {
+            question: prompt,
+            session: options.session,
+            stream,
+            onToken: stream
+                ? (token) => {
+                    if (spinner.isSpinning)
+                        spinner.stop();
+                    process.stdout.write(token);
+                }
+                : undefined,
+            onStatusUpdate: (status) => {
+                if (!stream)
+                    spinner.text = status;
+            },
+        });
+        spinner.stop();
+        if (!stream) {
+            console.log('\n' + answer.trim() + '\n');
+        }
+        else {
+            console.log('\n');
+        }
+    }
+    catch (error) {
+        spinner.fail('Question failed.');
+        handleError(error);
+    }
+});
+program.parseAsync(process.argv).catch(handleError);
+function handleError(error) {
+    if (error instanceof Error) {
+        logger_1.log.error(error.message);
+    }
+    else {
+        console.error(error);
+    }
+    process.exitCode = 1;
+}
package/dist/config.js
ADDED
@@ -0,0 +1,97 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.ensureConfig = ensureConfig;
+exports.loadConfig = loadConfig;
+exports.saveConfig = saveConfig;
+const node_path_1 = __importDefault(require("node:path"));
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const dotenv_1 = __importDefault(require("dotenv"));
+dotenv_1.default.config();
+const CONFIG_FILENAME = '.codexarc.json';
+const DEFAULT_CONFIG = {
+    modelProvider: 'groq',
+    model: 'llama-3.1-8b-instant',
+    embeddingProvider: 'local',
+    embeddingModel: 'Xenova/all-MiniLM-L6-v2',
+    localModelUrl: 'http://localhost:11434',
+    localModelApiKey: '',
+    maxChunkSize: 300,
+    chunkOverlap: 30,
+    includeGlobs: [
+        '**/*.ts',
+        '**/*.tsx',
+        '**/*.js',
+        '**/*.jsx',
+        '**/*.py',
+        '**/*.go',
+        '**/*.rs',
+        '**/*.java',
+        '**/*.md',
+        '**/*.json',
+    ],
+    excludeGlobs: [
+        'node_modules/**',
+        '.git/**',
+        'dist/**',
+        'build/**',
+        '.codexa/**',
+        'package-lock.json',
+    ],
+    historyDir: '.codexa/sessions',
+    dbPath: '.codexa/index.db',
+    temperature: 0.2,
+    topK: 5,
+};
+async function ensureConfig(cwd) {
+    const configPath = node_path_1.default.join(cwd, CONFIG_FILENAME);
+    if (!(await fs_extra_1.default.pathExists(configPath))) {
+        await fs_extra_1.default.writeJson(configPath, DEFAULT_CONFIG, { spaces: 2 });
+    }
+    return loadConfig(cwd);
+}
+async function loadConfig(cwd) {
+    const configPath = node_path_1.default.join(cwd, CONFIG_FILENAME);
+    let config;
+    if (!(await fs_extra_1.default.pathExists(configPath))) {
+        config = { ...DEFAULT_CONFIG };
+    }
+    else {
+        config = {
+            ...DEFAULT_CONFIG,
+            ...(await fs_extra_1.default.readJson(configPath)),
+        };
+    }
+    return hydratePaths(cwd, config);
+}
+async function saveConfig(cwd, config) {
+    const configPath = node_path_1.default.join(cwd, CONFIG_FILENAME);
+    const dehydrated = dehydratePaths(cwd, config);
+    await fs_extra_1.default.writeJson(configPath, dehydrated, { spaces: 2 });
+}
+function hydratePaths(cwd, config) {
+    const clone = { ...config };
+    clone.dbPath = resolveDataPath(cwd, config.dbPath);
+    clone.historyDir = resolveDataPath(cwd, config.historyDir);
+    return clone;
+}
+function dehydratePaths(cwd, config) {
+    const clone = { ...config };
+    clone.dbPath = makeRelative(cwd, config.dbPath);
+    clone.historyDir = makeRelative(cwd, config.historyDir);
+    return clone;
+}
+function resolveDataPath(cwd, maybeRelative) {
+    if (node_path_1.default.isAbsolute(maybeRelative)) {
+        return maybeRelative;
+    }
+    return node_path_1.default.join(cwd, maybeRelative);
+}
+function makeRelative(cwd, maybeAbsolute) {
+    if (!node_path_1.default.isAbsolute(maybeAbsolute)) {
+        return maybeAbsolute;
+    }
+    return node_path_1.default.relative(cwd, maybeAbsolute);
+}
package/dist/db.js
ADDED
@@ -0,0 +1,140 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.VectorStore = void 0;
+const node_path_1 = __importDefault(require("node:path"));
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const better_sqlite3_1 = __importDefault(require("better-sqlite3"));
+function cosineSimilarity(a, b) {
+    if (a.length !== b.length) {
+        return 0;
+    }
+    let dot = 0;
+    let normA = 0;
+    let normB = 0;
+    const len = a.length;
+    for (let i = 0; i < len; i++) {
+        const ai = a[i];
+        const bi = b[i];
+        dot += ai * bi;
+        normA += ai * ai;
+        normB += bi * bi;
+    }
+    if (normA === 0 || normB === 0) {
+        return 0;
+    }
+    const sqrtNormA = Math.sqrt(normA);
+    const sqrtNormB = Math.sqrt(normB);
+    return dot / (sqrtNormA * sqrtNormB);
+}
+class VectorStore {
+    dbPath;
+    db = null;
+    constructor(dbPath) {
+        this.dbPath = dbPath;
+    }
+    init() {
+        if (this.db) {
+            return;
+        }
+        const dir = node_path_1.default.dirname(this.dbPath);
+        fs_extra_1.default.ensureDirSync(dir);
+        this.db = new better_sqlite3_1.default(this.dbPath);
+        this.db.pragma('journal_mode = WAL');
+        this.db
+            .prepare(`CREATE TABLE IF NOT EXISTS chunks (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        file_path TEXT NOT NULL,
+        start_line INTEGER NOT NULL,
+        end_line INTEGER NOT NULL,
+        content TEXT NOT NULL,
+        compressed TEXT,
+        embedding TEXT NOT NULL
+      )`)
+            .run();
+    }
+    clear() {
+        const db = this.connection;
+        db.prepare('DELETE FROM chunks').run();
+    }
+    insertChunks(chunks) {
+        const db = this.connection;
+        const insert = db.prepare(`INSERT INTO chunks (file_path, start_line, end_line, content, compressed, embedding)
+      VALUES (@filePath, @startLine, @endLine, @content, @compressed, @embedding)`);
+        const tx = db.transaction((rows) => {
+            rows.forEach((chunk) => {
+                if (!chunk.embedding) {
+                    throw new Error(`Chunk missing embedding for ${chunk.filePath}`);
+                }
+                insert.run({
+                    filePath: chunk.filePath,
+                    startLine: chunk.startLine,
+                    endLine: chunk.endLine,
+                    content: chunk.content,
+                    compressed: chunk.compressed ?? '',
+                    embedding: JSON.stringify(chunk.embedding),
+                });
+            });
+        });
+        tx(chunks);
+    }
+    search(queryEmbedding, topK) {
+        const db = this.connection;
+        const rows = db.prepare('SELECT * FROM chunks').all();
+        if (rows.length === 0) {
+            return [];
+        }
+        // if (rows.length > 1000) {
+        //   console.error(`Searching through ${rows.length} chunks...`);
+        // }
+        // min-heap approach: keep only the top K results
+        // avoids sorting all results
+        const topResults = [];
+        const minScore = { value: -Infinity };
+        for (const row of rows) {
+            const embedding = JSON.parse(row.embedding);
+            const score = cosineSimilarity(queryEmbedding, embedding);
+            if (topResults.length >= topK && score <= minScore.value) {
+                continue;
+            }
+            const result = {
+                filePath: row.file_path,
+                startLine: row.start_line,
+                endLine: row.end_line,
+                content: row.content,
+                compressed: row.compressed ?? '',
+                embedding,
+                score,
+            };
+            if (topResults.length < topK) {
+                topResults.push(result);
+                if (result.score < minScore.value) {
+                    minScore.value = result.score;
+                }
+                if (topResults.length === topK) {
+                    topResults.sort((a, b) => a.score - b.score);
+                    minScore.value = topResults[0].score;
+                }
+            }
+            else if (score > minScore.value) {
+                topResults[0] = result;
+                topResults.sort((a, b) => a.score - b.score);
+                minScore.value = topResults[0].score;
+            }
+        }
+        const finalResults = topResults.sort((a, b) => b.score - a.score);
+        // if (rows.length > 1000) {
+        //   console.error(`Search complete, returning top ${finalResults.length} results`);
+        // }
+        return finalResults;
+    }
+    get connection() {
+        if (!this.db) {
+            this.init();
+        }
+        return this.db;
+    }
+}
+exports.VectorStore = VectorStore;
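Despite the "min-heap" comment, search keeps a sorted array of at most topK results and evicts through topResults[0] (the current minimum), re-sorting on each eviction; it computes the same selection a heap would, just in O(n · k log k). A condensed sketch of the scoring and eviction loop, with inline vectors standing in for rows read from SQLite:

// Standalone top-K selection over cosine scores, mirroring VectorStore.search.
function cosineSimilarity(a, b) {
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return normA && normB ? dot / (Math.sqrt(normA) * Math.sqrt(normB)) : 0;
}

function topK(query, vectors, k) {
    const top = []; // kept ascending by score once it holds k entries
    for (const v of vectors) {
        const score = cosineSimilarity(query, v.embedding);
        if (top.length < k) {
            top.push({ ...v, score });
            if (top.length === k) top.sort((a, b) => a.score - b.score);
        } else if (score > top[0].score) {
            top[0] = { ...v, score };          // evict the current minimum
            top.sort((a, b) => a.score - b.score);
        }
    }
    return top.sort((a, b) => b.score - a.score); // best match first
}

console.log(topK([1, 0], [
    { id: 'x', embedding: [1, 0] },  // score 1.0
    { id: 'y', embedding: [0, 1] },  // score 0.0
    { id: 'z', embedding: [1, 1] },  // score ~0.707
], 2).map((r) => r.id)); // -> ['x', 'z']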
package/dist/embeddings/index.js
ADDED
@@ -0,0 +1,148 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.createEmbedder = createEmbedder;
+const openai_1 = __importDefault(require("openai"));
+class OpenAIEmbedder {
+    model;
+    client;
+    constructor(model, apiKey) {
+        this.model = model;
+        if (!apiKey) {
+            throw new Error('OPENAI_API_KEY is required for the OpenAI embedding provider.');
+        }
+        this.client = new openai_1.default({
+            apiKey,
+        });
+    }
+    async embed(texts) {
+        const res = await this.client.embeddings.create({
+            model: this.model,
+            input: texts,
+        });
+        return res.data.map((item) => item.embedding);
+    }
+}
+class LocalEmbedder {
+    model;
+    pipeline;
+    pipelinePromise;
+    constructor(model) {
+        this.model = model;
+    }
+    async getPipeline() {
+        if (this.pipeline) {
+            return this.pipeline;
+        }
+        if (!this.pipelinePromise) {
+            this.pipelinePromise = (async () => {
+                try {
+                    const transformersModule = await Promise.resolve().then(() => __importStar(require('@xenova/transformers')));
+                    const { pipeline } = transformersModule;
+                    this.pipeline = await pipeline('feature-extraction', this.model);
+                    // console.log(`Pipeline created successfully with ${this.model}`);
+                    return this.pipeline;
+                }
+                catch (error) {
+                    const errorMessage = error instanceof Error ? error.message : String(error);
+                    throw new Error(`Failed to load embedding model "${this.model}": ${errorMessage}`);
+                }
+            })();
+        }
+        return this.pipelinePromise;
+    }
+    async preload() {
+        await this.getPipeline();
+        console.error(`Embedding model preloaded\n`);
+    }
+    async embed(texts) {
+        const pipe = await this.getPipeline();
+        const embeddingPromises = texts.map(async (text) => {
+            const output = await pipe(text, {
+                pooling: 'mean',
+                normalize: true,
+            });
+            return this.extractEmbedding(output);
+        });
+        return Promise.all(embeddingPromises);
+    }
+    extractEmbedding(output) {
+        const data = output?.data;
+        if (data) {
+            if (Array.isArray(data)) {
+                return data;
+            }
+            else if (data instanceof Float32Array || data instanceof Float64Array) {
+                return Array.from(data);
+            }
+            else if (typeof data === 'object' && data !== null && 'tolist' in data) {
+                return data.tolist();
+            }
+            else {
+                return Array.from(data);
+            }
+        }
+        else if (Array.isArray(output)) {
+            return output;
+        }
+        else {
+            throw new Error(`Unexpected output format from embedding model`);
+        }
+    }
+}
+// Cache embedders to avoid reloading models on every query
+const embedderCache = new Map();
+function getCacheKey(config) {
+    return `${config.embeddingProvider}:${config.embeddingModel}`;
+}
+async function createEmbedder(config) {
+    const cacheKey = getCacheKey(config);
+    const cached = embedderCache.get(cacheKey);
+    if (cached) {
+        return cached;
+    }
+    let embedder;
+    if (config.embeddingProvider === 'local') {
+        embedder = new LocalEmbedder(config.embeddingModel);
+    }
+    else {
+        embedder = new OpenAIEmbedder(config.embeddingModel, process.env.OPENAI_API_KEY);
+    }
+    embedderCache.set(cacheKey, embedder);
+    return embedder;
+}
package/dist/ingest.js
ADDED
@@ -0,0 +1,69 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.ingestRepository = ingestRepository;
+const node_path_1 = __importDefault(require("node:path"));
+const globby_1 = require("globby");
+const chunker_1 = require("./chunker");
+const embeddings_1 = require("./embeddings");
+const db_1 = require("./db");
+const ora_1 = __importDefault(require("ora"));
+function compressText(text, cap = 450) {
+    return text
+        .replace(/\/\*[\s\S]*?\*\//g, '')
+        .replace(/\/\/.*/g, '')
+        .replace(/\s+/g, ' ')
+        .trim()
+        .slice(0, cap);
+}
+function tick() {
+    return new Promise((resolve) => setImmediate(resolve));
+}
+async function ingestRepository({ cwd, config, force = false, }) {
+    const spinnerFiles = (0, ora_1.default)('Finding files...').start();
+    const files = await (0, globby_1.globby)(config.includeGlobs, {
+        cwd,
+        gitignore: true,
+        ignore: config.excludeGlobs,
+        absolute: true,
+        onlyFiles: true,
+    });
+    if (!files.length) {
+        spinnerFiles.fail('No matching files found.');
+        return;
+    }
+    spinnerFiles.succeed(`Found ${files.length} files`);
+    const spinnerChunk = (0, ora_1.default)('Chunking files...').start();
+    const chunks = [];
+    for (const file of files) {
+        const ch = await (0, chunker_1.chunkFile)(file, config.maxChunkSize, config.chunkOverlap);
+        ch.forEach((c) => (c.filePath = node_path_1.default.relative(cwd, c.filePath)));
+        chunks.push(...ch);
+        await tick();
+    }
+    spinnerChunk.succeed(`Chunked files (${chunks.length} chunks)`);
+    const spinnerCompress = (0, ora_1.default)('Compressing chunks...').start();
+    chunks.forEach((c) => (c.compressed = compressText(c.content)));
+    spinnerCompress.succeed('Compression complete');
+    const spinnerEmbed = (0, ora_1.default)('Embedding chunks...').start();
+    const embedder = await (0, embeddings_1.createEmbedder)(config);
+    const batchSize = 32;
+    for (let i = 0; i < chunks.length; i += batchSize) {
+        const batch = chunks.slice(i, i + batchSize);
+        const texts = batch.map((c) => c.compressed);
+        const vectors = await embedder.embed(texts);
+        batch.forEach((c, idx) => (c.embedding = vectors[idx]));
+        await tick();
+    }
+    spinnerEmbed.succeed('Embedding complete');
+    const spinnerStore = (0, ora_1.default)('Storing chunks...').start();
+    const store = new db_1.VectorStore(config.dbPath);
+    store.init();
+    if (force)
+        store.clear();
+    store.insertChunks(chunks);
+    spinnerStore.succeed('Stored successfully');
+    (0, ora_1.default)().succeed('Ingestion complete!');
+}