@milo4jo/contextkit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +181 -0
- package/dist/commands/index-cmd.d.ts +3 -0
- package/dist/commands/index-cmd.d.ts.map +1 -0
- package/dist/commands/index-cmd.js +111 -0
- package/dist/commands/index-cmd.js.map +1 -0
- package/dist/commands/init.d.ts +3 -0
- package/dist/commands/init.d.ts.map +1 -0
- package/dist/commands/init.js +56 -0
- package/dist/commands/init.js.map +1 -0
- package/dist/commands/select.d.ts +3 -0
- package/dist/commands/select.d.ts.map +1 -0
- package/dist/commands/select.js +64 -0
- package/dist/commands/select.js.map +1 -0
- package/dist/commands/source/add.d.ts +3 -0
- package/dist/commands/source/add.d.ts.map +1 -0
- package/dist/commands/source/add.js +87 -0
- package/dist/commands/source/add.js.map +1 -0
- package/dist/commands/source/index.d.ts +3 -0
- package/dist/commands/source/index.d.ts.map +1 -0
- package/dist/commands/source/index.js +14 -0
- package/dist/commands/source/index.js.map +1 -0
- package/dist/commands/source/list.d.ts +3 -0
- package/dist/commands/source/list.d.ts.map +1 -0
- package/dist/commands/source/list.js +46 -0
- package/dist/commands/source/list.js.map +1 -0
- package/dist/commands/source/remove.d.ts +3 -0
- package/dist/commands/source/remove.d.ts.map +1 -0
- package/dist/commands/source/remove.js +38 -0
- package/dist/commands/source/remove.js.map +1 -0
- package/dist/commands/source.d.ts +3 -0
- package/dist/commands/source.d.ts.map +1 -0
- package/dist/commands/source.js +153 -0
- package/dist/commands/source.js.map +1 -0
- package/dist/config/index.d.ts +38 -0
- package/dist/config/index.d.ts.map +1 -0
- package/dist/config/index.js +100 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/types.d.ts +21 -0
- package/dist/config/types.d.ts.map +1 -0
- package/dist/config/types.js +5 -0
- package/dist/config/types.js.map +1 -0
- package/dist/db/index.d.ts +14 -0
- package/dist/db/index.d.ts.map +1 -0
- package/dist/db/index.js +63 -0
- package/dist/db/index.js.map +1 -0
- package/dist/errors/index.d.ts +30 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +51 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +90 -0
- package/dist/index.js.map +1 -0
- package/dist/indexer/chunker.d.ts +44 -0
- package/dist/indexer/chunker.d.ts.map +1 -0
- package/dist/indexer/chunker.js +102 -0
- package/dist/indexer/chunker.js.map +1 -0
- package/dist/indexer/discovery.d.ts +34 -0
- package/dist/indexer/discovery.d.ts.map +1 -0
- package/dist/indexer/discovery.js +66 -0
- package/dist/indexer/discovery.js.map +1 -0
- package/dist/indexer/embeddings.d.ts +32 -0
- package/dist/indexer/embeddings.d.ts.map +1 -0
- package/dist/indexer/embeddings.js +85 -0
- package/dist/indexer/embeddings.js.map +1 -0
- package/dist/indexer/index.d.ts +37 -0
- package/dist/indexer/index.d.ts.map +1 -0
- package/dist/indexer/index.js +123 -0
- package/dist/indexer/index.js.map +1 -0
- package/dist/selector/budget.d.ts +26 -0
- package/dist/selector/budget.d.ts.map +1 -0
- package/dist/selector/budget.js +75 -0
- package/dist/selector/budget.js.map +1 -0
- package/dist/selector/formatter.d.ts +44 -0
- package/dist/selector/formatter.d.ts.map +1 -0
- package/dist/selector/formatter.js +114 -0
- package/dist/selector/formatter.js.map +1 -0
- package/dist/selector/index.d.ts +37 -0
- package/dist/selector/index.d.ts.map +1 -0
- package/dist/selector/index.js +67 -0
- package/dist/selector/index.js.map +1 -0
- package/dist/selector/scoring.d.ts +26 -0
- package/dist/selector/scoring.d.ts.map +1 -0
- package/dist/selector/scoring.js +113 -0
- package/dist/selector/scoring.js.map +1 -0
- package/dist/selector/search.d.ts +29 -0
- package/dist/selector/search.d.ts.map +1 -0
- package/dist/selector/search.js +48 -0
- package/dist/selector/search.js.map +1 -0
- package/dist/utils/cli.d.ts +20 -0
- package/dist/utils/cli.d.ts.map +1 -0
- package/dist/utils/cli.js +19 -0
- package/dist/utils/cli.js.map +1 -0
- package/dist/utils/format.d.ts +30 -0
- package/dist/utils/format.d.ts.map +1 -0
- package/dist/utils/format.js +44 -0
- package/dist/utils/format.js.map +1 -0
- package/dist/utils/output.d.ts +42 -0
- package/dist/utils/output.d.ts.map +1 -0
- package/dist/utils/output.js +62 -0
- package/dist/utils/output.js.map +1 -0
- package/dist/utils/prompts.d.ts +23 -0
- package/dist/utils/prompts.d.ts.map +1 -0
- package/dist/utils/prompts.js +46 -0
- package/dist/utils/prompts.js.map +1 -0
- package/dist/utils/streams.d.ts +40 -0
- package/dist/utils/streams.d.ts.map +1 -0
- package/dist/utils/streams.js +61 -0
- package/dist/utils/streams.js.map +1 -0
- package/package.json +67 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { Command } from 'commander';
|
|
3
|
+
import { readFileSync } from 'fs';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
import { dirname, join } from 'path';
|
|
6
|
+
import { initCommand } from './commands/init.js';
|
|
7
|
+
import { sourceCommand } from './commands/source/index.js';
|
|
8
|
+
import { indexCommand } from './commands/index-cmd.js';
|
|
9
|
+
import { selectCommand } from './commands/select.js';
|
|
10
|
+
import { ContextKitError, InvalidUsageError } from './errors/index.js';
|
|
11
|
+
import { writeError, writeMessage } from './utils/streams.js';
|
|
12
|
+
// Read the CLI version from the package.json that sits one directory above this module.
const moduleFile = fileURLToPath(import.meta.url);
const __dirname = dirname(moduleFile);
const manifestPath = join(__dirname, '..', 'package.json');
const pkg = JSON.parse(readFileSync(manifestPath, 'utf-8'));
const { version: VERSION } = pkg;
|
|
16
|
+
// Exit codes per CLI-DESIGN.md
|
|
17
|
+
const EXIT_ERROR = 1;
|
|
18
|
+
const EXIT_INVALID_USAGE = 2;
|
|
19
|
+
/**
 * Global error handler: report the error, then terminate the process.
 *
 * - ContextKitError: writes the message and exits with EXIT_INVALID_USAGE when
 *   the error is an InvalidUsageError, otherwise with EXIT_ERROR.
 * - Any other Error: writes the message, plus the stack trace when the DEBUG
 *   environment variable is set, then exits with EXIT_ERROR.
 * - Non-Error values: writes a generic message and exits with EXIT_ERROR.
 *
 * @param {unknown} error - Thrown value or promise rejection reason.
 */
function handleError(error) {
    if (error instanceof ContextKitError) {
        writeError(error.message);
        let status = EXIT_ERROR;
        if (error instanceof InvalidUsageError) {
            status = EXIT_INVALID_USAGE;
        }
        process.exit(status);
    }
    // Anything else is unexpected; the stack is only printed when DEBUG is set.
    if (!(error instanceof Error)) {
        writeError('An unexpected error occurred');
    }
    else {
        writeError(error.message);
        if (process.env.DEBUG) {
            writeMessage(error.stack || '');
        }
    }
    process.exit(EXIT_ERROR);
}
|
|
40
|
+
// Route both uncaught exceptions and unhandled promise rejections through handleError.
for (const eventName of ['uncaughtException', 'unhandledRejection']) {
    process.on(eventName, handleError);
}
|
|
43
|
+
// Root command: name, description, version flag and help layout.
const program = new Command();
program.name('contextkit');
program.description('Smart context selection for LLMs');
program.version(VERSION, '-v, --version', 'Show version number');
program.showHelpAfterError();
program.configureHelp({
    sortSubcommands: true,
    subcommandTerm: (cmd) => cmd.name(),
});
// Global options, registered in declaration order.
const globalFlags = [
    ['--json', 'Output as JSON'],
    ['--plain', 'Plain output (no colors/formatting)'],
    ['--quiet', 'Suppress non-essential output'],
];
for (const [flag, summary] of globalFlags) {
    program.option(flag, summary);
}
// Subcommands, registered in the same order as before.
const subcommands = [initCommand, sourceCommand, indexCommand, selectCommand];
for (const subcommand of subcommands) {
    program.addCommand(subcommand);
}
|
|
63
|
+
// Default action when no command given
|
|
64
|
+
program.action(() => {
|
|
65
|
+
console.log(`
|
|
66
|
+
🎯 contextkit - Smart context selection for AI coding assistants
|
|
67
|
+
|
|
68
|
+
Quick Start:
|
|
69
|
+
$ contextkit init # Initialize in your project
|
|
70
|
+
$ contextkit source add ./src # Add source directories
|
|
71
|
+
$ contextkit index # Index everything
|
|
72
|
+
$ contextkit select "your query" # Find relevant context
|
|
73
|
+
|
|
74
|
+
Commands:
|
|
75
|
+
init Initialize ContextKit in current directory
|
|
76
|
+
source Manage source directories
|
|
77
|
+
index Index all sources (re-run after code changes)
|
|
78
|
+
select Select context for a query
|
|
79
|
+
|
|
80
|
+
Global Options:
|
|
81
|
+
--json Output as JSON
|
|
82
|
+
--plain No colors (or set NO_COLOR=1)
|
|
83
|
+
--quiet Suppress non-essential output
|
|
84
|
+
-v, --version Show version
|
|
85
|
+
|
|
86
|
+
Run 'contextkit <command> --help' for command details.
|
|
87
|
+
`);
|
|
88
|
+
});
|
|
89
|
+
program.parse();
|
|
90
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAC3D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACvE,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAE9D,gCAAgC;AAChC,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC1D,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,cAAc,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;AACrF,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC;AAE5B,+BAA+B;AAC/B,MAAM,UAAU,GAAG,CAAC,CAAC;AACrB,MAAM,kBAAkB,GAAG,CAAC,CAAC;AAE7B;;GAEG;AACH,SAAS,WAAW,CAAC,KAAc;IACjC,IAAI,KAAK,YAAY,eAAe,EAAE,CAAC;QACrC,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAE1B,MAAM,QAAQ,GAAG,KAAK,YAAY,iBAAiB,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,UAAU,CAAC;QAEtF,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACzB,CAAC;IAED,gDAAgD;IAChD,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;QAC3B,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC1B,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;YACtB,YAAY,CAAC,KAAK,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;SAAM,CAAC;QACN,UAAU,CAAC,8BAA8B,CAAC,CAAC;IAC7C,CAAC;IAED,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;AAC3B,CAAC;AAED,wBAAwB;AACxB,OAAO,CAAC,EAAE,CAAC,mBAAmB,EAAE,WAAW,CAAC,CAAC;AAC7C,OAAO,CAAC,EAAE,CAAC,oBAAoB,EAAE,WAAW,CAAC,CAAC;AAE9C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,OAAO,CAAC,OAAO,EAAE,eAAe,EAAE,qBAAqB,CAAC;KACxD,kBAAkB,EAAE;KACpB,aAAa,CAAC;IACb,eAAe,EAAE,IAAI;IACrB,cAAc,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE;CACpC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,SAAS,EAAE,qCAAqC,CAAC;KACxD,MAAM,CAAC,SAAS,EAAE,+BAA+B,CAAC,CAAC;AAEtD,oBAAoB;
AACpB,OAAO,CAAC,UAAU,CAAC,WAAW,CAAC,CAAC;AAChC,OAAO,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC;AAClC,OAAO,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC;AACjC,OAAO,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC;AAElC,uCAAuC;AACvC,OAAO,CAAC,MAAM,CAAC,GAAG,EAAE;IAClB,OAAO,CAAC,GAAG,CAAC;;;;;;;;;;;;;;;;;;;;;;CAsBb,CAAC,CAAC;AACH,CAAC,CAAC,CAAC;AAEH,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Module
|
|
3
|
+
*
|
|
4
|
+
* Splits files into chunks for embedding.
|
|
5
|
+
* Uses line-based chunking with configurable size and overlap.
|
|
6
|
+
*/
|
|
7
|
+
import type { DiscoveredFile } from './discovery.js';
|
|
8
|
+
/** Chunk of content ready for embedding */
|
|
9
|
+
export interface Chunk {
|
|
10
|
+
/** Unique chunk ID */
|
|
11
|
+
id: string;
|
|
12
|
+
/** Source this chunk belongs to */
|
|
13
|
+
sourceId: string;
|
|
14
|
+
/** Original file path (relative) */
|
|
15
|
+
filePath: string;
|
|
16
|
+
/** Chunk content */
|
|
17
|
+
content: string;
|
|
18
|
+
/** Start line (1-indexed) */
|
|
19
|
+
startLine: number;
|
|
20
|
+
/** End line (1-indexed, inclusive) */
|
|
21
|
+
endLine: number;
|
|
22
|
+
/** Token count */
|
|
23
|
+
tokens: number;
|
|
24
|
+
}
|
|
25
|
+
/** Chunking options */
|
|
26
|
+
export interface ChunkOptions {
|
|
27
|
+
/** Target tokens per chunk */
|
|
28
|
+
chunkSize: number;
|
|
29
|
+
/** Overlap tokens between chunks */
|
|
30
|
+
chunkOverlap: number;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Count tokens in a string
|
|
34
|
+
*/
|
|
35
|
+
export declare function countTokens(text: string): number;
|
|
36
|
+
/**
|
|
37
|
+
* Chunk a single file into pieces
|
|
38
|
+
*/
|
|
39
|
+
export declare function chunkFile(file: DiscoveredFile, options?: ChunkOptions): Chunk[];
|
|
40
|
+
/**
|
|
41
|
+
* Chunk multiple files
|
|
42
|
+
*/
|
|
43
|
+
export declare function chunkFiles(files: DiscoveredFile[], options?: ChunkOptions): Chunk[];
|
|
44
|
+
//# sourceMappingURL=chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../src/indexer/chunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAErD,2CAA2C;AAC3C,MAAM,WAAW,KAAK;IACpB,sBAAsB;IACtB,EAAE,EAAE,MAAM,CAAC;IACX,mCAAmC;IACnC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,QAAQ,EAAE,MAAM,CAAC;IACjB,oBAAoB;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,6BAA6B;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,sCAAsC;IACtC,OAAO,EAAE,MAAM,CAAC;IAChB,kBAAkB;IAClB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,uBAAuB;AACvB,MAAM,WAAW,YAAY;IAC3B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,oCAAoC;IACpC,YAAY,EAAE,MAAM,CAAC;CACtB;AAUD;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEhD;AAiBD;;GAEG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,cAAc,EAAE,OAAO,GAAE,YAA8B,GAAG,KAAK,EAAE,CA8DhG;AAED;;GAEG;AACH,wBAAgB,UAAU,CACxB,KAAK,EAAE,cAAc,EAAE,EACvB,OAAO,GAAE,YAA8B,GACtC,KAAK,EAAE,CAST"}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Module
|
|
3
|
+
*
|
|
4
|
+
* Splits files into chunks for embedding.
|
|
5
|
+
* Uses line-based chunking with configurable size and overlap.
|
|
6
|
+
*/
|
|
7
|
+
import { encodingForModel } from 'js-tiktoken';
|
|
8
|
+
const DEFAULT_OPTIONS = {
|
|
9
|
+
chunkSize: 500,
|
|
10
|
+
chunkOverlap: 50,
|
|
11
|
+
};
|
|
12
|
+
// Use the GPT-4 tokenizer (cl100k_base); counts are only an approximation for other model families such as Claude
|
|
13
|
+
const encoder = encodingForModel('gpt-4');
|
|
14
|
+
/**
 * Return how many tokens the module-level encoder produces for `text`.
 *
 * @param {string} text - Text to tokenize.
 * @returns {number} Length of the encoded token sequence.
 */
export function countTokens(text) {
    const tokenIds = encoder.encode(text);
    return tokenIds.length;
}
|
|
20
|
+
/**
 * Derive a chunk ID from the source ID, file path and start line.
 *
 * The three parts are joined with ':' and folded into a 32-bit rolling hash
 * (hash * 31 + UTF-16 code unit, wrapped to a signed 32-bit integer). The
 * absolute value of the hash is rendered in base 36 and prefixed with "chunk_".
 *
 * @param {string} sourceId - ID of the source the chunk belongs to.
 * @param {string} filePath - File path relative to the source.
 * @param {number} startLine - First line of the chunk.
 * @returns {string} ID of the form `chunk_<base36>`.
 */
function generateChunkId(sourceId, filePath, startLine) {
    const seed = `${sourceId}:${filePath}:${startLine}`;
    let digest = 0;
    let position = 0;
    while (position < seed.length) {
        // Math.imul keeps the multiplication in 32-bit space; `| 0` wraps the sum.
        digest = (Math.imul(digest, 31) + seed.charCodeAt(position)) | 0;
        position += 1;
    }
    const suffix = Math.abs(digest).toString(36);
    return `chunk_${suffix}`;
}
|
|
34
|
+
/**
 * Build the chunk record for a run of consecutive lines of `file`.
 * Private helper of chunkFile.
 */
function buildChunk(file, chunkLines, startLine) {
    const content = chunkLines.join('\n');
    return {
        id: generateChunkId(file.sourceId, file.relativePath, startLine),
        sourceId: file.sourceId,
        filePath: file.relativePath,
        content,
        startLine,
        endLine: startLine + chunkLines.length - 1,
        // Always measured on the joined content so every chunk reports tokens the same way.
        tokens: countTokens(content),
    };
}
/**
 * Pick the trailing lines of a finished chunk that are repeated at the start
 * of the next chunk. Private helper of chunkFile.
 *
 * Lines are taken from the end until their token total reaches `overlapBudget`.
 * The first line of the chunk is never taken: if the whole chunk were carried
 * over, the next chunk would begin on the same start line and therefore get
 * the same generated ID as the chunk just emitted.
 */
function selectOverlapLines(chunkLines, overlapBudget) {
    const lines = [];
    let tokens = 0;
    for (let j = chunkLines.length - 1; j >= 1 && tokens < overlapBudget; j--) {
        lines.unshift(chunkLines[j]);
        tokens += countTokens(chunkLines[j] + '\n');
    }
    return { lines, tokens };
}
/**
 * Chunk a single file into pieces.
 *
 * Lines are accumulated until adding the next line would exceed `chunkSize`
 * tokens; the accumulated lines are then emitted as a chunk and the next chunk
 * starts with a tail of the previous one (about `chunkOverlap` tokens). A
 * single line larger than `chunkSize` is kept whole in its own chunk.
 *
 * @param {object} file - Discovered file; `content`, `sourceId` and `relativePath` are read.
 * @param {object} [options] - `chunkSize` / `chunkOverlap`; missing fields fall back to DEFAULT_OPTIONS.
 * @returns {Array<object>} Chunks in file order, with 1-indexed inclusive `startLine` / `endLine`.
 *   Empty when the file content is empty or whitespace-only.
 */
export function chunkFile(file, options = DEFAULT_OPTIONS) {
    // Fill in missing fields so a partial options object does not silently disable overlap.
    const chunkSize = options?.chunkSize ?? DEFAULT_OPTIONS.chunkSize;
    const chunkOverlap = options?.chunkOverlap ?? DEFAULT_OPTIONS.chunkOverlap;
    // Nothing worth embedding in an empty or whitespace-only file.
    if (file.content.trim() === '') {
        return [];
    }
    const chunks = [];
    let currentLines = [];
    let currentTokens = 0;
    let startLine = 1;
    for (const line of file.content.split('\n')) {
        const lineTokens = countTokens(line + '\n');
        // If adding this line exceeds the chunk size, emit the current chunk first.
        if (currentTokens + lineTokens > chunkSize && currentLines.length > 0) {
            chunks.push(buildChunk(file, currentLines, startLine));
            const overlap = selectOverlapLines(currentLines, chunkOverlap);
            // The next chunk begins at the first carried-over line (or right after
            // the emitted chunk when nothing is carried over).
            startLine += currentLines.length - overlap.lines.length;
            currentLines = overlap.lines;
            currentTokens = overlap.tokens;
        }
        currentLines.push(line);
        currentTokens += lineTokens;
    }
    // Emit whatever is left after the last line.
    if (currentLines.length > 0) {
        chunks.push(buildChunk(file, currentLines, startLine));
    }
    return chunks;
}
|
|
91
|
+
/**
 * Chunk several files and return all chunks in one flat list.
 *
 * Files are processed in the given order with the same options; each file's
 * chunks keep the order chunkFile returned them in.
 *
 * @param {Array<object>} files - Discovered files to chunk.
 * @param {object} [options] - Chunking options forwarded to chunkFile.
 * @returns {Array<object>} Concatenated chunks of all files.
 */
export function chunkFiles(files, options = DEFAULT_OPTIONS) {
    const collected = [];
    for (const entry of files) {
        for (const chunk of chunkFile(entry, options)) {
            collected.push(chunk);
        }
    }
    return collected;
}
|
|
102
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../../src/indexer/chunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AA6B/C,MAAM,eAAe,GAAiB;IACpC,SAAS,EAAE,GAAG;IACd,YAAY,EAAE,EAAE;CACjB,CAAC;AAEF,qDAAqD;AACrD,MAAM,OAAO,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;AAE1C;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,OAAO,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;AACrC,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,QAAgB,EAAE,QAAgB,EAAE,SAAiB;IAC5E,MAAM,IAAI,GAAG,GAAG,QAAQ,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;IACpD,qBAAqB;IACrB,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAChC,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;QACjC,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,4BAA4B;IAClD,CAAC;IACD,OAAO,SAAS,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,EAAE,CAAC;AAChD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,IAAoB,EAAE,UAAwB,eAAe;IACrF,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,MAAM,GAAY,EAAE,CAAC;IAE3B,IAAI,YAAY,GAAa,EAAE,CAAC;IAChC,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,UAAU,GAAG,WAAW,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;QAE5C,6DAA6D;QAC7D,IAAI,aAAa,GAAG,UAAU,GAAG,OAAO,CAAC,SAAS,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9E,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,eAAe,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;gBAChE,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,QAAQ,EAAE,IAAI,CAAC,YAAY;gBAC3B,OAAO;gBACP,SAAS;gBACT,OAAO,EAAE,SAAS,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;gBAC5C,MAAM,EAAE,aAAa;aACtB,CAAC,CAAC;YAEH,uDAAuD;YACvD,MAAM,YAAY,GAAa,EAAE,CAAC;YAClC,IAAI,aAAa,GAAG,CAAC,CAAC;YAEtB,KAAK,IAAI,CAAC,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,aAAa,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,EAAE,EAAE,
CAAC;gBAC1F,MAAM,WAAW,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;gBACpC,MAAM,iBAAiB,GAAG,WAAW,CAAC,WAAW,GAAG,IAAI,CAAC,CAAC;gBAC1D,YAAY,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;gBAClC,aAAa,IAAI,iBAAiB,CAAC;YACrC,CAAC;YAED,gCAAgC;YAChC,YAAY,GAAG,YAAY,CAAC;YAC5B,aAAa,GAAG,aAAa,CAAC;YAC9B,SAAS;gBACP,SAAS,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,OAAO,GAAG,SAAS,GAAG,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;QAC1F,CAAC;QAED,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxB,aAAa,IAAI,UAAU,CAAC;IAC9B,CAAC;IAED,8BAA8B;IAC9B,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5B,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,eAAe,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;YAChE,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,QAAQ,EAAE,IAAI,CAAC,YAAY;YAC3B,OAAO;YACP,SAAS;YACT,OAAO,EAAE,SAAS,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;YAC5C,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC;SAC7B,CAAC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CACxB,KAAuB,EACvB,UAAwB,eAAe;IAEvC,MAAM,SAAS,GAAY,EAAE,CAAC;IAE9B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACxC,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;IAC5B,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File Discovery Module
|
|
3
|
+
*
|
|
4
|
+
* Discovers files in sources based on include/exclude patterns.
|
|
5
|
+
*/
|
|
6
|
+
import type { Source } from '../config/types.js';
|
|
7
|
+
/** File info returned by discovery */
|
|
8
|
+
export interface DiscoveredFile {
|
|
9
|
+
/** Absolute path to file */
|
|
10
|
+
path: string;
|
|
11
|
+
/** Path relative to source */
|
|
12
|
+
relativePath: string;
|
|
13
|
+
/** Source this file belongs to */
|
|
14
|
+
sourceId: string;
|
|
15
|
+
/** File content */
|
|
16
|
+
content: string;
|
|
17
|
+
/** File size in bytes */
|
|
18
|
+
size: number;
|
|
19
|
+
}
|
|
20
|
+
/** Discovery result for a source */
|
|
21
|
+
export interface DiscoveryResult {
|
|
22
|
+
sourceId: string;
|
|
23
|
+
files: DiscoveredFile[];
|
|
24
|
+
skipped: number;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Discover all files in a source
|
|
28
|
+
*/
|
|
29
|
+
export declare function discoverFiles(source: Source, baseDir: string): DiscoveryResult;
|
|
30
|
+
/**
|
|
31
|
+
* Discover files from multiple sources
|
|
32
|
+
*/
|
|
33
|
+
export declare function discoverAllFiles(sources: Source[], baseDir: string): DiscoveryResult[];
|
|
34
|
+
//# sourceMappingURL=discovery.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"discovery.d.ts","sourceRoot":"","sources":["../../src/indexer/discovery.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAKjD,sCAAsC;AACtC,MAAM,WAAW,cAAc;IAC7B,4BAA4B;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,8BAA8B;IAC9B,YAAY,EAAE,MAAM,CAAC;IACrB,kCAAkC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,mBAAmB;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,yBAAyB;IACzB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,oCAAoC;AACpC,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,eAAe,CAqD9E;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE,OAAO,EAAE,MAAM,GAAG,eAAe,EAAE,CAEtF"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File Discovery Module
|
|
3
|
+
*
|
|
4
|
+
* Discovers files in sources based on include/exclude patterns.
|
|
5
|
+
*/
|
|
6
|
+
import { resolve } from 'path';
|
|
7
|
+
import { readFileSync, statSync } from 'fs';
|
|
8
|
+
import fg from 'fast-glob';
|
|
9
|
+
/** Maximum file size to process (100KB) */
|
|
10
|
+
const MAX_FILE_SIZE = 100 * 1024;
|
|
11
|
+
/**
 * Discover all files in a source.
 *
 * Globs the source's include patterns (minus its exclude patterns) relative to
 * `baseDir/source.path`, then loads every match. A match is counted as skipped
 * instead of returned when it is larger than MAX_FILE_SIZE, when its text
 * contains a NUL character (treated as binary), or when stat/read throws.
 *
 * @param {object} source - Source config; `id`, `path`, `patterns.include` and `patterns.exclude` are read.
 * @param {string} baseDir - Directory that `source.path` is resolved against.
 * @returns {{sourceId: string, files: Array<object>, skipped: number}} Loaded files and skip count.
 */
export function discoverFiles(source, baseDir) {
    const sourcePath = resolve(baseDir, source.path);
    const { include, exclude } = source.patterns;
    const matches = fg.sync(include, {
        cwd: sourcePath,
        ignore: exclude,
        onlyFiles: true,
        absolute: false,
    });
    // Load one match; null means "skip this file".
    const loadCandidate = (relativePath) => {
        const absolutePath = resolve(sourcePath, relativePath);
        try {
            const { size } = statSync(absolutePath);
            if (size > MAX_FILE_SIZE) {
                return null;
            }
            const content = readFileSync(absolutePath, 'utf-8');
            // Simple binary heuristic: text files do not contain NUL characters.
            if (content.includes('\0')) {
                return null;
            }
            return {
                path: absolutePath,
                relativePath,
                sourceId: source.id,
                content,
                size,
            };
        }
        catch {
            // Unreadable files are skipped rather than failing the whole discovery.
            return null;
        }
    };
    const files = [];
    let skipped = 0;
    for (const relativePath of matches) {
        const loaded = loadCandidate(relativePath);
        if (loaded === null) {
            skipped += 1;
        }
        else {
            files.push(loaded);
        }
    }
    return { sourceId: source.id, files, skipped };
}
|
|
60
|
+
/**
 * Run discoverFiles for every source against the same base directory.
 *
 * @param {Array<object>} sources - Source configs, processed in order.
 * @param {string} baseDir - Directory each source path is resolved against.
 * @returns {Array<object>} One discovery result per source, in input order.
 */
export function discoverAllFiles(sources, baseDir) {
    const results = [];
    for (const source of sources) {
        results.push(discoverFiles(source, baseDir));
    }
    return results;
}
|
|
66
|
+
//# sourceMappingURL=discovery.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"discovery.js","sourceRoot":"","sources":["../../src/indexer/discovery.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,IAAI,CAAC;AAC5C,OAAO,EAAE,MAAM,WAAW,CAAC;AAG3B,2CAA2C;AAC3C,MAAM,aAAa,GAAG,GAAG,GAAG,IAAI,CAAC;AAuBjC;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,MAAc,EAAE,OAAe;IAC3D,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;IAEjD,sBAAsB;IACtB,MAAM,OAAO,GAAG,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE;QAC/C,GAAG,EAAE,UAAU;QACf,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,OAAO;QAC/B,SAAS,EAAE,IAAI;QACf,QAAQ,EAAE,KAAK;KAChB,CAAC,CAAC;IAEH,MAAM,KAAK,GAAqB,EAAE,CAAC;IACnC,IAAI,OAAO,GAAG,CAAC,CAAC;IAEhB,KAAK,MAAM,YAAY,IAAI,OAAO,EAAE,CAAC;QACnC,MAAM,YAAY,GAAG,OAAO,CAAC,UAAU,EAAE,YAAY,CAAC,CAAC;QAEvD,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;YAErC,gCAAgC;YAChC,IAAI,KAAK,CAAC,IAAI,GAAG,aAAa,EAAE,CAAC;gBAC/B,OAAO,EAAE,CAAC;gBACV,SAAS;YACX,CAAC;YAED,oBAAoB;YACpB,MAAM,OAAO,GAAG,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;YAEpD,6DAA6D;YAC7D,IAAI,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,OAAO,EAAE,CAAC;gBACV,SAAS;YACX,CAAC;YAED,KAAK,CAAC,IAAI,CAAC;gBACT,IAAI,EAAE,YAAY;gBAClB,YAAY;gBACZ,QAAQ,EAAE,MAAM,CAAC,EAAE;gBACnB,OAAO;gBACP,IAAI,EAAE,KAAK,CAAC,IAAI;aACjB,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,2BAA2B;YAC3B,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED,OAAO;QACL,QAAQ,EAAE,MAAM,CAAC,EAAE;QACnB,KAAK;QACL,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAiB,EAAE,OAAe;IACjE,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,aAAa,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AACjE,CAAC"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embeddings Module
|
|
3
|
+
*
|
|
4
|
+
* Generates embeddings using local models via @xenova/transformers.
|
|
5
|
+
* Uses gte-small (384 dimensions) by default.
|
|
6
|
+
*/
|
|
7
|
+
import type { Chunk } from './chunker.js';
|
|
8
|
+
/** Embedding dimension for gte-small */
|
|
9
|
+
export declare const EMBEDDING_DIM = 384;
|
|
10
|
+
/** Chunk with embedding */
|
|
11
|
+
export interface EmbeddedChunk extends Chunk {
|
|
12
|
+
embedding: number[];
|
|
13
|
+
}
|
|
14
|
+
/** Progress callback */
|
|
15
|
+
export type ProgressCallback = (current: number, total: number) => void;
|
|
16
|
+
/**
|
|
17
|
+
* Generate embedding for a single text
|
|
18
|
+
*/
|
|
19
|
+
export declare function embed(text: string): Promise<number[]>;
|
|
20
|
+
/**
|
|
21
|
+
* Generate embeddings for multiple texts in batches
|
|
22
|
+
*/
|
|
23
|
+
export declare function embedBatch(texts: string[], onProgress?: ProgressCallback): Promise<number[][]>;
|
|
24
|
+
/**
|
|
25
|
+
* Generate embeddings for chunks
|
|
26
|
+
*/
|
|
27
|
+
export declare function embedChunks(chunks: Chunk[], onProgress?: ProgressCallback): Promise<EmbeddedChunk[]>;
|
|
28
|
+
/**
|
|
29
|
+
* Compute cosine similarity between two vectors
|
|
30
|
+
*/
|
|
31
|
+
export declare function cosineSimilarity(a: number[], b: number[]): number;
|
|
32
|
+
//# sourceMappingURL=embeddings.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embeddings.d.ts","sourceRoot":"","sources":["../../src/indexer/embeddings.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAK1C,wCAAwC;AACxC,eAAO,MAAM,aAAa,MAAM,CAAC;AAKjC,2BAA2B;AAC3B,MAAM,WAAW,aAAc,SAAQ,KAAK;IAC1C,SAAS,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,wBAAwB;AACxB,MAAM,MAAM,gBAAgB,GAAG,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;AAiBxE;;GAEG;AACH,wBAAsB,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAI3D;AAED;;GAEG;AACH,wBAAsB,UAAU,CAC9B,KAAK,EAAE,MAAM,EAAE,EACf,UAAU,CAAC,EAAE,gBAAgB,GAC5B,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAwBrB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,MAAM,EAAE,KAAK,EAAE,EACf,UAAU,CAAC,EAAE,gBAAgB,GAC5B,OAAO,CAAC,aAAa,EAAE,CAAC,CAQ1B;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC,EAAE,MAAM,EAAE,GAAG,MAAM,CAiBjE"}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embeddings Module
|
|
3
|
+
*
|
|
4
|
+
* Generates embeddings using local models via @xenova/transformers.
|
|
5
|
+
* Uses gte-small (384 dimensions) by default.
|
|
6
|
+
*/
|
|
7
|
+
import { pipeline } from '@xenova/transformers';
|
|
8
|
+
/** Default embedding model */
|
|
9
|
+
const DEFAULT_MODEL = 'Xenova/gte-small';
|
|
10
|
+
/** Embedding dimension for gte-small */
|
|
11
|
+
export const EMBEDDING_DIM = 384;
|
|
12
|
+
/** Batch size for embedding generation */
|
|
13
|
+
const BATCH_SIZE = 32;
|
|
14
|
+
// Singleton: the in-flight or settled promise for the embedding pipeline.
let embeddingPipelinePromise = null;
/**
 * Return the shared feature-extraction pipeline, creating it on first use.
 *
 * The promise itself is cached (not the resolved pipeline), so callers that
 * arrive while the model is still loading share the same load instead of each
 * starting their own. If loading fails, the cache is cleared so a later call
 * can retry.
 *
 * @returns {Promise<Function>} Resolves to the pipeline created for DEFAULT_MODEL.
 */
async function getEmbeddingPipeline() {
    if (!embeddingPipelinePromise) {
        embeddingPipelinePromise = pipeline('feature-extraction', DEFAULT_MODEL, {
            quantized: true, // Use quantized model for speed
        }).catch((error) => {
            // Do not keep a rejected promise around: allow the next call to retry.
            embeddingPipelinePromise = null;
            throw error;
        });
    }
    return embeddingPipelinePromise;
}
|
|
27
|
+
/**
 * Generate the embedding for one text.
 *
 * Runs the shared pipeline with mean pooling and normalization enabled and
 * copies the resulting `data` into a plain array.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} Embedding values.
 */
export async function embed(text) {
    const extractor = await getEmbeddingPipeline();
    const { data } = await extractor(text, { pooling: 'mean', normalize: true });
    return Array.from(data);
}
|
|
35
|
+
/**
 * Generate embeddings for many texts, BATCH_SIZE texts at a time.
 *
 * The texts of one batch are embedded concurrently (mean pooling, normalized);
 * batches run one after another. After each batch `onProgress`, when given, is
 * called with the number of texts finished so far and the total.
 *
 * @param {string[]} texts - Texts to embed.
 * @param {(current: number, total: number) => void} [onProgress] - Progress callback.
 * @returns {Promise<number[][]>} One embedding per text, in input order.
 */
export async function embedBatch(texts, onProgress) {
    const extractor = await getEmbeddingPipeline();
    const toVector = async (text) => {
        const { data } = await extractor(text, { pooling: 'mean', normalize: true });
        return Array.from(data);
    };
    const vectors = [];
    let offset = 0;
    while (offset < texts.length) {
        const end = offset + BATCH_SIZE;
        const batchVectors = await Promise.all(texts.slice(offset, end).map((text) => toVector(text)));
        for (const vector of batchVectors) {
            vectors.push(vector);
        }
        if (onProgress) {
            onProgress(Math.min(end, texts.length), texts.length);
        }
        offset = end;
    }
    return vectors;
}
|
|
56
|
+
/**
 * Embed the content of every chunk and attach the result to a copy of the chunk.
 *
 * @param {Array<object>} chunks - Chunks whose `content` is embedded.
 * @param {(current: number, total: number) => void} [onProgress] - Forwarded to embedBatch.
 * @returns {Promise<Array<object>>} Shallow copies of the chunks with an added `embedding` field.
 */
export async function embedChunks(chunks, onProgress) {
    const contents = chunks.map(({ content }) => content);
    const vectors = await embedBatch(contents, onProgress);
    return chunks.map((chunk, index) => Object.assign({}, chunk, { embedding: vectors[index] }));
}
|
|
67
|
+
/**
 * Compute the cosine similarity of two vectors.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector, same length as `a`.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero magnitude.
 * @throws {Error} When the vectors differ in length.
 */
export function cosineSimilarity(a, b) {
    if (a.length !== b.length) {
        throw new Error('Vectors must have the same length');
    }
    let dot = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (const [index, left] of a.entries()) {
        const right = b[index];
        dot += left * right;
        squaredA += left * left;
        squaredB += right * right;
    }
    const magnitude = Math.sqrt(squaredA) * Math.sqrt(squaredB);
    if (magnitude === 0) {
        return 0;
    }
    return dot / magnitude;
}
|
|
85
|
+
//# sourceMappingURL=embeddings.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embeddings.js","sourceRoot":"","sources":["../../src/indexer/embeddings.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAkC,MAAM,sBAAsB,CAAC;AAGhF,8BAA8B;AAC9B,MAAM,aAAa,GAAG,kBAAkB,CAAC;AAEzC,wCAAwC;AACxC,MAAM,CAAC,MAAM,aAAa,GAAG,GAAG,CAAC;AAEjC,0CAA0C;AAC1C,MAAM,UAAU,GAAG,EAAE,CAAC;AAUtB,8BAA8B;AAC9B,IAAI,iBAAiB,GAAqC,IAAI,CAAC;AAE/D;;GAEG;AACH,KAAK,UAAU,oBAAoB;IACjC,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACvB,iBAAiB,GAAG,MAAM,QAAQ,CAAC,oBAAoB,EAAE,aAAa,EAAE;YACtE,SAAS,EAAE,IAAI,EAAE,gCAAgC;SAClD,CAAC,CAAC;IACL,CAAC;IACD,OAAO,iBAAiB,CAAC;AAC3B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,IAAY;IACtC,MAAM,IAAI,GAAG,MAAM,oBAAoB,EAAE,CAAC;IAC1C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACtE,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAoB,CAAC,CAAC;AACjD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,KAAe,EACf,UAA6B;IAE7B,MAAM,IAAI,GAAG,MAAM,oBAAoB,EAAE,CAAC;IAC1C,MAAM,UAAU,GAAe,EAAE,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QAClD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,CAAC;QAE7C,gBAAgB;QAChB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACtE,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAoB,CAAC,CAAC;QACjD,CAAC,CAAC,CACH,CAAC;QAEF,UAAU,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC;QAE5B,kBAAkB;QAClB,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,MAAe,EACf,UAA6B;IAE7B,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;IAC3C,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;IAEvD,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC/B,GAAG,KAAK;QACR,SAAS,EAAE,UAA
U,CAAC,CAAC,CAAC;KACzB,CAAC,CAAC,CAAC;AACN,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,CAAW,EAAE,CAAW;IACvD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IACvD,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,UAAU,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACtD,OAAO,SAAS,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,SAAS,CAAC;AACtD,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main Indexer Module
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates file discovery, chunking, embedding, and storage.
|
|
5
|
+
*/
|
|
6
|
+
import Database from 'better-sqlite3';
|
|
7
|
+
import { type ChunkOptions } from './chunker.js';
|
|
8
|
+
import type { Source } from '../config/types.js';
|
|
9
|
+
/** Totals reported by indexSources after all sources have been processed. */
|
|
10
|
+
export interface IndexStats {
|
|
11
|
+
/** Number of sources that were passed in. */
sources: number;
|
|
12
|
+
/** Files found by discovery, summed over all sources. */
files: number;
|
|
13
|
+
/** Embedded chunks handed to storage, summed over all sources. */
chunks: number;
|
|
14
|
+
/** Skipped count reported by discovery, summed over all sources. */
skipped: number;
|
|
15
|
+
/** Elapsed time of the whole run in milliseconds. */
timeMs: number;
|
|
16
|
+
}
|
|
17
|
+
/** A single progress notification emitted while a source is being indexed. */
|
|
18
|
+
export interface IndexProgress {
|
|
19
|
+
/** Pipeline stage the notification refers to. */
phase: 'discovery' | 'chunking' | 'embedding' | 'storing';
|
|
20
|
+
/** Id of the source currently being processed. */
sourceId: string;
|
|
21
|
+
/** Items completed so far: files for discovery/chunking, chunks for embedding/storing. */
current: number;
|
|
22
|
+
/** Items expected in this phase, in the same unit as `current` (0 when discovery starts). */
total: number;
|
|
23
|
+
}
|
|
24
|
+
/** Listener that receives each IndexProgress notification; its return value is ignored. */
|
|
25
|
+
export type IndexProgressCallback = (progress: IndexProgress) => void;
|
|
26
|
+
/**
 * Index all sources.
 *
 * Each source is processed in turn through discovery, chunking, embedding
 * and storage.
 *
 * @param sources - Sources to index.
 * @param baseDir - Directory handed to file discovery together with each source.
 * @param db - Database the chunks and source records are written to.
 * @param chunkOptions - Options handed to the chunker.
 * @param onProgress - Optional listener notified as each phase advances.
 * @returns Totals for the run (sources, files, chunks, skipped, elapsed ms).
 */
|
|
29
|
+
export declare function indexSources(sources: Source[], baseDir: string, db: Database.Database, chunkOptions: ChunkOptions, onProgress?: IndexProgressCallback): Promise<IndexStats>;
|
|
30
|
+
/**
 * Read embedding from blob.
 *
 * Interprets the bytes of `blob` as 32-bit floats and returns them as a
 * plain number array.
 *
 * @param blob - Binary embedding, e.g. the value of a chunk's `embedding` column.
 * @returns The decoded embedding values.
 */
|
|
33
|
+
export declare function readEmbedding(blob: Buffer): number[];
|
|
34
|
+
export { discoverFiles, type DiscoveredFile, type DiscoveryResult } from './discovery.js';
|
|
35
|
+
export { chunkFiles, chunkFile, countTokens, type Chunk, type ChunkOptions } from './chunker.js';
|
|
36
|
+
export { embed, embedBatch, embedChunks, cosineSimilarity, EMBEDDING_DIM, type EmbeddedChunk, } from './embeddings.js';
|
|
37
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/indexer/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,QAAQ,MAAM,gBAAgB,CAAC;AAEtC,OAAO,EAAc,KAAK,YAAY,EAAE,MAAM,cAAc,CAAC;AAE7D,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,0BAA0B;AAC1B,MAAM,WAAW,UAAU;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,sBAAsB;AACtB,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,WAAW,GAAG,UAAU,GAAG,WAAW,GAAG,SAAS,CAAC;IAC1D,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;CACf;AAED,wBAAwB;AACxB,MAAM,MAAM,qBAAqB,GAAG,CAAC,QAAQ,EAAE,aAAa,KAAK,IAAI,CAAC;AAEtE;;GAEG;AACH,wBAAsB,YAAY,CAChC,OAAO,EAAE,MAAM,EAAE,EACjB,OAAO,EAAE,MAAM,EACf,EAAE,EAAE,QAAQ,CAAC,QAAQ,EACrB,YAAY,EAAE,YAAY,EAC1B,UAAU,CAAC,EAAE,qBAAqB,GACjC,OAAO,CAAC,UAAU,CAAC,CA+ErB;AAwDD;;GAEG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAGpD;AAGD,OAAO,EAAE,aAAa,EAAE,KAAK,cAAc,EAAE,KAAK,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAC1F,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,KAAK,KAAK,EAAE,KAAK,YAAY,EAAE,MAAM,cAAc,CAAC;AACjG,OAAO,EACL,KAAK,EACL,UAAU,EACV,WAAW,EACX,gBAAgB,EAChB,aAAa,EACb,KAAK,aAAa,GACnB,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main Indexer Module
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates file discovery, chunking, embedding, and storage.
|
|
5
|
+
*/
|
|
6
|
+
import { discoverFiles } from './discovery.js';
|
|
7
|
+
import { chunkFiles } from './chunker.js';
|
|
8
|
+
import { embedChunks } from './embeddings.js';
|
|
9
|
+
/**
 * Run the indexing pipeline over every source.
 *
 * Sources are handled one at a time; each goes through discovery,
 * chunking, embedding and storage, in that order. When `onProgress` is
 * supplied it is called at the start and end of the discovery, chunking
 * and storing phases, and for every progress report coming out of
 * embedChunks.
 *
 * @param sources - Sources to index; `id` and `path` are read from each.
 * @param baseDir - Passed to discoverFiles together with each source.
 * @param db - Database handle passed to storeChunks.
 * @param chunkOptions - Passed to chunkFiles.
 * @param onProgress - Optional progress listener.
 * @returns Totals for the run: source, file, chunk and skipped counts plus
 *   the elapsed time in milliseconds.
 */
export async function indexSources(sources, baseDir, db, chunkOptions, onProgress) {
    const startedAt = Date.now();
    const totals = { files: 0, chunks: 0, skipped: 0 };
    // Single place that shapes the progress payload for every phase.
    const report = (phase, source, current, total) => {
        onProgress?.({ phase, sourceId: source.id, current, total });
    };
    for (const source of sources) {
        // Discovery: the file count is unknown until it finishes, hence 0/0.
        report('discovery', source, 0, 0);
        const discovered = discoverFiles(source, baseDir);
        totals.files += discovered.files.length;
        totals.skipped += discovered.skipped;
        report('discovery', source, discovered.files.length, discovered.files.length);

        // Chunking: progress is measured in files.
        report('chunking', source, 0, discovered.files.length);
        const chunks = chunkFiles(discovered.files, chunkOptions);
        report('chunking', source, discovered.files.length, discovered.files.length);

        // Embedding: progress values come straight from embedChunks.
        const embedded = await embedChunks(chunks, (current, total) => {
            report('embedding', source, current, total);
        });

        // Storing: progress is measured in chunks.
        report('storing', source, 0, embedded.length);
        storeChunks(db, source.id, source.path, embedded, discovered.files.length);
        totals.chunks += embedded.length;
        report('storing', source, embedded.length, embedded.length);
    }
    return {
        sources: sources.length,
        ...totals,
        timeMs: Date.now() - startedAt,
    };
}
|
|
81
|
+
/**
 * Replace everything stored for one source with a fresh set of chunks.
 *
 * All statements run inside a single database transaction: the source's
 * existing chunk rows are deleted, its row in `sources` is inserted or
 * updated with the new counts and timestamp, and every chunk is inserted
 * with its embedding serialized as Float32 bytes.
 *
 * @param db - Database handle exposing `prepare` and `transaction`.
 * @param sourceId - Id of the source whose chunks are being replaced.
 * @param sourcePath - Path recorded on the source row.
 * @param chunks - Embedded chunks to insert.
 * @param fileCount - File count recorded on the source row.
 */
function storeChunks(db, sourceId, sourcePath, chunks, fileCount) {
    const replaceSourceChunks = db.transaction(() => {
        // Drop whatever was previously indexed for this source.
        const deleteOld = db.prepare('DELETE FROM chunks WHERE source_id = ?');
        deleteOld.run(sourceId);

        // Create the source row, or refresh it if it already exists.
        const upsertSource = db.prepare(`
      INSERT INTO sources (id, path, file_count, chunk_count, indexed_at)
      VALUES (?, ?, ?, ?, datetime('now'))
      ON CONFLICT(id) DO UPDATE SET
        path = excluded.path,
        file_count = excluded.file_count,
        chunk_count = excluded.chunk_count,
        indexed_at = excluded.indexed_at
    `);
        upsertSource.run(sourceId, sourcePath, fileCount, chunks.length);

        const insertChunk = db.prepare(`
      INSERT INTO chunks (id, source_id, file_path, content, start_line, end_line, tokens, embedding)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    `);
        for (const chunk of chunks) {
            // Embeddings are persisted as the raw bytes of a Float32Array.
            const floats = new Float32Array(chunk.embedding);
            const blob = Buffer.from(floats.buffer);
            // NOTE(review): the row stores the chunk's own sourceId, not the
            // `sourceId` argument; presumably the two always match.
            insertChunk.run(
                chunk.id,
                chunk.sourceId,
                chunk.filePath,
                chunk.content,
                chunk.startLine,
                chunk.endLine,
                chunk.tokens,
                blob,
            );
        }
    });
    replaceSourceChunks();
}
|
|
112
|
+
/**
 * Decode an embedding that was stored as raw Float32 bytes.
 *
 * @param {Buffer} blob - Bytes holding consecutive 32-bit floats in the
 *   platform's native byte order. Trailing bytes that do not make up a
 *   whole float are ignored.
 * @returns {number[]} The decoded values as a plain array.
 */
export function readEmbedding(blob) {
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
    const count = Math.floor(blob.length / bytesPerFloat);
    if (blob.byteOffset % bytesPerFloat === 0) {
        // Aligned: view the bytes in place without copying.
        return Array.from(new Float32Array(blob.buffer, blob.byteOffset, count));
    }
    // A Float32Array view must start on a 4-byte boundary, otherwise the
    // constructor throws a RangeError. A Buffer that is a slice of a larger
    // allocation can start anywhere, so copy its bytes into a fresh
    // (offset 0) buffer before viewing them as floats.
    const aligned = new Uint8Array(count * bytesPerFloat);
    aligned.set(blob.subarray(0, aligned.length));
    return Array.from(new Float32Array(aligned.buffer, 0, count));
}
|
|
119
|
+
// Re-export types and functions
|
|
120
|
+
export { discoverFiles } from './discovery.js';
|
|
121
|
+
export { chunkFiles, chunkFile, countTokens } from './chunker.js';
|
|
122
|
+
export { embed, embedBatch, embedChunks, cosineSimilarity, EMBEDDING_DIM, } from './embeddings.js';
|
|
123
|
+
//# sourceMappingURL=index.js.map
|