@vivantel/rag-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/config/release-please.json +38 -0
- package/.github/dependabot.yaml +28 -0
- package/.github/workflows/ci.yaml +119 -0
- package/.github/workflows/publish.yaml +151 -0
- package/.github/workflows/release.yaml +150 -0
- package/.versionrc.json +19 -0
- package/CHANGELOG.md +21 -0
- package/README.md +62 -0
- package/bin/rag-update.ts +49 -0
- package/dist/config-loader.d.ts +3 -0
- package/dist/config-loader.d.ts.map +1 -0
- package/dist/config-loader.js +13 -0
- package/dist/config-loader.js.map +1 -0
- package/dist/core/chunk-processor.d.ts +12 -0
- package/dist/core/chunk-processor.d.ts.map +1 -0
- package/dist/core/chunk-processor.js +65 -0
- package/dist/core/chunk-processor.js.map +1 -0
- package/dist/core/embedder.d.ts +19 -0
- package/dist/core/embedder.d.ts.map +1 -0
- package/dist/core/embedder.js +139 -0
- package/dist/core/embedder.js.map +1 -0
- package/dist/core/git-tracker.d.ts +25 -0
- package/dist/core/git-tracker.d.ts.map +1 -0
- package/dist/core/git-tracker.js +164 -0
- package/dist/core/git-tracker.js.map +1 -0
- package/dist/core/orchestrator.d.ts +22 -0
- package/dist/core/orchestrator.d.ts.map +1 -0
- package/dist/core/orchestrator.js +57 -0
- package/dist/core/orchestrator.js.map +1 -0
- package/dist/core/uploader.d.ts +15 -0
- package/dist/core/uploader.d.ts.map +1 -0
- package/dist/core/uploader.js +79 -0
- package/dist/core/uploader.js.map +1 -0
- package/dist/core/utils.d.ts +6 -0
- package/dist/core/utils.d.ts.map +1 -0
- package/dist/core/utils.js +23 -0
- package/dist/core/utils.js.map +1 -0
- package/dist/helpers/create-chunker.d.ts +9 -0
- package/dist/helpers/create-chunker.d.ts.map +1 -0
- package/dist/helpers/create-chunker.js +24 -0
- package/dist/helpers/create-chunker.js.map +1 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/interfaces/chunker.d.ts +46 -0
- package/dist/interfaces/chunker.d.ts.map +1 -0
- package/dist/interfaces/chunker.js +5 -0
- package/dist/interfaces/chunker.js.map +1 -0
- package/dist/interfaces/embedder.d.ts +28 -0
- package/dist/interfaces/embedder.d.ts.map +1 -0
- package/dist/interfaces/embedder.js +5 -0
- package/dist/interfaces/embedder.js.map +1 -0
- package/dist/interfaces/index.d.ts +4 -0
- package/dist/interfaces/index.d.ts.map +1 -0
- package/dist/interfaces/index.js +4 -0
- package/dist/interfaces/index.js.map +1 -0
- package/dist/interfaces/vector-store.d.ts +53 -0
- package/dist/interfaces/vector-store.d.ts.map +1 -0
- package/dist/interfaces/vector-store.js +5 -0
- package/dist/interfaces/vector-store.js.map +1 -0
- package/dist/strategies/chunk/index.d.ts +5 -0
- package/dist/strategies/chunk/index.d.ts.map +1 -0
- package/dist/strategies/chunk/index.js +5 -0
- package/dist/strategies/chunk/index.js.map +1 -0
- package/dist/strategies/chunk/markdown-headers.d.ts +7 -0
- package/dist/strategies/chunk/markdown-headers.d.ts.map +1 -0
- package/dist/strategies/chunk/markdown-headers.js +89 -0
- package/dist/strategies/chunk/markdown-headers.js.map +1 -0
- package/dist/strategies/chunk/semantic.d.ts +7 -0
- package/dist/strategies/chunk/semantic.d.ts.map +1 -0
- package/dist/strategies/chunk/semantic.js +62 -0
- package/dist/strategies/chunk/semantic.js.map +1 -0
- package/dist/strategies/chunk/token.d.ts +12 -0
- package/dist/strategies/chunk/token.d.ts.map +1 -0
- package/dist/strategies/chunk/token.js +56 -0
- package/dist/strategies/chunk/token.js.map +1 -0
- package/dist/strategies/chunk/whole-file.d.ts +3 -0
- package/dist/strategies/chunk/whole-file.d.ts.map +1 -0
- package/dist/strategies/chunk/whole-file.js +31 -0
- package/dist/strategies/chunk/whole-file.js.map +1 -0
- package/eslint.config.js +25 -0
- package/package.json +102 -0
- package/src/config-loader.ts +21 -0
- package/src/core/chunk-processor.test.ts +36 -0
- package/src/core/chunk-processor.ts +92 -0
- package/src/core/embedder.ts +189 -0
- package/src/core/git-tracker.test.ts +64 -0
- package/src/core/git-tracker.ts +202 -0
- package/src/core/orchestrator.test.ts +53 -0
- package/src/core/orchestrator.ts +97 -0
- package/src/core/uploader.ts +123 -0
- package/src/core/utils.ts +27 -0
- package/src/helpers/create-chunker.test.ts +31 -0
- package/src/helpers/create-chunker.ts +40 -0
- package/src/index.test.ts +33 -0
- package/src/index.ts +30 -0
- package/src/interfaces/chunker.ts +59 -0
- package/src/interfaces/embedder.ts +36 -0
- package/src/interfaces/index.test.ts +9 -0
- package/src/interfaces/index.ts +3 -0
- package/src/interfaces/vector-store.ts +71 -0
- package/src/strategies/chunk/index.ts +4 -0
- package/src/strategies/chunk/markdown-headers.test.ts +37 -0
- package/src/strategies/chunk/markdown-headers.ts +106 -0
- package/src/strategies/chunk/semantic.test.ts +21 -0
- package/src/strategies/chunk/semantic.ts +80 -0
- package/src/strategies/chunk/token.test.ts +41 -0
- package/src/strategies/chunk/token.ts +72 -0
- package/src/strategies/chunk/whole-file.test.ts +24 -0
- package/src/strategies/chunk/whole-file.ts +35 -0
- package/tsconfig.json +21 -0
- package/typedoc.json +11 -0
- package/vitest.config.ts +19 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Creates the "whole-file" chunking strategy, which emits the entire text of
 * a file as one chunk.
 *
 * @returns {{
 *   name: string,
 *   chunk: (text: string, filePath?: string) => Promise<object[]>,
 *   extractMetadata: (text: string, filePath?: string) => object
 * }} The strategy object.
 */
export function wholeFileStrategy() {
  const DEFAULT_NAME = "whole-file";

  // Prefer the receiver's `name` (keeps overrides such as
  // `{ ...wholeFileStrategy(), name: "x" }` working) but fall back to the
  // built-in name when a method is called detached and `this` is undefined.
  const resolveName = (receiver) => receiver?.name ?? DEFAULT_NAME;

  // Line count is the number of "\n"-separated segments, so "" counts as 1.
  const countLines = (text) => text.split("\n").length;

  return {
    name: DEFAULT_NAME,

    /**
     * Wraps the text in a single chunk.
     *
     * @param {string} text - Full file contents.
     * @param {string} [filePath] - Path recorded on the chunk; `sourceFile`
     *   falls back to "unknown" when it is missing.
     * @returns {Promise<object[]>} One chunk, or an empty array when the text
     *   is missing or whitespace-only.
     */
    async chunk(text, filePath) {
      if (!text || text.trim().length === 0) {
        return [];
      }

      return [
        {
          content: text,
          metadata: {
            strategy: resolveName(this),
            source_file: filePath,
            char_count: text.length,
            line_count: countLines(text),
          },
          sourceFile: filePath || "unknown",
          commitHash: "",
        },
      ];
    },

    /**
     * Computes size metadata for the text without producing a chunk.
     *
     * @param {string} text - Text to describe; null/undefined is treated as "".
     * @param {string} [_filePath] - Unused.
     * @returns {{strategy: string, char_count: number, line_count: number}}
     */
    extractMetadata(text, _filePath) {
      const safeText = text ?? "";
      return {
        strategy: resolveName(this),
        char_count: safeText.length,
        line_count: countLines(safeText),
      };
    },
  };
}
|
|
31
|
+
//# sourceMappingURL=whole-file.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"whole-file.js","sourceRoot":"","sources":["../../../src/strategies/chunk/whole-file.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,iBAAiB;IAC/B,OAAO;QACL,IAAI,EAAE,YAAY;QAElB,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,QAAiB;YACzC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACtC,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,OAAO;gBACL;oBACE,OAAO,EAAE,IAAI;oBACb,QAAQ,EAAE;wBACR,QAAQ,EAAE,IAAI,CAAC,IAAI;wBACnB,WAAW,EAAE,QAAQ;wBACrB,UAAU,EAAE,IAAI,CAAC,MAAM;wBACvB,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM;qBACpC;oBACD,UAAU,EAAE,QAAQ,IAAI,SAAS;oBACjC,UAAU,EAAE,EAAE;iBACf;aACF,CAAC;QACJ,CAAC;QAED,eAAe,CAAC,IAAY,EAAE,SAAkB;YAC9C,OAAO;gBACL,QAAQ,EAAE,IAAI,CAAC,IAAI;gBACnB,UAAU,EAAE,IAAI,CAAC,MAAM;gBACvB,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM;aACpC,CAAC;QACJ,CAAC;KACF,CAAC;AACJ,CAAC"}
|
package/eslint.config.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import js from '@eslint/js';
|
|
2
|
+
import tseslint from 'typescript-eslint';
|
|
3
|
+
|
|
4
|
+
// Paths excluded from linting: build output, installed dependencies,
// coverage reports, and root-level tool configuration files.
const ignoredPaths = {
  ignores: ['dist/**', 'node_modules/**', 'coverage/**', '*.config.js', '*.config.ts'],
};

// Rule overrides applied to the TypeScript sources under src/.
const sourceRules = {
  files: ['src/**/*.ts'],
  rules: {
    '@typescript-eslint/no-explicit-any': 'warn',
    // Parameters prefixed with "_" are allowed to stay unused.
    '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
    'no-console': 'off',
    'no-undef': 'off',
  },
};

// Base JS rules first, then the typescript-eslint recommended set, then the
// project-specific ignore list and overrides.
export default tseslint.config(
  js.configs.recommended,
  ...tseslint.configs.recommended,
  ignoredPaths,
  sourceRules,
);
|
package/package.json
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@vivantel/rag-core",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Core RAG pipeline tools - universal chunking, embedding, vector store interfaces",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"bin": {
|
|
9
|
+
"rag-update": "./dist/bin/rag-update.js"
|
|
10
|
+
},
|
|
11
|
+
"scripts": {
|
|
12
|
+
"audit:fix": "npm audit fix",
|
|
13
|
+
"audit:report": "npm audit --json > npm-audit-report.json",
|
|
14
|
+
"audit": "npm audit",
|
|
15
|
+
"build:clean": "rm -rf dist && npm run build",
|
|
16
|
+
"build:verbose": "tsc --listEmittedFiles",
|
|
17
|
+
"build": "tsc && echo \"✅ Build completed successfully\"",
|
|
18
|
+
"check:full": "npm run check && npm run test:coverage && npm run type-coverage",
|
|
19
|
+
"check:quick": "npm run type-check && npm run lint",
|
|
20
|
+
"check": "npm run type-check && npm run lint && npm run format:check && npm run audit",
|
|
21
|
+
"clean:cache": "npm cache clean --force",
|
|
22
|
+
"clean:rebuild": "npm run clean && npm install && npm run build",
|
|
23
|
+
"clean": "rm -rf dist node_modules coverage docs",
|
|
24
|
+
"dev:debug": "tsx --inspect-brk src/index.ts",
|
|
25
|
+
"dev": "tsx watch src/index.ts",
|
|
26
|
+
"docs:generate": "typedoc",
|
|
27
|
+
"docs:open": "npm run docs:generate && open docs/index.html",
|
|
28
|
+
"docs:serve": "npm run docs:generate && npx http-server docs -p 8080",
|
|
29
|
+
"fix:aggressive": "npm run fix:all && npm run update:apply && npm run clean:rebuild",
|
|
30
|
+
"fix:all": "npm run fix && npm run audit:fix",
|
|
31
|
+
"fix": "npm run lint:fix && npm run format",
|
|
32
|
+
"format:check": "prettier --check \"src/**/*.ts\"",
|
|
33
|
+
"format": "prettier --write \"src/**/*.ts\"",
|
|
34
|
+
"help": "echo 'Available commands:' && grep -E '^\\s+\"[a-z]' package.json | sed 's/\"//g' | sed 's/://g' | column -t",
|
|
35
|
+
"info": "echo 'Node: $(node --version) | npm: $(npm --version)' && npm list --depth=0",
|
|
36
|
+
"lint:fix": "eslint src/ --fix",
|
|
37
|
+
"lint:strict": "eslint src/ --max-warnings 0",
|
|
38
|
+
"lint": "eslint src/",
|
|
39
|
+
"outdated:report": "npm outdated --json > npm-outdated-report.json",
|
|
40
|
+
"outdated": "npm outdated",
|
|
41
|
+
"postpublish": "echo \"✅ Published to npm\"",
|
|
42
|
+
"postversion": "git push && git push --tags",
|
|
43
|
+
"precommit": "npm run type-check && npm run format:check && npm run lint && npm audit",
|
|
44
|
+
"prepublishOnly": "npm run build && npm test",
|
|
45
|
+
"prepush": "npm run test:run && npm run audit && npm run format:check",
|
|
46
|
+
"release:major": "npm version major && npm publish",
|
|
47
|
+
"release:minor": "npm version minor && npm publish",
|
|
48
|
+
"release:patch": "npm version patch && npm publish",
|
|
49
|
+
"release:pre": "npm version prerelease && npm publish --tag next",
|
|
50
|
+
"status": "npm run outdated && npm run audit",
|
|
51
|
+
"test:coverage": "vitest --coverage --config vitest.config.ts",
|
|
52
|
+
"test:debug": "vitest --inspect-brk --config vitest.config.ts",
|
|
53
|
+
"test:run": "vitest run --config vitest.config.ts",
|
|
54
|
+
"test:update": "vitest -u --config vitest.config.ts",
|
|
55
|
+
"test:watch": "vitest watch --config vitest.config.ts",
|
|
56
|
+
"test": "vitest run --config vitest.config.ts",
|
|
57
|
+
"type-check:watch": "tsc --noEmit --watch",
|
|
58
|
+
"type-check": "tsc --noEmit",
|
|
59
|
+
"type-coverage:report": "type-coverage --detail",
|
|
60
|
+
"type-coverage": "type-coverage --at-least 90",
|
|
61
|
+
"update:apply": "npx npm-check-updates -u && npm install",
|
|
62
|
+
"update:check": "npx npm-check-updates",
|
|
63
|
+
"update": "npm update",
|
|
64
|
+
"version": "conventional-changelog -p angular -i CHANGELOG.md -s && git add CHANGELOG.md",
|
|
65
|
+
"watch:test": "vitest --watch --config vitest.config.ts",
|
|
66
|
+
"watch": "tsc --watch"
|
|
67
|
+
},
|
|
68
|
+
"keywords": [
|
|
69
|
+
"rag",
|
|
70
|
+
"embeddings",
|
|
71
|
+
"vector",
|
|
72
|
+
"git",
|
|
73
|
+
"pipeline"
|
|
74
|
+
],
|
|
75
|
+
"author": "Vivantel",
|
|
76
|
+
"license": "MIT",
|
|
77
|
+
"dependencies": {
|
|
78
|
+
"commander": "^15.0.0",
|
|
79
|
+
"dotenv": "^17.4.2",
|
|
80
|
+
"glob": "^13.0.6",
|
|
81
|
+
"simple-git": "^3.25.0"
|
|
82
|
+
},
|
|
83
|
+
"devDependencies": {
|
|
84
|
+
"@eslint/js": "^10.0.1",
|
|
85
|
+
"@types/node": "^25.9.1",
|
|
86
|
+
"eslint": "^10.4.1",
|
|
87
|
+
"prettier": "^3.8.3",
|
|
88
|
+
"tsx": "^4.11.0",
|
|
89
|
+
"typedoc": "^0.28.19",
|
|
90
|
+
"typedoc-plugin-markdown": "^4.11.0",
|
|
91
|
+
"typescript": "^6.0.3",
|
|
92
|
+
"typescript-eslint": "^8.60.0",
|
|
93
|
+
"vitest": "^4.1.7"
|
|
94
|
+
},
|
|
95
|
+
"engines": {
|
|
96
|
+
"node": ">=18.0.0"
|
|
97
|
+
},
|
|
98
|
+
"vitest": {
|
|
99
|
+
"environment": "node",
|
|
100
|
+
"globals": false
|
|
101
|
+
}
|
|
102
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { pathToFileURL } from "url";
|
|
2
|
+
import { RAGPipelineConfig } from "./core/orchestrator.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Loads a RAG pipeline configuration module and returns its default export.
 *
 * @param configPath - Filesystem path of the config module; it is converted
 *   to a `file:` URL before being imported.
 * @returns The module's default export.
 * @throws Error when the module has no default export or the export lacks
 *   `chunkers`, `embedder`, or `vectorStore`.
 */
export async function loadConfig(
  configPath: string,
): Promise<RAGPipelineConfig> {
  const configUrl = pathToFileURL(configPath);

  // Hot reload: this package is an ES module, where `require`/`require.cache`
  // do not exist. The ESM loader caches by full URL, so a unique query string
  // forces a fresh evaluation of the config on every call.
  const cacheBuster = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  configUrl.searchParams.set("reload", cacheBuster);

  const configModule = await import(configUrl.href);
  const config = configModule.default;

  // Check `config` itself first so a module without a default export gets
  // the descriptive error instead of a TypeError on property access.
  if (!config || !config.chunkers || !config.embedder || !config.vectorStore) {
    throw new Error(
      "Invalid config: missing chunkers, embedder, or vectorStore",
    );
  }

  return config;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from "vitest";
|
|
2
|
+
import { ChunkProcessor } from "./chunk-processor.js";
|
|
3
|
+
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
+
|
|
5
|
+
describe("ChunkProcessor", () => {
  // The single chunk the stubbed chunker resolves with.
  const stubbedChunk = {
    content: "test content",
    metadata: { type: "test" },
    sourceFile: "test.txt",
    commitHash: "abc123",
    contentHash: "hash123",
  };

  const mockChunker: FileChunker = {
    name: "test",
    patterns: ["**/*.txt"],
    chunk: vi.fn().mockResolvedValue([stubbedChunk]),
  };

  // Every test works on a fresh processor wired to the stubbed chunker.
  const buildProcessor = (): ChunkProcessor =>
    new ChunkProcessor([mockChunker]);

  it("should be instantiable", () => {
    expect(buildProcessor()).toBeInstanceOf(ChunkProcessor);
  });

  it("should have processFile method", () => {
    const { processFile } = buildProcessor();
    expect(processFile).toBeDefined();
    expect(typeof processFile).toBe("function");
  });

  it("should have processFiles method", () => {
    const { processFiles } = buildProcessor();
    expect(processFiles).toBeDefined();
    expect(typeof processFiles).toBe("function");
  });
});
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { Chunk, FileChunker } from "../interfaces/index.js";
|
|
2
|
+
import { createHash } from "crypto";
|
|
3
|
+
|
|
4
|
+
/**
 * Returns a short fingerprint of the given text: the first 16 hex characters
 * of its SHA-256 digest.
 */
function computeContentHash(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content);
  const hexDigest = hasher.digest("hex");
  return hexDigest.substring(0, 16);
}
|
|
7
|
+
|
|
8
|
+
/**
 * Runs file chunkers over a list of files and persists the resulting chunks
 * to a local JSON file.
 */
export class ChunkProcessor {
  // Chunkers indexed by their `name`.
  private chunkers: Map<string, FileChunker>;

  /**
   * @param chunkers - Chunkers to register, keyed internally by `name`.
   */
  constructor(chunkers: FileChunker[]) {
    this.chunkers = new Map(chunkers.map((c) => [c.name, c]));
  }

  /**
   * Chunks one file and stamps every chunk with its content hash, source
   * file, and commit hash.
   *
   * @param filePath - File handed to the chunker and recorded as `sourceFile`.
   * @param commitHash - Commit hash handed to the chunker and recorded on each chunk.
   * @param chunker - Chunker that produces the chunks.
   * @returns The stamped chunks, or an empty array when the chunker throws
   *   (the error is logged, not rethrown, so one bad file does not abort a run).
   */
  async processFile(
    filePath: string,
    commitHash: string,
    chunker: FileChunker,
  ): Promise<Chunk[]> {
    try {
      const chunks = await chunker.chunk(filePath, commitHash);

      // The chunk objects returned by the chunker are updated in place.
      for (const chunk of chunks) {
        chunk.contentHash = computeContentHash(chunk.content);
        chunk.sourceFile = filePath;
        chunk.commitHash = commitHash;
      }

      return chunks;
    } catch (error) {
      console.error(` ❌ Error processing ${filePath}: ${error}`);
      return [];
    }
  }

  /**
   * Processes files one after another and collects all generated chunks.
   *
   * @param files - File paths to process, in order.
   * @param fileState - Per-file commit hash and chunker; files without an
   *   entry are skipped with a warning.
   * @returns All chunks produced, in file order.
   */
  async processFiles(
    files: string[],
    fileState: Map<string, { commitHash: string; chunker: FileChunker }>,
  ): Promise<Chunk[]> {
    const allChunks: Chunk[] = [];

    for (let i = 0; i < files.length; i++) {
      const filePath = files[i];
      const info = fileState.get(filePath);

      if (!info) {
        console.log(` ⚠️ No chunker for: ${filePath}`);
        continue;
      }

      console.log(` [${i + 1}/${files.length}] ${filePath}`);

      const chunks = await this.processFile(
        filePath,
        info.commitHash,
        info.chunker,
      );

      if (chunks.length > 0) {
        allChunks.push(...chunks);
        console.log(` ✅ Generated ${chunks.length} chunk(s)`);
      } else {
        console.log(` ⚠️ No chunks generated (skipped)`);
      }
    }

    return allChunks;
  }

  /**
   * Merges chunks into the JSON store at `outputFile`: stored chunks whose
   * `sourceFile` appears in `chunks` are replaced, all others are kept.
   *
   * @param chunks - Newly generated chunks.
   * @param outputFile - JSON file to update; parent directories are created.
   * @throws Error when the existing store cannot be read (for any reason
   *   other than not existing) or does not contain a JSON array. Overwriting
   *   in that situation would silently drop every previously stored chunk.
   */
  async saveChunksLocal(chunks: Chunk[], outputFile: string): Promise<void> {
    const { dirname } = await import("path");
    const { mkdir, writeFile, readFile } = await import("fs/promises");

    await mkdir(dirname(outputFile), { recursive: true });

    let stored: string | undefined;
    try {
      stored = await readFile(outputFile, "utf-8");
    } catch (error) {
      // Only a missing file means "start from an empty store".
      if ((error as { code?: string }).code !== "ENOENT") {
        throw error;
      }
    }

    let existing: Chunk[] = [];
    // An empty or whitespace-only file is treated as an empty store.
    if (stored !== undefined && stored.trim().length > 0) {
      let parsed: unknown;
      try {
        parsed = JSON.parse(stored);
      } catch (error) {
        throw new Error(
          `Existing chunk store ${outputFile} is not valid JSON: ${error}`,
        );
      }
      if (!Array.isArray(parsed)) {
        throw new Error(
          `Existing chunk store ${outputFile} does not contain a JSON array`,
        );
      }
      existing = parsed as Chunk[];
    }

    const processedFiles = new Set(chunks.map((c) => c.sourceFile));
    const filtered = existing.filter((c) => !processedFiles.has(c.sourceFile));

    const allChunks = [...filtered, ...chunks];

    await writeFile(outputFile, JSON.stringify(allChunks, null, 2));
    console.log(`\n💾 Saved ${allChunks.length} chunks to ${outputFile}`);
  }
}
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
import {
|
|
2
|
+
EmbeddingProvider,
|
|
3
|
+
EmbeddedChunk,
|
|
4
|
+
Chunk,
|
|
5
|
+
} from "../interfaces/index.js";
|
|
6
|
+
import { readFile, writeFile, mkdir } from "fs/promises";
|
|
7
|
+
import { dirname } from "path";
|
|
8
|
+
import { createHash } from "crypto";
|
|
9
|
+
|
|
10
|
+
/**
 * Returns the chunk's identity hash: its stored `contentHash` when that is
 * set, otherwise the first 16 hex characters of the SHA-256 of its content.
 */
function chunkContentHash(chunk: Chunk): string {
  const { contentHash, content } = chunk;
  if (contentHash) {
    return contentHash;
  }

  const digest = createHash("sha256").update(content).digest("hex");
  return digest.slice(0, 16);
}
|
|
14
|
+
|
|
15
|
+
/**
 * Generates embeddings for chunks stored in a JSON file and keeps a sibling
 * JSON file of embedded chunks up to date, re-embedding only chunks whose
 * content hash is not already present.
 */
export class EmbedderProcessor {
  private provider: EmbeddingProvider;
  private rateLimitMs: number;
  private batchSize: number;

  /**
   * @param provider - Embedding backend.
   * @param options.rateLimitMs - Pause between single-chunk requests (default 500).
   * @param options.batchSize - Minimum number of chunks at which the
   *   provider's `embedBatch` is used instead of per-chunk calls (default 10).
   */
  constructor(
    provider: EmbeddingProvider,
    options: { rateLimitMs?: number; batchSize?: number } = {},
  ) {
    this.provider = provider;
    this.rateLimitMs = options.rateLimitMs ?? 500;
    this.batchSize = options.batchSize ?? 10;
  }

  private async sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Derives the embeddings file path by replacing the first "chunks" in the
   * chunks path with "embeddings". When the path contains no "chunks", the
   * replacement would return the chunks path itself and saving would
   * overwrite the chunks file, so a distinct sibling path is used instead.
   */
  private embeddingsPathFor(chunksFile: string): string {
    const derived = chunksFile.replace("chunks", "embeddings");
    return derived !== chunksFile ? derived : `${chunksFile}.embeddings.json`;
  }

  /**
   * Reads previously saved embeddings. A missing file yields an empty list;
   * an unreadable or malformed file also yields an empty list (the affected
   * chunks are simply embedded again) but is reported instead of ignored.
   */
  private async readExistingEmbeddings(
    embeddingsFile: string,
  ): Promise<EmbeddedChunk[]> {
    let raw: string;
    try {
      raw = await readFile(embeddingsFile, "utf-8");
    } catch (error) {
      if ((error as { code?: string }).code !== "ENOENT") {
        console.warn(` ⚠️ Could not read ${embeddingsFile}: ${error}`);
      }
      return [];
    }

    try {
      const parsed: unknown = JSON.parse(raw);
      if (Array.isArray(parsed)) {
        return parsed as EmbeddedChunk[];
      }
      console.warn(` ⚠️ Ignoring ${embeddingsFile}: not a JSON array`);
    } catch (error) {
      console.warn(` ⚠️ Ignoring ${embeddingsFile}: invalid JSON (${error})`);
    }
    return [];
  }

  /**
   * Embeds one chunk.
   *
   * @returns The chunk plus its embedding and an `embeddedAt` timestamp in
   *   seconds since the epoch.
   */
  async embedChunk(chunk: Chunk): Promise<EmbeddedChunk> {
    const embedding = await this.provider.embed(chunk.content);

    return {
      ...chunk,
      embedding,
      embeddedAt: Date.now() / 1000,
    };
  }

  /**
   * Embeds a list of chunks. Uses a single provider `embedBatch` call when
   * the provider offers one and there are at least `batchSize` chunks;
   * otherwise embeds chunk by chunk, pausing `rateLimitMs` between requests.
   *
   * @throws Error when the provider's batch call returns a different number
   *   of embeddings than chunks sent.
   */
  async embedBatch(chunks: Chunk[]): Promise<EmbeddedChunk[]> {
    const results: EmbeddedChunk[] = [];

    if (this.provider.embedBatch && chunks.length >= this.batchSize) {
      const texts = chunks.map((c) => c.content);
      const embeddings = await this.provider.embedBatch(texts);

      // Results are paired with chunks by index; a short response would
      // otherwise store chunks with an undefined embedding.
      if (embeddings.length !== chunks.length) {
        throw new Error(
          `Embedding provider returned ${embeddings.length} embedding(s) for ${chunks.length} chunk(s)`,
        );
      }

      for (let i = 0; i < chunks.length; i++) {
        results.push({
          ...chunks[i],
          embedding: embeddings[i],
          embeddedAt: Date.now() / 1000,
        });
      }
    } else {
      for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];
        // Progress label: event type, then title, then file name.
        const eventType =
          (chunk.metadata.event_type as string) ||
          (chunk.metadata.title as string) ||
          chunk.sourceFile.split("/").pop() ||
          "unknown";

        console.log(` [${i + 1}/${chunks.length}] ${eventType}`);

        const embedded = await this.embedChunk(chunk);
        results.push(embedded);

        // No pause after the last chunk.
        if (this.rateLimitMs > 0 && i < chunks.length - 1) {
          await this.sleep(this.rateLimitMs);
        }
      }
    }

    return results;
  }

  /**
   * Loads the chunks file and selects the chunks that still need embedding.
   *
   * @param chunksFile - JSON file containing an array of chunks.
   * @param force - When true, every chunk is returned regardless of existing embeddings.
   * @returns Chunks whose content hash has no saved embedding.
   * @throws Error when the chunks file is missing, unreadable, or not a JSON array.
   */
  async getChunksToEmbed(
    chunksFile: string,
    force: boolean = false,
  ): Promise<{
    chunksToEmbed: Chunk[];
  }> {
    let raw: string;
    try {
      raw = await readFile(chunksFile, "utf-8");
    } catch (error) {
      if ((error as { code?: string }).code === "ENOENT") {
        throw new Error(`Chunks file not found: ${chunksFile}`);
      }
      throw new Error(`Failed to read chunks file ${chunksFile}: ${error}`);
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error) {
      throw new Error(`Chunks file ${chunksFile} is not valid JSON: ${error}`);
    }
    if (!Array.isArray(parsed)) {
      throw new Error(`Chunks file ${chunksFile} does not contain a JSON array`);
    }
    const chunks = parsed as Chunk[];

    console.log(`📂 Loaded ${chunks.length} chunks from ${chunksFile}`);

    if (force) {
      console.log(" ⚠️ Force mode: embedding all chunks");
      return { chunksToEmbed: chunks };
    }

    const existingEmbeddings = await this.readExistingEmbeddings(
      this.embeddingsPathFor(chunksFile),
    );

    const existingState = new Map<string, EmbeddedChunk>();
    for (const emb of existingEmbeddings) {
      const hash = emb.contentHash || chunkContentHash(emb);
      existingState.set(hash, emb);
    }

    console.log(`📊 Existing embeddings: ${existingState.size} chunks`);

    const chunksToEmbed: Chunk[] = [];
    for (const chunk of chunks) {
      const chunkHash = chunkContentHash(chunk);
      if (!existingState.has(chunkHash)) {
        chunksToEmbed.push(chunk);
      }
    }

    return { chunksToEmbed };
  }

  /**
   * Writes embeddings to the file derived from `chunksFile`.
   *
   * @param newEmbeddings - Freshly embedded chunks.
   * @param chunksFile - Chunks file path used to derive the embeddings path.
   * @param force - When true the file is replaced with `newEmbeddings` only;
   *   otherwise existing entries are kept unless a new embedding has the
   *   same content hash.
   */
  async saveEmbeddings(
    newEmbeddings: EmbeddedChunk[],
    chunksFile: string,
    force: boolean = false,
  ): Promise<void> {
    const embeddingsFile = this.embeddingsPathFor(chunksFile);
    await mkdir(dirname(embeddingsFile), { recursive: true });

    const newByHash = new Map<string, EmbeddedChunk>();
    for (const emb of newEmbeddings) {
      const hash = emb.contentHash || chunkContentHash(emb);
      newByHash.set(hash, emb);
    }

    const existing = force
      ? []
      : await this.readExistingEmbeddings(embeddingsFile);

    // Keep only the stored entries that are not superseded by a new one.
    const final = existing.filter((e) => {
      const hash = e.contentHash || chunkContentHash(e);
      return !newByHash.has(hash);
    });

    final.push(...newEmbeddings);

    await writeFile(embeddingsFile, JSON.stringify(final, null, 2));
    console.log(`\n💾 Saved ${final.length} embeddings to ${embeddingsFile}`);
    console.log(
      ` New: ${newEmbeddings.length}, Existing: ${final.length - newEmbeddings.length}`,
    );
  }

  /**
   * Full incremental pass: selects the chunks that need embedding, embeds
   * them, and saves the result.
   *
   * @returns The newly created embeddings (empty when nothing needed embedding).
   */
  async run(
    chunksFile: string,
    force: boolean = false,
  ): Promise<EmbeddedChunk[]> {
    console.log("🔢 Starting incremental embedding generation...");

    const { chunksToEmbed } = await this.getChunksToEmbed(chunksFile, force);

    if (chunksToEmbed.length === 0) {
      console.log("\n✨ No chunks need embedding.");
      return [];
    }

    console.log(`\n📝 Need to embed ${chunksToEmbed.length} chunks`);

    const newEmbeddings = await this.embedBatch(chunksToEmbed);
    await this.saveEmbeddings(newEmbeddings, chunksFile, force);

    return newEmbeddings;
  }
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
|
+
import { GitTracker } from "./git-tracker.js";
|
|
3
|
+
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
+
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "fs";
|
|
5
|
+
import { join } from "path";
|
|
6
|
+
import { tmpdir } from "os";
|
|
7
|
+
import { execSync } from "child_process";
|
|
8
|
+
|
|
9
|
+
describe("GitTracker", () => {
  let testDir: string;
  let originalCwd: string;

  const mockChunker: FileChunker = {
    name: "test",
    patterns: ["**/*.txt", "**/*.yaml", "**/*.json"],
    chunk: vi.fn().mockResolvedValue([]),
  };

  // Runs a shell command in the current working directory with output suppressed.
  const runSilently = (command: string): void => {
    execSync(command, { stdio: "ignore" });
  };

  const createTracker = (): GitTracker => new GitTracker([mockChunker]);

  // Each test gets its own temporary git repository with one commit that
  // contains a text file, a YAML file in a nested directory, and a JSON file.
  beforeEach(() => {
    testDir = mkdtempSync(join(tmpdir(), "git-test-"));
    originalCwd = process.cwd();
    process.chdir(testDir);

    mkdirSync(join(testDir, "src", "events"), { recursive: true });

    const fixtures: Array<[string, string]> = [
      [join(testDir, "test.txt"), "test content"],
      [
        join(testDir, "src", "events", "booking.yaml"),
        "event_type: BookingCreated",
      ],
      [join(testDir, "config.json"), '{"key": "value"}'],
    ];
    for (const [target, contents] of fixtures) {
      writeFileSync(target, contents);
    }

    const setupCommands = [
      "git init",
      'git config user.email "test@example.com"',
      'git config user.name "Test"',
      "git add .",
      'git commit -m "initial"',
    ];
    for (const command of setupCommands) {
      runSilently(command);
    }
  });

  // Leave the temporary directory before deleting it.
  afterEach(() => {
    process.chdir(originalCwd);
    rmSync(testDir, { recursive: true, force: true });
  });

  it("should be instantiable", () => {
    expect(createTracker()).toBeInstanceOf(GitTracker);
  });

  it("should getAllTrackedFiles", async () => {
    const trackedFiles = await createTracker().getAllTrackedFiles();

    expect(trackedFiles.length).toBeGreaterThan(0);
    const hasTextFile = trackedFiles.some((f) => f.includes("test.txt"));
    expect(hasTextFile).toBe(true);
  });

  it("should getCurrentState", async () => {
    const currentState = await createTracker().getCurrentState();

    expect(currentState.size).toBeGreaterThan(0);
  });
});
|