@vivantel/rag-core 1.1.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/rag-update.d.ts +3 -0
- package/dist/bin/rag-update.d.ts.map +1 -0
- package/dist/bin/rag-update.js +116 -0
- package/dist/bin/rag-update.js.map +1 -0
- package/dist/cli/init.d.ts +2 -0
- package/dist/cli/init.d.ts.map +1 -0
- package/dist/cli/init.js +262 -0
- package/dist/cli/init.js.map +1 -0
- package/dist/cli/validate.d.ts +2 -0
- package/dist/cli/validate.d.ts.map +1 -0
- package/dist/cli/validate.js +54 -0
- package/dist/cli/validate.js.map +1 -0
- package/dist/config-loader.d.ts.map +1 -1
- package/dist/config-loader.js +73 -7
- package/dist/config-loader.js.map +1 -1
- package/dist/core/chunk-processor.d.ts +1 -1
- package/dist/core/chunk-processor.d.ts.map +1 -1
- package/dist/core/chunk-processor.js +50 -21
- package/dist/core/chunk-processor.js.map +1 -1
- package/dist/core/embedder.d.ts +5 -1
- package/dist/core/embedder.d.ts.map +1 -1
- package/dist/core/embedder.js +40 -29
- package/dist/core/embedder.js.map +1 -1
- package/dist/core/errors.d.ts +16 -0
- package/dist/core/errors.d.ts.map +1 -0
- package/dist/core/errors.js +17 -0
- package/dist/core/errors.js.map +1 -0
- package/dist/core/git-tracker.d.ts.map +1 -1
- package/dist/core/git-tracker.js +9 -59
- package/dist/core/git-tracker.js.map +1 -1
- package/dist/core/orchestrator.d.ts +8 -0
- package/dist/core/orchestrator.d.ts.map +1 -1
- package/dist/core/orchestrator.js +153 -37
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/plugin-discovery.d.ts +19 -0
- package/dist/core/plugin-discovery.d.ts.map +1 -0
- package/dist/core/plugin-discovery.js +47 -0
- package/dist/core/plugin-discovery.js.map +1 -0
- package/dist/core/telemetry.d.ts +61 -0
- package/dist/core/telemetry.d.ts.map +1 -0
- package/dist/core/telemetry.js +50 -0
- package/dist/core/telemetry.js.map +1 -0
- package/dist/core/uploader.d.ts +5 -1
- package/dist/core/uploader.d.ts.map +1 -1
- package/dist/core/uploader.js +23 -7
- package/dist/core/uploader.js.map +1 -1
- package/dist/core/utils.d.ts +7 -0
- package/dist/core/utils.d.ts.map +1 -1
- package/dist/core/utils.js +35 -0
- package/dist/core/utils.js.map +1 -1
- package/dist/index.d.ts +14 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -2
- package/dist/index.js.map +1 -1
- package/dist/interfaces/embedder.d.ts +2 -0
- package/dist/interfaces/embedder.d.ts.map +1 -1
- package/dist/interfaces/vector-store.d.ts +2 -0
- package/dist/interfaces/vector-store.d.ts.map +1 -1
- package/dist/strategies/chunk/token.js +1 -1
- package/dist/strategies/chunk/token.js.map +1 -1
- package/package.json +12 -2
- package/.github/config/release-please.json +0 -38
- package/.github/dependabot.yaml +0 -28
- package/.github/workflows/ci.yaml +0 -119
- package/.github/workflows/publish.yaml +0 -155
- package/.github/workflows/release.yaml +0 -54
- package/.release-please-manifest.json +0 -3
- package/.versionrc.json +0 -19
- package/CHANGELOG.md +0 -51
- package/bin/rag-update.ts +0 -49
- package/eslint.config.js +0 -25
- package/src/config-loader.ts +0 -21
- package/src/core/chunk-processor.test.ts +0 -36
- package/src/core/chunk-processor.ts +0 -92
- package/src/core/embedder.ts +0 -189
- package/src/core/git-tracker.test.ts +0 -64
- package/src/core/git-tracker.ts +0 -202
- package/src/core/orchestrator.test.ts +0 -53
- package/src/core/orchestrator.ts +0 -97
- package/src/core/uploader.ts +0 -123
- package/src/core/utils.ts +0 -27
- package/src/helpers/create-chunker.test.ts +0 -31
- package/src/helpers/create-chunker.ts +0 -40
- package/src/index.test.ts +0 -33
- package/src/index.ts +0 -30
- package/src/interfaces/chunker.ts +0 -59
- package/src/interfaces/embedder.ts +0 -36
- package/src/interfaces/index.test.ts +0 -9
- package/src/interfaces/index.ts +0 -3
- package/src/interfaces/vector-store.ts +0 -71
- package/src/strategies/chunk/index.ts +0 -4
- package/src/strategies/chunk/markdown-headers.test.ts +0 -37
- package/src/strategies/chunk/markdown-headers.ts +0 -106
- package/src/strategies/chunk/semantic.test.ts +0 -21
- package/src/strategies/chunk/semantic.ts +0 -80
- package/src/strategies/chunk/token.test.ts +0 -41
- package/src/strategies/chunk/token.ts +0 -72
- package/src/strategies/chunk/whole-file.test.ts +0 -24
- package/src/strategies/chunk/whole-file.ts +0 -35
- package/tsconfig.json +0 -21
- package/typedoc.json +0 -11
- package/vitest.config.ts +0 -19
package/eslint.config.js
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import js from '@eslint/js';
|
|
2
|
-
import tseslint from 'typescript-eslint';
|
|
3
|
-
|
|
4
|
-
export default tseslint.config(
|
|
5
|
-
js.configs.recommended,
|
|
6
|
-
...tseslint.configs.recommended,
|
|
7
|
-
{
|
|
8
|
-
ignores: [
|
|
9
|
-
'dist/**',
|
|
10
|
-
'node_modules/**',
|
|
11
|
-
'coverage/**',
|
|
12
|
-
'*.config.js',
|
|
13
|
-
'*.config.ts'
|
|
14
|
-
]
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
files: ['src/**/*.ts'],
|
|
18
|
-
rules: {
|
|
19
|
-
'@typescript-eslint/no-explicit-any': 'warn',
|
|
20
|
-
'@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
|
|
21
|
-
'no-console': 'off',
|
|
22
|
-
'no-undef': 'off'
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
|
-
);
|
package/src/config-loader.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import { pathToFileURL } from "url";
|
|
2
|
-
import { RAGPipelineConfig } from "./core/orchestrator.js";
|
|
3
|
-
|
|
4
|
-
export async function loadConfig(
|
|
5
|
-
configPath: string,
|
|
6
|
-
): Promise<RAGPipelineConfig> {
|
|
7
|
-
// Clear cache for hot reload
|
|
8
|
-
delete require.cache[require.resolve(configPath)];
|
|
9
|
-
|
|
10
|
-
const configUrl = pathToFileURL(configPath).href;
|
|
11
|
-
const configModule = await import(configUrl);
|
|
12
|
-
const config = configModule.default;
|
|
13
|
-
|
|
14
|
-
if (!config.chunkers || !config.embedder || !config.vectorStore) {
|
|
15
|
-
throw new Error(
|
|
16
|
-
"Invalid config: missing chunkers, embedder, or vectorStore",
|
|
17
|
-
);
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
return config;
|
|
21
|
-
}
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi } from "vitest";
|
|
2
|
-
import { ChunkProcessor } from "./chunk-processor.js";
|
|
3
|
-
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
-
|
|
5
|
-
describe("ChunkProcessor", () => {
|
|
6
|
-
const mockChunker: FileChunker = {
|
|
7
|
-
name: "test",
|
|
8
|
-
patterns: ["**/*.txt"],
|
|
9
|
-
chunk: vi.fn().mockResolvedValue([
|
|
10
|
-
{
|
|
11
|
-
content: "test content",
|
|
12
|
-
metadata: { type: "test" },
|
|
13
|
-
sourceFile: "test.txt",
|
|
14
|
-
commitHash: "abc123",
|
|
15
|
-
contentHash: "hash123",
|
|
16
|
-
},
|
|
17
|
-
]),
|
|
18
|
-
};
|
|
19
|
-
|
|
20
|
-
it("should be instantiable", () => {
|
|
21
|
-
const processor = new ChunkProcessor([mockChunker]);
|
|
22
|
-
expect(processor).toBeInstanceOf(ChunkProcessor);
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
it("should have processFile method", () => {
|
|
26
|
-
const processor = new ChunkProcessor([mockChunker]);
|
|
27
|
-
expect(processor.processFile).toBeDefined();
|
|
28
|
-
expect(typeof processor.processFile).toBe("function");
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
it("should have processFiles method", () => {
|
|
32
|
-
const processor = new ChunkProcessor([mockChunker]);
|
|
33
|
-
expect(processor.processFiles).toBeDefined();
|
|
34
|
-
expect(typeof processor.processFiles).toBe("function");
|
|
35
|
-
});
|
|
36
|
-
});
|
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
import { Chunk, FileChunker } from "../interfaces/index.js";
|
|
2
|
-
import { createHash } from "crypto";
|
|
3
|
-
|
|
4
|
-
function computeContentHash(content: string): string {
|
|
5
|
-
return createHash("sha256").update(content).digest("hex").slice(0, 16);
|
|
6
|
-
}
|
|
7
|
-
|
|
8
|
-
export class ChunkProcessor {
|
|
9
|
-
private chunkers: Map<string, FileChunker>;
|
|
10
|
-
|
|
11
|
-
constructor(chunkers: FileChunker[]) {
|
|
12
|
-
this.chunkers = new Map(chunkers.map((c) => [c.name, c]));
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
async processFile(
|
|
16
|
-
filePath: string,
|
|
17
|
-
commitHash: string,
|
|
18
|
-
chunker: FileChunker,
|
|
19
|
-
): Promise<Chunk[]> {
|
|
20
|
-
try {
|
|
21
|
-
const chunks = await chunker.chunk(filePath, commitHash);
|
|
22
|
-
|
|
23
|
-
for (const chunk of chunks) {
|
|
24
|
-
chunk.contentHash = computeContentHash(chunk.content);
|
|
25
|
-
chunk.sourceFile = filePath;
|
|
26
|
-
chunk.commitHash = commitHash;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
return chunks;
|
|
30
|
-
} catch (error) {
|
|
31
|
-
console.error(` ❌ Error processing ${filePath}: ${error}`);
|
|
32
|
-
return [];
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
async processFiles(
|
|
37
|
-
files: string[],
|
|
38
|
-
fileState: Map<string, { commitHash: string; chunker: FileChunker }>,
|
|
39
|
-
): Promise<Chunk[]> {
|
|
40
|
-
const allChunks: Chunk[] = [];
|
|
41
|
-
|
|
42
|
-
for (let i = 0; i < files.length; i++) {
|
|
43
|
-
const filePath = files[i];
|
|
44
|
-
const info = fileState.get(filePath);
|
|
45
|
-
|
|
46
|
-
if (!info) {
|
|
47
|
-
console.log(` ⚠️ No chunker for: ${filePath}`);
|
|
48
|
-
continue;
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
console.log(` [${i + 1}/${files.length}] ${filePath}`);
|
|
52
|
-
|
|
53
|
-
const chunks = await this.processFile(
|
|
54
|
-
filePath,
|
|
55
|
-
info.commitHash,
|
|
56
|
-
info.chunker,
|
|
57
|
-
);
|
|
58
|
-
|
|
59
|
-
if (chunks.length > 0) {
|
|
60
|
-
allChunks.push(...chunks);
|
|
61
|
-
console.log(` ✅ Generated ${chunks.length} chunk(s)`);
|
|
62
|
-
} else {
|
|
63
|
-
console.log(` ⚠️ No chunks generated (skipped)`);
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
return allChunks;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
async saveChunksLocal(chunks: Chunk[], outputFile: string): Promise<void> {
|
|
71
|
-
const { dirname } = await import("path");
|
|
72
|
-
const { mkdir, writeFile, readFile } = await import("fs/promises");
|
|
73
|
-
|
|
74
|
-
await mkdir(dirname(outputFile), { recursive: true });
|
|
75
|
-
|
|
76
|
-
let existing: Chunk[] = [];
|
|
77
|
-
try {
|
|
78
|
-
const content = await readFile(outputFile, "utf-8");
|
|
79
|
-
existing = JSON.parse(content);
|
|
80
|
-
} catch {
|
|
81
|
-
// File doesn't exist
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
const processedFiles = new Set(chunks.map((c) => c.sourceFile));
|
|
85
|
-
const filtered = existing.filter((c) => !processedFiles.has(c.sourceFile));
|
|
86
|
-
|
|
87
|
-
const allChunks = [...filtered, ...chunks];
|
|
88
|
-
|
|
89
|
-
await writeFile(outputFile, JSON.stringify(allChunks, null, 2));
|
|
90
|
-
console.log(`\n💾 Saved ${allChunks.length} chunks to ${outputFile}`);
|
|
91
|
-
}
|
|
92
|
-
}
|
package/src/core/embedder.ts
DELETED
|
@@ -1,189 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
EmbeddingProvider,
|
|
3
|
-
EmbeddedChunk,
|
|
4
|
-
Chunk,
|
|
5
|
-
} from "../interfaces/index.js";
|
|
6
|
-
import { readFile, writeFile, mkdir } from "fs/promises";
|
|
7
|
-
import { dirname } from "path";
|
|
8
|
-
import { createHash } from "crypto";
|
|
9
|
-
|
|
10
|
-
function chunkContentHash(chunk: Chunk): string {
|
|
11
|
-
if (chunk.contentHash) return chunk.contentHash;
|
|
12
|
-
return createHash("sha256").update(chunk.content).digest("hex").slice(0, 16);
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
export class EmbedderProcessor {
|
|
16
|
-
private provider: EmbeddingProvider;
|
|
17
|
-
private rateLimitMs: number;
|
|
18
|
-
private batchSize: number;
|
|
19
|
-
|
|
20
|
-
constructor(
|
|
21
|
-
provider: EmbeddingProvider,
|
|
22
|
-
options: { rateLimitMs?: number; batchSize?: number } = {},
|
|
23
|
-
) {
|
|
24
|
-
this.provider = provider;
|
|
25
|
-
this.rateLimitMs = options.rateLimitMs ?? 500;
|
|
26
|
-
this.batchSize = options.batchSize ?? 10;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
private async sleep(ms: number): Promise<void> {
|
|
30
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
async embedChunk(chunk: Chunk): Promise<EmbeddedChunk> {
|
|
34
|
-
const embedding = await this.provider.embed(chunk.content);
|
|
35
|
-
|
|
36
|
-
return {
|
|
37
|
-
...chunk,
|
|
38
|
-
embedding,
|
|
39
|
-
embeddedAt: Date.now() / 1000,
|
|
40
|
-
};
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
async embedBatch(chunks: Chunk[]): Promise<EmbeddedChunk[]> {
|
|
44
|
-
const results: EmbeddedChunk[] = [];
|
|
45
|
-
|
|
46
|
-
if (this.provider.embedBatch && chunks.length >= this.batchSize) {
|
|
47
|
-
const texts = chunks.map((c) => c.content);
|
|
48
|
-
const embeddings = await this.provider.embedBatch(texts);
|
|
49
|
-
|
|
50
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
51
|
-
results.push({
|
|
52
|
-
...chunks[i],
|
|
53
|
-
embedding: embeddings[i],
|
|
54
|
-
embeddedAt: Date.now() / 1000,
|
|
55
|
-
});
|
|
56
|
-
}
|
|
57
|
-
} else {
|
|
58
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
59
|
-
const chunk = chunks[i];
|
|
60
|
-
const eventType =
|
|
61
|
-
(chunk.metadata.event_type as string) ||
|
|
62
|
-
(chunk.metadata.title as string) ||
|
|
63
|
-
chunk.sourceFile.split("/").pop() ||
|
|
64
|
-
"unknown";
|
|
65
|
-
|
|
66
|
-
console.log(` [${i + 1}/${chunks.length}] ${eventType}`);
|
|
67
|
-
|
|
68
|
-
const embedded = await this.embedChunk(chunk);
|
|
69
|
-
results.push(embedded);
|
|
70
|
-
|
|
71
|
-
if (this.rateLimitMs > 0 && i < chunks.length - 1) {
|
|
72
|
-
await this.sleep(this.rateLimitMs);
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
return results;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
async getChunksToEmbed(
|
|
81
|
-
chunksFile: string,
|
|
82
|
-
force: boolean = false,
|
|
83
|
-
): Promise<{
|
|
84
|
-
chunksToEmbed: Chunk[];
|
|
85
|
-
}> {
|
|
86
|
-
let chunks: Chunk[];
|
|
87
|
-
try {
|
|
88
|
-
const content = await readFile(chunksFile, "utf-8");
|
|
89
|
-
chunks = JSON.parse(content);
|
|
90
|
-
} catch {
|
|
91
|
-
throw new Error(`Chunks file not found: ${chunksFile}`);
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
console.log(`📖 Loaded ${chunks.length} chunks from ${chunksFile}`);
|
|
95
|
-
|
|
96
|
-
if (force) {
|
|
97
|
-
console.log(" ⚠️ Force mode: embedding all chunks");
|
|
98
|
-
return { chunksToEmbed: chunks };
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
let existingEmbeddings: EmbeddedChunk[] = [];
|
|
102
|
-
const embeddingsFile = chunksFile.replace("chunks", "embeddings");
|
|
103
|
-
try {
|
|
104
|
-
const content = await readFile(embeddingsFile, "utf-8");
|
|
105
|
-
existingEmbeddings = JSON.parse(content);
|
|
106
|
-
} catch {
|
|
107
|
-
// No existing embeddings
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
const existingState = new Map<string, EmbeddedChunk>();
|
|
111
|
-
for (const emb of existingEmbeddings) {
|
|
112
|
-
const hash = emb.contentHash || chunkContentHash(emb);
|
|
113
|
-
existingState.set(hash, emb);
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
console.log(`📊 Existing embeddings: ${existingState.size} chunks`);
|
|
117
|
-
|
|
118
|
-
const chunksToEmbed: Chunk[] = [];
|
|
119
|
-
for (const chunk of chunks) {
|
|
120
|
-
const chunkHash = chunkContentHash(chunk);
|
|
121
|
-
if (!existingState.has(chunkHash)) {
|
|
122
|
-
chunksToEmbed.push(chunk);
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
return { chunksToEmbed };
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
async saveEmbeddings(
|
|
130
|
-
newEmbeddings: EmbeddedChunk[],
|
|
131
|
-
chunksFile: string,
|
|
132
|
-
force: boolean = false,
|
|
133
|
-
): Promise<void> {
|
|
134
|
-
const embeddingsFile = chunksFile.replace("chunks", "embeddings");
|
|
135
|
-
await mkdir(dirname(embeddingsFile), { recursive: true });
|
|
136
|
-
|
|
137
|
-
const newByHash = new Map<string, EmbeddedChunk>();
|
|
138
|
-
for (const emb of newEmbeddings) {
|
|
139
|
-
const hash = emb.contentHash || chunkContentHash(emb);
|
|
140
|
-
newByHash.set(hash, emb);
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
let existing: EmbeddedChunk[] = [];
|
|
144
|
-
if (!force) {
|
|
145
|
-
try {
|
|
146
|
-
const content = await readFile(embeddingsFile, "utf-8");
|
|
147
|
-
existing = JSON.parse(content);
|
|
148
|
-
} catch {
|
|
149
|
-
// No existing embeddings
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
const final = force
|
|
154
|
-
? []
|
|
155
|
-
: existing.filter((e) => {
|
|
156
|
-
const hash = e.contentHash || chunkContentHash(e);
|
|
157
|
-
return !newByHash.has(hash);
|
|
158
|
-
});
|
|
159
|
-
|
|
160
|
-
final.push(...newEmbeddings);
|
|
161
|
-
|
|
162
|
-
await writeFile(embeddingsFile, JSON.stringify(final, null, 2));
|
|
163
|
-
console.log(`\n💾 Saved ${final.length} embeddings to ${embeddingsFile}`);
|
|
164
|
-
console.log(
|
|
165
|
-
` New: ${newEmbeddings.length}, Existing: ${final.length - newEmbeddings.length}`,
|
|
166
|
-
);
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
async run(
|
|
170
|
-
chunksFile: string,
|
|
171
|
-
force: boolean = false,
|
|
172
|
-
): Promise<EmbeddedChunk[]> {
|
|
173
|
-
console.log("🔢 Starting incremental embedding generation...");
|
|
174
|
-
|
|
175
|
-
const { chunksToEmbed } = await this.getChunksToEmbed(chunksFile, force);
|
|
176
|
-
|
|
177
|
-
if (chunksToEmbed.length === 0) {
|
|
178
|
-
console.log("\n✨ No chunks need embedding.");
|
|
179
|
-
return [];
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
console.log(`\n📝 Need to embed ${chunksToEmbed.length} chunks`);
|
|
183
|
-
|
|
184
|
-
const newEmbeddings = await this.embedBatch(chunksToEmbed);
|
|
185
|
-
await this.saveEmbeddings(newEmbeddings, chunksFile, force);
|
|
186
|
-
|
|
187
|
-
return newEmbeddings;
|
|
188
|
-
}
|
|
189
|
-
}
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
|
-
import { GitTracker } from "./git-tracker.js";
|
|
3
|
-
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
-
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "fs";
|
|
5
|
-
import { join } from "path";
|
|
6
|
-
import { tmpdir } from "os";
|
|
7
|
-
import { execSync } from "child_process";
|
|
8
|
-
|
|
9
|
-
describe("GitTracker", () => {
|
|
10
|
-
let testDir: string;
|
|
11
|
-
let originalCwd: string;
|
|
12
|
-
|
|
13
|
-
const mockChunker: FileChunker = {
|
|
14
|
-
name: "test",
|
|
15
|
-
patterns: ["**/*.txt", "**/*.yaml", "**/*.json"],
|
|
16
|
-
chunk: vi.fn().mockResolvedValue([]),
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
beforeEach(() => {
|
|
20
|
-
testDir = mkdtempSync(join(tmpdir(), "git-test-"));
|
|
21
|
-
originalCwd = process.cwd();
|
|
22
|
-
process.chdir(testDir);
|
|
23
|
-
|
|
24
|
-
mkdirSync(join(testDir, "src", "events"), { recursive: true });
|
|
25
|
-
|
|
26
|
-
writeFileSync(join(testDir, "test.txt"), "test content");
|
|
27
|
-
writeFileSync(
|
|
28
|
-
join(testDir, "src", "events", "booking.yaml"),
|
|
29
|
-
"event_type: BookingCreated",
|
|
30
|
-
);
|
|
31
|
-
writeFileSync(join(testDir, "config.json"), '{"key": "value"}');
|
|
32
|
-
|
|
33
|
-
execSync("git init", { stdio: "ignore" });
|
|
34
|
-
execSync('git config user.email "test@example.com"', { stdio: "ignore" });
|
|
35
|
-
execSync('git config user.name "Test"', { stdio: "ignore" });
|
|
36
|
-
execSync("git add .", { stdio: "ignore" });
|
|
37
|
-
execSync('git commit -m "initial"', { stdio: "ignore" });
|
|
38
|
-
});
|
|
39
|
-
|
|
40
|
-
afterEach(() => {
|
|
41
|
-
process.chdir(originalCwd);
|
|
42
|
-
rmSync(testDir, { recursive: true, force: true });
|
|
43
|
-
});
|
|
44
|
-
|
|
45
|
-
it("should be instantiable", () => {
|
|
46
|
-
const tracker = new GitTracker([mockChunker]);
|
|
47
|
-
expect(tracker).toBeInstanceOf(GitTracker);
|
|
48
|
-
});
|
|
49
|
-
|
|
50
|
-
it("should getAllTrackedFiles", async () => {
|
|
51
|
-
const tracker = new GitTracker([mockChunker]);
|
|
52
|
-
const files = await tracker.getAllTrackedFiles();
|
|
53
|
-
|
|
54
|
-
expect(files.length).toBeGreaterThan(0);
|
|
55
|
-
expect(files.some((f) => f.includes("test.txt"))).toBe(true);
|
|
56
|
-
});
|
|
57
|
-
|
|
58
|
-
it("should getCurrentState", async () => {
|
|
59
|
-
const tracker = new GitTracker([mockChunker]);
|
|
60
|
-
const state = await tracker.getCurrentState();
|
|
61
|
-
|
|
62
|
-
expect(state.size).toBeGreaterThan(0);
|
|
63
|
-
});
|
|
64
|
-
});
|
package/src/core/git-tracker.ts
DELETED
|
@@ -1,202 +0,0 @@
|
|
|
1
|
-
import { simpleGit, SimpleGit } from "simple-git";
|
|
2
|
-
import { glob } from "glob";
|
|
3
|
-
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
-
import { minimatch } from "minimatch";
|
|
5
|
-
import path from "path";
|
|
6
|
-
|
|
7
|
-
const MAX_FILES_PER_BATCH = 100;
|
|
8
|
-
const MAX_CMD_LEN = 32000;
|
|
9
|
-
|
|
10
|
-
function batchFiles(files: string[]): string[][] {
|
|
11
|
-
const batches: string[][] = [];
|
|
12
|
-
let currentBatch: string[] = [];
|
|
13
|
-
let currentLen = 0;
|
|
14
|
-
const baseCmdLen = "git log -1 --format=%H --all -- ".length;
|
|
15
|
-
|
|
16
|
-
for (const file of files) {
|
|
17
|
-
const fileLen = file.length + 1;
|
|
18
|
-
|
|
19
|
-
if (
|
|
20
|
-
currentBatch.length >= MAX_FILES_PER_BATCH ||
|
|
21
|
-
currentLen + fileLen > MAX_CMD_LEN
|
|
22
|
-
) {
|
|
23
|
-
if (currentBatch.length > 0) {
|
|
24
|
-
batches.push(currentBatch);
|
|
25
|
-
currentBatch = [];
|
|
26
|
-
currentLen = baseCmdLen;
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
currentBatch.push(file);
|
|
31
|
-
currentLen += fileLen;
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
if (currentBatch.length > 0) {
|
|
35
|
-
batches.push(currentBatch);
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
return batches;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
export class GitTracker {
|
|
42
|
-
private git: SimpleGit;
|
|
43
|
-
private chunkers: FileChunker[];
|
|
44
|
-
private allPatterns: string[];
|
|
45
|
-
private currentHeadCache: string | null = null;
|
|
46
|
-
private uncommittedCache: boolean | null = null;
|
|
47
|
-
|
|
48
|
-
constructor(chunkers: FileChunker[]) {
|
|
49
|
-
this.git = simpleGit();
|
|
50
|
-
this.chunkers = chunkers;
|
|
51
|
-
this.allPatterns = chunkers.flatMap((c) => c.patterns);
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
private async getCurrentHead(): Promise<string> {
|
|
55
|
-
if (!this.currentHeadCache) {
|
|
56
|
-
try {
|
|
57
|
-
this.currentHeadCache = await this.git.revparse(["HEAD"]);
|
|
58
|
-
} catch {
|
|
59
|
-
this.currentHeadCache = "dev_0000000000000000000000000000000000000000";
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
return this.currentHeadCache;
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
private async hasUncommittedChanges(): Promise<boolean> {
|
|
66
|
-
if (this.uncommittedCache === null) {
|
|
67
|
-
try {
|
|
68
|
-
const status = await this.git.status();
|
|
69
|
-
this.uncommittedCache = status.files.length > 0;
|
|
70
|
-
} catch {
|
|
71
|
-
this.uncommittedCache = false;
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
return this.uncommittedCache;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
private getChunkerForFile(filePath: string): FileChunker | null {
|
|
78
|
-
for (const chunker of this.chunkers) {
|
|
79
|
-
for (const pattern of chunker.patterns) {
|
|
80
|
-
if (this.matchesPattern(filePath, pattern)) {
|
|
81
|
-
return chunker;
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
return null;
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
private matchesPattern(filePath: string, pattern: string): boolean {
|
|
89
|
-
const normalizedPath = filePath.split(path.sep).join("/");
|
|
90
|
-
const normalizedPattern = pattern.split(path.sep).join("/");
|
|
91
|
-
|
|
92
|
-
return minimatch(normalizedPath, normalizedPattern);
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
async getAllTrackedFiles(): Promise<string[]> {
|
|
96
|
-
const files = await glob(this.allPatterns, { nodir: true });
|
|
97
|
-
return [...new Set(files)].sort();
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
async getCommitHashes(files: string[]): Promise<Map<string, string>> {
|
|
101
|
-
const commitMap = new Map<string, string>();
|
|
102
|
-
const batches = batchFiles(files);
|
|
103
|
-
const currentHead = await this.getCurrentHead();
|
|
104
|
-
|
|
105
|
-
for (const batch of batches) {
|
|
106
|
-
try {
|
|
107
|
-
const output = await this.git.raw([
|
|
108
|
-
"log",
|
|
109
|
-
"-1",
|
|
110
|
-
"--format=%H",
|
|
111
|
-
"--all",
|
|
112
|
-
"--",
|
|
113
|
-
...batch,
|
|
114
|
-
]);
|
|
115
|
-
const lines = output.trim().split("\n");
|
|
116
|
-
|
|
117
|
-
for (let i = 0; i < lines.length && i < batch.length; i++) {
|
|
118
|
-
const hash = lines[i].trim();
|
|
119
|
-
if (hash) {
|
|
120
|
-
commitMap.set(batch[i], hash);
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
for (const file of batch) {
|
|
125
|
-
if (!commitMap.has(file)) {
|
|
126
|
-
commitMap.set(file, currentHead);
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
} catch {
|
|
130
|
-
for (const file of batch) {
|
|
131
|
-
commitMap.set(file, currentHead);
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
return commitMap;
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
async getCurrentState(): Promise<
|
|
140
|
-
Map<string, { commitHash: string; chunker: FileChunker }>
|
|
141
|
-
> {
|
|
142
|
-
const allFiles = await this.getAllTrackedFiles();
|
|
143
|
-
const commitMap = await this.getCommitHashes(allFiles);
|
|
144
|
-
const hasDirty = await this.hasUncommittedChanges();
|
|
145
|
-
const currentHead = await this.getCurrentHead();
|
|
146
|
-
|
|
147
|
-
const state = new Map<
|
|
148
|
-
string,
|
|
149
|
-
{ commitHash: string; chunker: FileChunker }
|
|
150
|
-
>();
|
|
151
|
-
|
|
152
|
-
for (const file of allFiles) {
|
|
153
|
-
let commitHash = commitMap.get(file) || currentHead;
|
|
154
|
-
if (hasDirty) {
|
|
155
|
-
commitHash = `${commitHash}-dirty`;
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
const chunker = this.getChunkerForFile(file);
|
|
159
|
-
if (chunker) {
|
|
160
|
-
state.set(file, { commitHash, chunker });
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
return state;
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
async getChangedFiles(previousState: Map<string, string>): Promise<{
|
|
168
|
-
toProcess: string[];
|
|
169
|
-
toDelete: string[];
|
|
170
|
-
unchanged: string[];
|
|
171
|
-
}> {
|
|
172
|
-
const current = await this.getCurrentState();
|
|
173
|
-
const toProcess: string[] = [];
|
|
174
|
-
const toDelete: string[] = [];
|
|
175
|
-
const unchanged: string[] = [];
|
|
176
|
-
|
|
177
|
-
for (const [filePath, info] of current) {
|
|
178
|
-
const prevHash = previousState.get(filePath);
|
|
179
|
-
|
|
180
|
-
if (!prevHash) {
|
|
181
|
-
console.log(` 🆕 New: ${filePath}`);
|
|
182
|
-
toProcess.push(filePath);
|
|
183
|
-
} else if (prevHash !== info.commitHash) {
|
|
184
|
-
console.log(
|
|
185
|
-
` 📝 Changed: ${filePath} (${prevHash.slice(0, 8)} → ${info.commitHash.slice(0, 8)})`,
|
|
186
|
-
);
|
|
187
|
-
toProcess.push(filePath);
|
|
188
|
-
} else {
|
|
189
|
-
unchanged.push(filePath);
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
for (const [filePath] of previousState) {
|
|
194
|
-
if (!current.has(filePath)) {
|
|
195
|
-
console.log(` 🗑️ Deleted: ${filePath}`);
|
|
196
|
-
toDelete.push(filePath);
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
return { toProcess, toDelete, unchanged };
|
|
201
|
-
}
|
|
202
|
-
}
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi } from "vitest";
|
|
2
|
-
import { Orchestrator, RAGPipelineConfig } from "./orchestrator.js";
|
|
3
|
-
import {
|
|
4
|
-
FileChunker,
|
|
5
|
-
EmbeddingProvider,
|
|
6
|
-
VectorStore,
|
|
7
|
-
} from "../interfaces/index.js";
|
|
8
|
-
|
|
9
|
-
describe("Orchestrator", () => {
|
|
10
|
-
const mockChunker: FileChunker = {
|
|
11
|
-
name: "test",
|
|
12
|
-
patterns: ["**/*.txt"],
|
|
13
|
-
chunk: vi.fn().mockResolvedValue([]),
|
|
14
|
-
};
|
|
15
|
-
|
|
16
|
-
const mockEmbedder: EmbeddingProvider = {
|
|
17
|
-
name: "mock",
|
|
18
|
-
dimensions: 384,
|
|
19
|
-
embed: vi.fn().mockResolvedValue(new Array(384).fill(0)),
|
|
20
|
-
};
|
|
21
|
-
|
|
22
|
-
const mockVectorStore: VectorStore = {
|
|
23
|
-
name: "mock",
|
|
24
|
-
initialize: vi.fn().mockResolvedValue(undefined),
|
|
25
|
-
upsert: vi.fn().mockResolvedValue(undefined),
|
|
26
|
-
deleteBySourceFile: vi.fn().mockResolvedValue(undefined),
|
|
27
|
-
getCurrentState: vi.fn().mockResolvedValue(new Map()),
|
|
28
|
-
search: vi.fn().mockResolvedValue([]),
|
|
29
|
-
};
|
|
30
|
-
|
|
31
|
-
const mockConfig: RAGPipelineConfig = {
|
|
32
|
-
chunkers: [mockChunker],
|
|
33
|
-
embedder: mockEmbedder,
|
|
34
|
-
vectorStore: mockVectorStore,
|
|
35
|
-
options: {
|
|
36
|
-
chunksFile: "./test-chunks.json",
|
|
37
|
-
embeddingsFile: "./test-embeddings.json",
|
|
38
|
-
force: false,
|
|
39
|
-
skipUpload: true,
|
|
40
|
-
},
|
|
41
|
-
};
|
|
42
|
-
|
|
43
|
-
it("should be instantiable", () => {
|
|
44
|
-
const orchestrator = new Orchestrator(mockConfig);
|
|
45
|
-
expect(orchestrator).toBeInstanceOf(Orchestrator);
|
|
46
|
-
});
|
|
47
|
-
|
|
48
|
-
it("should have run method", () => {
|
|
49
|
-
const orchestrator = new Orchestrator(mockConfig);
|
|
50
|
-
expect(orchestrator.run).toBeDefined();
|
|
51
|
-
expect(typeof orchestrator.run).toBe("function");
|
|
52
|
-
});
|
|
53
|
-
});
|