@vivantel/rag-core 0.1.0 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62):
  1. package/README.md +62 -62
  2. package/dist/config-loader.d.ts.map +1 -1
  3. package/dist/config-loader.js +0 -2
  4. package/dist/config-loader.js.map +1 -1
  5. package/dist/core/chunk-processor.d.ts.map +1 -1
  6. package/dist/core/chunk-processor.js +27 -20
  7. package/dist/core/chunk-processor.js.map +1 -1
  8. package/dist/core/embedder.d.ts.map +1 -1
  9. package/dist/core/embedder.js +10 -3
  10. package/dist/core/embedder.js.map +1 -1
  11. package/dist/core/git-tracker.d.ts.map +1 -1
  12. package/dist/core/git-tracker.js +9 -59
  13. package/dist/core/git-tracker.js.map +1 -1
  14. package/dist/core/orchestrator.d.ts.map +1 -1
  15. package/dist/core/orchestrator.js +22 -1
  16. package/dist/core/orchestrator.js.map +1 -1
  17. package/dist/core/uploader.d.ts.map +1 -1
  18. package/dist/core/uploader.js +13 -4
  19. package/dist/core/uploader.js.map +1 -1
  20. package/dist/strategies/chunk/token.js +1 -1
  21. package/dist/strategies/chunk/token.js.map +1 -1
  22. package/package.json +110 -102
  23. package/.github/config/release-please.json +0 -38
  24. package/.github/dependabot.yaml +0 -28
  25. package/.github/workflows/ci.yaml +0 -119
  26. package/.github/workflows/publish.yaml +0 -151
  27. package/.github/workflows/release.yaml +0 -150
  28. package/.versionrc.json +0 -19
  29. package/CHANGELOG.md +0 -21
  30. package/bin/rag-update.ts +0 -49
  31. package/eslint.config.js +0 -25
  32. package/src/config-loader.ts +0 -21
  33. package/src/core/chunk-processor.test.ts +0 -36
  34. package/src/core/chunk-processor.ts +0 -92
  35. package/src/core/embedder.ts +0 -189
  36. package/src/core/git-tracker.test.ts +0 -64
  37. package/src/core/git-tracker.ts +0 -202
  38. package/src/core/orchestrator.test.ts +0 -53
  39. package/src/core/orchestrator.ts +0 -97
  40. package/src/core/uploader.ts +0 -123
  41. package/src/core/utils.ts +0 -27
  42. package/src/helpers/create-chunker.test.ts +0 -31
  43. package/src/helpers/create-chunker.ts +0 -40
  44. package/src/index.test.ts +0 -33
  45. package/src/index.ts +0 -30
  46. package/src/interfaces/chunker.ts +0 -59
  47. package/src/interfaces/embedder.ts +0 -36
  48. package/src/interfaces/index.test.ts +0 -9
  49. package/src/interfaces/index.ts +0 -3
  50. package/src/interfaces/vector-store.ts +0 -71
  51. package/src/strategies/chunk/index.ts +0 -4
  52. package/src/strategies/chunk/markdown-headers.test.ts +0 -37
  53. package/src/strategies/chunk/markdown-headers.ts +0 -106
  54. package/src/strategies/chunk/semantic.test.ts +0 -21
  55. package/src/strategies/chunk/semantic.ts +0 -80
  56. package/src/strategies/chunk/token.test.ts +0 -41
  57. package/src/strategies/chunk/token.ts +0 -72
  58. package/src/strategies/chunk/whole-file.test.ts +0 -24
  59. package/src/strategies/chunk/whole-file.ts +0 -35
  60. package/tsconfig.json +0 -21
  61. package/typedoc.json +0 -11
  62. package/vitest.config.ts +0 -19
@@ -1,202 +0,0 @@
1
- import { simpleGit, SimpleGit } from "simple-git";
2
- import { glob } from "glob";
3
- import { FileChunker } from "../interfaces/index.js";
4
- import { minimatch } from "minimatch";
5
- import path from "path";
6
-
7
/** Upper bound on the number of paths placed in a single batch. */
const MAX_FILES_PER_BATCH = 100;
/** Upper bound on the estimated command length (in characters) of a single batch. */
const MAX_CMD_LEN = 32000;

/**
 * Splits a list of file paths into batches that respect both
 * MAX_FILES_PER_BATCH and MAX_CMD_LEN.
 *
 * The length estimate for a batch is the fixed `git log ...` prefix plus,
 * for every path, its length and one separator character. A single path that
 * alone exceeds MAX_CMD_LEN is still emitted, in a batch of its own.
 *
 * @param files - Paths to distribute; order is preserved.
 * @returns Non-empty batches covering every input path exactly once.
 */
function batchFiles(files: string[]): string[][] {
  const baseCmdLen = "git log -1 --format=%H --all -- ".length;
  const batches: string[][] = [];
  let currentBatch: string[] = [];
  // Every batch — including the first — pays for the fixed command prefix.
  let currentLen = baseCmdLen;

  for (const file of files) {
    const fileLen = file.length + 1; // +1 for the separating space
    const batchIsFull =
      currentBatch.length >= MAX_FILES_PER_BATCH ||
      currentLen + fileLen > MAX_CMD_LEN;

    // Never flush an empty batch: an oversized path must still go somewhere.
    if (batchIsFull && currentBatch.length > 0) {
      batches.push(currentBatch);
      currentBatch = [];
      currentLen = baseCmdLen;
    }

    currentBatch.push(file);
    currentLen += fileLen;
  }

  if (currentBatch.length > 0) {
    batches.push(currentBatch);
  }

  return batches;
}
40
-
41
/**
 * Discovers the files matched by the configured chunkers' glob patterns and
 * pairs each one with a git-derived version marker, so callers can tell which
 * files are new, changed, unchanged or deleted relative to a previous state.
 */
export class GitTracker {
  private git: SimpleGit;
  private chunkers: FileChunker[];
  private allPatterns: string[];
  private currentHeadCache: string | null = null;
  private uncommittedCache: boolean | null = null;

  /**
   * @param chunkers - Chunkers whose `patterns` define which files are tracked.
   */
  constructor(chunkers: FileChunker[]) {
    this.git = simpleGit();
    this.chunkers = chunkers;
    this.allPatterns = chunkers.flatMap((c) => c.patterns);
  }

  /**
   * Resolves HEAD once and caches it. When `git rev-parse HEAD` fails, a
   * fixed placeholder hash is cached instead.
   */
  private async getCurrentHead(): Promise<string> {
    if (!this.currentHeadCache) {
      try {
        this.currentHeadCache = await this.git.revparse(["HEAD"]);
      } catch {
        this.currentHeadCache = "dev_0000000000000000000000000000000000000000";
      }
    }
    return this.currentHeadCache;
  }

  /**
   * Reports (and caches) whether `git status` lists any file at all.
   * A failing status call is treated as "no uncommitted changes".
   */
  private async hasUncommittedChanges(): Promise<boolean> {
    if (this.uncommittedCache === null) {
      try {
        const status = await this.git.status();
        this.uncommittedCache = status.files.length > 0;
      } catch {
        this.uncommittedCache = false;
      }
    }
    return this.uncommittedCache;
  }

  /** Returns the first chunker having a pattern that matches `filePath`, or null. */
  private getChunkerForFile(filePath: string): FileChunker | null {
    const match = this.chunkers.find((chunker) =>
      chunker.patterns.some((pattern) => this.matchesPattern(filePath, pattern)),
    );
    return match ?? null;
  }

  /** Glob-matches after rewriting the platform path separator to "/" on both sides. */
  private matchesPattern(filePath: string, pattern: string): boolean {
    const normalizedPath = filePath.split(path.sep).join("/");
    const normalizedPattern = pattern.split(path.sep).join("/");

    return minimatch(normalizedPath, normalizedPattern);
  }

  /** Expands every chunker pattern and returns the de-duplicated, sorted file list. */
  async getAllTrackedFiles(): Promise<string[]> {
    const files = await glob(this.allPatterns, { nodir: true });
    return [...new Set(files)].sort();
  }

  /**
   * Looks up the most recent commit (across all refs) that touched `file`.
   *
   * @returns The commit hash, or null when git prints nothing or the call fails.
   */
  private async getLastCommitHash(file: string): Promise<string | null> {
    try {
      const output = await this.git.raw([
        "log",
        "-1",
        "--format=%H",
        "--all",
        "--",
        file,
      ]);
      const hash = output.trim();
      return hash.length > 0 ? hash : null;
    } catch {
      return null;
    }
  }

  /**
   * Maps every given file to the hash of the last commit that touched it.
   *
   * `git log -1 -- a b c` prints ONE hash (the newest commit touching any of
   * the paths), so the lookup is issued per file; passing several paths to a
   * single call cannot yield per-file results. Files without a resolvable
   * commit fall back to the current HEAD.
   *
   * @param files - Paths to resolve.
   * @returns A map containing an entry for every input path.
   */
  async getCommitHashes(files: string[]): Promise<Map<string, string>> {
    const commitMap = new Map<string, string>();
    const currentHead = await this.getCurrentHead();

    // Batches are kept only to bound how many lookups are in flight at once.
    for (const batch of batchFiles(files)) {
      const hashes = await Promise.all(
        batch.map((file) => this.getLastCommitHash(file)),
      );
      batch.forEach((file, index) => {
        commitMap.set(file, hashes[index] ?? currentHead);
      });
    }

    return commitMap;
  }

  /**
   * Builds the current state: every tracked file that has a matching chunker,
   * together with its commit hash and that chunker.
   *
   * NOTE(review): when `git status` reports any file at all, EVERY tracked
   * file's hash gets the `-dirty` suffix, not only the modified ones.
   */
  async getCurrentState(): Promise<
    Map<string, { commitHash: string; chunker: FileChunker }>
  > {
    const allFiles = await this.getAllTrackedFiles();
    const commitMap = await this.getCommitHashes(allFiles);
    const hasDirty = await this.hasUncommittedChanges();
    const currentHead = await this.getCurrentHead();

    const state = new Map<
      string,
      { commitHash: string; chunker: FileChunker }
    >();

    for (const file of allFiles) {
      const chunker = this.getChunkerForFile(file);
      if (!chunker) continue;

      const baseHash = commitMap.get(file) ?? currentHead;
      const commitHash = hasDirty ? `${baseHash}-dirty` : baseHash;
      state.set(file, { commitHash, chunker });
    }

    return state;
  }

  /**
   * Compares the current state against `previousState` (path → commit hash).
   *
   * @returns `toProcess` (new or changed hash), `toDelete` (present before,
   * absent now) and `unchanged` (same hash). Each new, changed and deleted
   * file is also logged to the console.
   */
  async getChangedFiles(previousState: Map<string, string>): Promise<{
    toProcess: string[];
    toDelete: string[];
    unchanged: string[];
  }> {
    const current = await this.getCurrentState();
    const toProcess: string[] = [];
    const toDelete: string[] = [];
    const unchanged: string[] = [];

    for (const [filePath, info] of current) {
      const prevHash = previousState.get(filePath);

      if (!prevHash) {
        console.log(` 🆕 New: ${filePath}`);
        toProcess.push(filePath);
      } else if (prevHash !== info.commitHash) {
        console.log(
          ` 📝 Changed: ${filePath} (${prevHash.slice(0, 8)} → ${info.commitHash.slice(0, 8)})`,
        );
        toProcess.push(filePath);
      } else {
        unchanged.push(filePath);
      }
    }

    for (const filePath of previousState.keys()) {
      if (!current.has(filePath)) {
        console.log(` 🗑️ Deleted: ${filePath}`);
        toDelete.push(filePath);
      }
    }

    return { toProcess, toDelete, unchanged };
  }
}
@@ -1,53 +0,0 @@
1
- import { describe, it, expect, vi } from "vitest";
2
- import { Orchestrator, RAGPipelineConfig } from "./orchestrator.js";
3
- import {
4
- FileChunker,
5
- EmbeddingProvider,
6
- VectorStore,
7
- } from "../interfaces/index.js";
8
-
9
// Smoke tests: the orchestrator can be constructed from a fully stubbed
// pipeline configuration and exposes a callable `run`.
describe("Orchestrator", () => {
  // Shorthand for a spy whose promise resolves to `value`.
  const resolving = <T>(value: T) => vi.fn().mockResolvedValue(value);

  const chunkerStub: FileChunker = {
    name: "test",
    patterns: ["**/*.txt"],
    chunk: resolving([]),
  };

  const embedderStub: EmbeddingProvider = {
    name: "mock",
    dimensions: 384,
    embed: resolving(new Array(384).fill(0)),
  };

  const storeStub: VectorStore = {
    name: "mock",
    initialize: resolving(undefined),
    upsert: resolving(undefined),
    deleteBySourceFile: resolving(undefined),
    getCurrentState: resolving(new Map()),
    search: resolving([]),
  };

  const pipelineConfig: RAGPipelineConfig = {
    chunkers: [chunkerStub],
    embedder: embedderStub,
    vectorStore: storeStub,
    options: {
      chunksFile: "./test-chunks.json",
      embeddingsFile: "./test-embeddings.json",
      force: false,
      skipUpload: true,
    },
  };

  const build = (): Orchestrator => new Orchestrator(pipelineConfig);

  it("should be instantiable", () => {
    expect(build()).toBeInstanceOf(Orchestrator);
  });

  it("should have run method", () => {
    const instance = build();
    expect(instance.run).toBeDefined();
    expect(typeof instance.run).toBe("function");
  });
});
@@ -1,97 +0,0 @@
1
- import { GitTracker } from "./git-tracker.js";
2
- import { ChunkProcessor } from "./chunk-processor.js";
3
- import { EmbedderProcessor } from "./embedder.js";
4
- import { Uploader } from "./uploader.js";
5
- import {
6
- FileChunker,
7
- EmbeddingProvider,
8
- VectorStore,
9
- } from "../interfaces/index.js";
10
-
11
/**
 * Everything the Orchestrator needs to run a pipeline: the chunkers, the
 * embedding provider, the vector store and optional tuning knobs.
 */
export interface RAGPipelineConfig {
  /** Chunkers handed to the git tracker and the chunk processor. */
  chunkers: FileChunker[];

  /** Provider wrapped by the embedding step. */
  embedder: EmbeddingProvider;

  /** Destination used by the upload step. */
  vectorStore: VectorStore;

  /** Optional settings; every field may be omitted. */
  options?: {
    /** Path of the chunks file; the Orchestrator falls back to "./docs/rag/chunks.json". */
    chunksFile?: string;

    /** Path of the embeddings file; the Orchestrator falls back to "./docs/rag/embeddings.json". */
    embeddingsFile?: string;

    /** Forwarded to the embedding and upload steps; also disables the early "no changes" exit. */
    force?: boolean;

    /** When true, the upload step is not executed. */
    skipUpload?: boolean;

    /** Forwarded unchanged to the embedding step. */
    rateLimitMs?: number;

    /** Forwarded unchanged to the embedding step. */
    batchSize?: number;
  };
}
24
-
25
/**
 * Runs the pipeline end to end: detect changed files, chunk them, embed the
 * chunks and (unless disabled) sync the result to the vector store.
 */
export class Orchestrator {
  private config: RAGPipelineConfig;
  private chunksFile: string;
  private embeddingsFile: string;

  /**
   * @param config - Pipeline configuration; missing file paths fall back to
   * "./docs/rag/chunks.json" and "./docs/rag/embeddings.json".
   */
  constructor(config: RAGPipelineConfig) {
    this.config = config;
    this.chunksFile = config.options?.chunksFile || "./docs/rag/chunks.json";
    this.embeddingsFile =
      config.options?.embeddingsFile || "./docs/rag/embeddings.json";
  }

  /**
   * Loads the path → commit-hash state that change detection compares against.
   *
   * With `force` or `skipUpload` the store is not contacted and an empty state
   * is returned, so every tracked file counts as new. Otherwise the state
   * recorded in the vector store is used, which is what makes runs incremental.
   */
  private async loadPreviousState(
    force: boolean,
    skipUpload: boolean,
  ): Promise<Map<string, string>> {
    if (force || skipUpload) {
      return new Map<string, string>();
    }

    const { vectorStore } = this.config;
    // NOTE(review): the upload step calls initialize() again later; this
    // assumes initialize() is safe to call more than once — confirm.
    await vectorStore.initialize();
    return vectorStore.getCurrentState();
  }

  /**
   * Executes all pipeline steps in order, logging progress to the console.
   * Returns early when nothing changed or when chunking produced nothing.
   */
  async run(): Promise<void> {
    const {
      force = false,
      skipUpload = false,
      rateLimitMs,
      batchSize,
    } = this.config.options ?? {};

    console.log("🚀 Starting RAG pipeline...\n");

    console.log("📂 Step 1: Scanning for changes...");
    const gitTracker = new GitTracker(this.config.chunkers);
    const currentState = await gitTracker.getCurrentState();

    // Comparing against an always-empty state would mark every file as new
    // on every run and would never report deletions.
    const previousState = await this.loadPreviousState(force, skipUpload);
    const { toProcess, toDelete } =
      await gitTracker.getChangedFiles(previousState);

    if (toProcess.length === 0 && toDelete.length === 0 && !force) {
      console.log("\n✨ No changes detected.");
      return;
    }

    console.log(
      `\n📊 Changes: ${toProcess.length} to process, ${toDelete.length} to delete\n`,
    );

    // toDelete is only ever non-empty when the previous state came from the
    // vector store, i.e. when uploading is enabled.
    if (toDelete.length > 0) {
      await this.config.vectorStore.deleteBySourceFile(toDelete);
      console.log(` 🗑️ Removed ${toDelete.length} deleted file(s) from vector store`);
    }

    if (toProcess.length === 0 && !force) {
      // Deletions were the only change; there is nothing to chunk or embed.
      console.log("\n✨ RAG pipeline complete!");
      return;
    }

    console.log("🔪 Step 2: Generating chunks...");
    const chunkProcessor = new ChunkProcessor(this.config.chunkers);

    const fileState: typeof currentState = new Map();
    for (const file of toProcess) {
      const info = currentState.get(file);
      if (info) fileState.set(file, info);
    }

    const chunks = await chunkProcessor.processFiles(toProcess, fileState);
    await chunkProcessor.saveChunksLocal(chunks, this.chunksFile);

    if (chunks.length === 0) {
      console.log("\n⚠️ No chunks generated. Exiting.");
      return;
    }

    console.log("\n🔢 Step 3: Generating embeddings...");
    const embedder = new EmbedderProcessor(this.config.embedder, {
      rateLimitMs,
      batchSize,
    });

    // NOTE(review): the embeddings path is not passed to the embedder here,
    // yet the uploader below reads this.embeddingsFile — confirm both agree
    // when a custom embeddingsFile is configured.
    await embedder.run(this.chunksFile, force);

    if (!skipUpload) {
      console.log("\n📤 Step 4: Uploading to vector store...");
      const uploader = new Uploader(this.config.vectorStore);
      await uploader.sync(this.embeddingsFile, force);
    }

    console.log("\n✨ RAG pipeline complete!");
  }
}
@@ -1,123 +0,0 @@
1
- import {
2
- VectorStore,
3
- VectorDocument,
4
- EmbeddedChunk,
5
- } from "../interfaces/index.js";
6
- import { readFile } from "fs/promises";
7
-
8
/**
 * Syncs an embeddings file to a vector store: decides which chunks must be
 * uploaded and which source files must be cleared first, then applies both.
 */
export class Uploader {
  private vectorStore: VectorStore;
  private batchSize: number;

  /**
   * @param vectorStore - Destination store.
   * @param batchSize - Number of documents per `upsert` call (default 50).
   * @throws RangeError when `batchSize` is not a positive integer.
   */
  constructor(vectorStore: VectorStore, batchSize: number = 50) {
    if (!Number.isInteger(batchSize) || batchSize <= 0) {
      throw new RangeError(
        `batchSize must be a positive integer, got ${batchSize}`,
      );
    }
    this.vectorStore = vectorStore;
    this.batchSize = batchSize;
  }

  /** Copies an embedded chunk into the document shape handed to the store. */
  private chunkToDocument(
    chunk: EmbeddedChunk,
    collection?: string,
  ): VectorDocument {
    return {
      content: chunk.content,
      metadata: chunk.metadata,
      embedding: chunk.embedding,
      sourceFile: chunk.sourceFile,
      commitHash: chunk.commitHash,
      // NOTE(review): assumes every embedded chunk carries a contentHash — confirm.
      contentHash: chunk.contentHash!,
      collection,
    };
  }

  /**
   * Reads and parses the embeddings file.
   *
   * @throws Error when the file cannot be read, is not valid JSON, or does
   * not contain a JSON array.
   */
  private async loadEmbeddings(
    embeddingsFile: string,
  ): Promise<EmbeddedChunk[]> {
    let raw: string;
    try {
      raw = await readFile(embeddingsFile, "utf-8");
    } catch {
      throw new Error(`Embeddings file not found: ${embeddingsFile}`);
    }

    // Parse separately so a corrupt file is not reported as a missing one.
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(
        `Embeddings file is not valid JSON: ${embeddingsFile} (${reason})`,
      );
    }

    if (!Array.isArray(parsed)) {
      throw new Error(
        `Embeddings file must contain a JSON array: ${embeddingsFile}`,
      );
    }

    return parsed as EmbeddedChunk[];
  }

  /**
   * Works out what has to change in the store.
   *
   * With `force`, every embedding is uploaded and every source file that
   * appears in the embeddings is scheduled for deletion first. Otherwise an
   * embedding is uploaded when its source file is unknown to the store or is
   * stored under a different commit hash; in the latter case the source file
   * is also scheduled for deletion.
   *
   * @param embeddingsFile - Path of the JSON file holding the embedded chunks.
   * @param force - Re-upload everything regardless of stored state.
   * @throws Error when the embeddings file cannot be loaded.
   */
  async getItemsToUpload(
    embeddingsFile: string,
    force: boolean = false,
  ): Promise<{
    toUpload: EmbeddedChunk[];
    toDelete: string[];
  }> {
    const embeddings = await this.loadEmbeddings(embeddingsFile);

    console.log(
      `📖 Loaded ${embeddings.length} embeddings from ${embeddingsFile}`,
    );

    if (force) {
      const allSourceFiles = [...new Set(embeddings.map((e) => e.sourceFile))];
      return { toUpload: embeddings, toDelete: allSourceFiles };
    }

    const existingState = await this.vectorStore.getCurrentState();
    const toUpload: EmbeddedChunk[] = [];
    const staleFiles = new Set<string>();

    for (const emb of embeddings) {
      const existingHash = existingState.get(emb.sourceFile);
      if (existingHash === emb.commitHash) continue; // already up to date

      if (existingHash) {
        staleFiles.add(emb.sourceFile);
      }
      toUpload.push(emb);
    }

    return { toUpload, toDelete: [...staleFiles] };
  }

  /**
   * Initializes the store, deletes stale source files, then upserts the
   * pending documents in batches, logging progress along the way.
   *
   * @param embeddingsFile - Path of the JSON file holding the embedded chunks.
   * @param force - Re-upload everything regardless of stored state.
   * @returns Counts of uploaded documents and deleted source files.
   */
  async sync(
    embeddingsFile: string,
    force: boolean = false,
  ): Promise<{
    uploaded: number;
    deleted: number;
  }> {
    console.log("📤 Starting incremental upload...");

    await this.vectorStore.initialize();

    const { toUpload, toDelete } = await this.getItemsToUpload(
      embeddingsFile,
      force,
    );

    console.log(`\n📊 Need to upload: ${toUpload.length} documents`);
    console.log(` Need to delete: ${toDelete.length} files`);

    if (toUpload.length === 0 && toDelete.length === 0) {
      console.log("\n✨ No changes detected.");
      return { uploaded: 0, deleted: 0 };
    }

    // Delete first so re-uploaded files do not leave old documents behind.
    if (toDelete.length > 0) {
      await this.vectorStore.deleteBySourceFile(toDelete);
      console.log(` 🗑️ Deleted ${toDelete.length} obsolete documents`);
    }

    if (toUpload.length > 0) {
      const documents = toUpload.map((e) => this.chunkToDocument(e));
      const totalBatches = Math.ceil(documents.length / this.batchSize);

      for (let i = 0; i < documents.length; i += this.batchSize) {
        const batch = documents.slice(i, i + this.batchSize);
        await this.vectorStore.upsert(batch);
        console.log(
          ` ✅ Uploaded batch ${Math.floor(i / this.batchSize) + 1}/${totalBatches}`,
        );
      }
    }

    console.log(`\n✨ Upload complete!`);
    console.log(` Uploaded: ${toUpload.length}`);
    console.log(` Deleted: ${toDelete.length}`);

    return { uploaded: toUpload.length, deleted: toDelete.length };
  }
}
package/src/core/utils.ts DELETED
@@ -1,27 +0,0 @@
1
- import { createHash } from "crypto";
2
-
3
/**
 * Fingerprints a string: the first 16 hex characters of its SHA-256 digest.
 *
 * @param content - Text to hash.
 * @returns A 16-character lowercase hex string.
 */
export function computeContentHash(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content);
  const fullDigest = hasher.digest("hex");
  return fullDigest.substring(0, 16);
}
6
-
7
/**
 * Returns a promise that resolves once a timer of `ms` milliseconds fires.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
10
-
11
/**
 * Splits an array into consecutive slices of at most `batchSize` elements.
 * The input is not modified; the final slice may be shorter.
 *
 * @param array - Items to split.
 * @param batchSize - Maximum slice length; must be a positive integer.
 * @returns The slices in input order (empty when `array` is empty).
 * @throws RangeError when `batchSize` is not a positive integer — a step of 0
 * would never advance, and a fractional step would produce overlapping slices.
 */
export function batchArray<T>(array: T[], batchSize: number): T[][] {
  if (!Number.isInteger(batchSize) || batchSize <= 0) {
    throw new RangeError(
      `batchSize must be a positive integer, got ${batchSize}`,
    );
  }

  const batches: T[][] = [];
  for (let start = 0; start < array.length; start += batchSize) {
    batches.push(array.slice(start, start + batchSize));
  }
  return batches;
}
18
-
19
/**
 * Returns the text after the last "/" of a path. When that text is empty
 * (no usable segment, e.g. a trailing slash), the whole input is returned.
 *
 * @param filePath - A "/"-separated path.
 */
export function extractFileName(filePath: string): string {
  const lastSlash = filePath.lastIndexOf("/");
  const tail = lastSlash === -1 ? filePath : filePath.slice(lastSlash + 1);
  if (tail.length === 0) {
    return filePath;
  }
  return tail;
}
22
-
23
/**
 * Returns everything before the last "/" of a path, or an empty string when
 * the path contains no "/".
 *
 * @param filePath - A "/"-separated path.
 */
export function extractDirectory(filePath: string): string {
  const lastSlash = filePath.lastIndexOf("/");
  return lastSlash === -1 ? "" : filePath.slice(0, lastSlash);
}
@@ -1,31 +0,0 @@
1
- import { describe, it, expect, vi } from "vitest";
2
- import { createChunker } from "./create-chunker.js";
3
-
4
// Checks that createChunker copies the identifying fields through and exposes
// the expected methods on the returned chunker.
describe("createChunker", () => {
  // A `process` stub that resolves to an empty chunk list.
  const emptyProcess = () => vi.fn().mockResolvedValue([]);

  it("should create a chunker with given options", () => {
    const built = createChunker({
      name: "test-chunker",
      patterns: ["**/*.txt"],
      process: emptyProcess(),
    });

    expect(built.name).toBe("test-chunker");
    expect(built.patterns).toEqual(["**/*.txt"]);
    expect(built.chunk).toBeDefined();
  });

  it("should have canProcess method when provided", () => {
    const built = createChunker({
      name: "test",
      patterns: ["**/*.txt"],
      process: emptyProcess(),
      canProcess: vi.fn().mockResolvedValue(true),
    });

    expect(built.canProcess).toBeDefined();
  });
});
@@ -1,40 +0,0 @@
1
- import { FileChunker, Chunk } from "../interfaces/index.js";
2
-
3
/** Ingredients from which `createChunker` assembles a FileChunker. */
export interface CreateChunkerOptions {
  /** Copied to the resulting chunker's `name`. */
  name: string;

  /** Copied to the resulting chunker's `patterns`. */
  patterns: string[];

  /**
   * Turns a file's text into chunks. Receives the UTF-8 content that
   * `createChunker` read from `filePath`, the path itself, and the commit
   * hash the chunker was invoked with.
   */
  process: (
    content: string,
    filePath: string,
    commitHash: string,
  ) => Promise<Chunk[]>;

  /**
   * Optional gate. When it resolves to false, the chunker returns no chunks
   * and `process` is not called.
   */
  canProcess?: (filePath: string, content?: string) => Promise<boolean>;
}
13
-
14
/**
 * Builds a FileChunker from plain callbacks.
 *
 * The returned `chunk` reads the file as UTF-8, consults the optional
 * `canProcess` gate (returning no chunks when it declines) and otherwise
 * delegates to `options.process`. The returned `canProcess` forwards to the
 * gate when one was supplied and resolves to true otherwise.
 *
 * @param options - Name, patterns and callbacks for the chunker.
 */
export function createChunker(options: CreateChunkerOptions): FileChunker {
  const canProcess = async (
    filePath: string,
    content?: string,
  ): Promise<boolean> =>
    options.canProcess ? options.canProcess(filePath, content) : true;

  const chunk = async (
    filePath: string,
    commitHash: string,
  ): Promise<Chunk[]> => {
    // Loaded on demand, exactly when a file actually has to be read.
    const fs = await import("fs/promises");
    const text = await fs.readFile(filePath, "utf-8");

    if (options.canProcess && !(await options.canProcess(filePath, text))) {
      return [];
    }

    return options.process(text, filePath, commitHash);
  };

  return {
    name: options.name,
    patterns: options.patterns,
    chunk,
    canProcess,
  };
}
package/src/index.test.ts DELETED
@@ -1,33 +0,0 @@
1
- import { describe, it, expect } from "vitest";
2
-
3
// Guards the package's public surface: every listed name must be exported
// from the entry module.
describe("@vivantel/rag-core", () => {
  it("should export all public interfaces", async () => {
    const api = await import("./index.js");

    const expectedExports = [
      // Core
      "GitTracker",
      "ChunkProcessor",
      "EmbedderProcessor",
      "Uploader",
      "Orchestrator",
      // Utils
      "computeContentHash",
      "sleep",
      "batchArray",
      "extractFileName",
      "extractDirectory",
      // Strategies
      "tokenStrategy",
      "markdownHeadersStrategy",
      "semanticStrategy",
      "wholeFileStrategy",
      // Helpers
      "createChunker",
      // Config loader
      "loadConfig",
    ] as const;

    for (const exportName of expectedExports) {
      expect(api[exportName]).toBeDefined();
    }
  });
});
package/src/index.ts DELETED
@@ -1,30 +0,0 @@
1
// Public entry point of the package: re-exports the interfaces, the core
// pipeline classes, utilities, chunking strategies, helpers and config loader.

// Interfaces
export * from "./interfaces/index.js";

// Core
export { GitTracker } from "./core/git-tracker.js";
export { ChunkProcessor } from "./core/chunk-processor.js";
export { EmbedderProcessor } from "./core/embedder.js";
export { Uploader } from "./core/uploader.js";
export { Orchestrator } from "./core/orchestrator.js";
// RAGPipelineConfig is an interface and has no runtime value, so it is
// re-exported as a type. A plain re-export breaks under `isolatedModules`
// and with per-file transpilers, which cannot tell it must be erased.
export type { RAGPipelineConfig } from "./core/orchestrator.js";
export {
  computeContentHash,
  sleep,
  batchArray,
  extractFileName,
  extractDirectory,
} from "./core/utils.js";

// Strategies
export {
  tokenStrategy,
  markdownHeadersStrategy,
  semanticStrategy,
  wholeFileStrategy,
} from "./strategies/chunk/index.js";

// Helpers
export { createChunker } from "./helpers/create-chunker.js";

// Config loader
export { loadConfig } from "./config-loader.js";
@@ -1,59 +0,0 @@
1
- /**
2
- * Chunk interfaces - core building blocks for document processing
3
- */
4
-
5
/** One piece of a source document, plus the bookkeeping that travels with it. */
export interface Chunk {
  /** Text of this chunk. */
  content: string;

  /** Free-form descriptive data attached to the chunk (source file, type, etc.). */
  metadata: Record<string, unknown>;

  /** Path of the file the chunk was taken from. */
  sourceFile: string;

  /** Git commit hash recorded when the chunk was produced. */
  commitHash: string;

  /** Hash of the content, usable for change detection; may be absent. */
  contentHash?: string;
}
21
-
22
/** Contract for a component that turns files matching its patterns into chunks. */
export interface FileChunker {
  /** Name identifying this chunker; expected to be unique. */
  name: string;

  /** Glob patterns selecting the files this chunker is responsible for. */
  patterns: string[];

  /**
   * Produces the chunks for one file.
   * An empty array signals that the file should be skipped.
   */
  chunk(filePath: string, commitHash: string): Promise<Chunk[]>;

  /**
   * Optional early filter: reports whether this chunker is able to handle
   * the file. Intended to be consulted before chunk().
   */
  canProcess?(filePath: string, content?: string): Promise<boolean>;
}
41
-
42
/** Contract for an algorithm that splits raw text into chunks. */
export interface ChunkStrategy {
  /** Name of the strategy. */
  name: string;

  /** Splits `text` into chunks following this strategy's rules. */
  chunk(text: string, filePath?: string): Promise<Chunk[]>;

  /** Optional: derives metadata from the text without performing a full split. */
  extractMetadata?(text: string, filePath?: string): Record<string, unknown>;
}
52
-
53
/** Contract for a step that rewrites or filters individual chunks. */
export interface ChunkTransformer {
  /** Name of the transformer. */
  name: string;

  /** Returns the transformed chunk, or null when the chunk should be skipped. */
  transform(chunk: Chunk): Promise<Chunk | null>;
}