@vivantel/rag-core 0.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -62
- package/dist/config-loader.d.ts.map +1 -1
- package/dist/config-loader.js +0 -2
- package/dist/config-loader.js.map +1 -1
- package/dist/core/chunk-processor.d.ts.map +1 -1
- package/dist/core/chunk-processor.js +27 -20
- package/dist/core/chunk-processor.js.map +1 -1
- package/dist/core/embedder.d.ts.map +1 -1
- package/dist/core/embedder.js +10 -3
- package/dist/core/embedder.js.map +1 -1
- package/dist/core/git-tracker.d.ts.map +1 -1
- package/dist/core/git-tracker.js +9 -59
- package/dist/core/git-tracker.js.map +1 -1
- package/dist/core/orchestrator.d.ts.map +1 -1
- package/dist/core/orchestrator.js +22 -1
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/uploader.d.ts.map +1 -1
- package/dist/core/uploader.js +13 -4
- package/dist/core/uploader.js.map +1 -1
- package/dist/strategies/chunk/token.js +1 -1
- package/dist/strategies/chunk/token.js.map +1 -1
- package/package.json +110 -102
- package/.github/config/release-please.json +0 -38
- package/.github/dependabot.yaml +0 -28
- package/.github/workflows/ci.yaml +0 -119
- package/.github/workflows/publish.yaml +0 -151
- package/.github/workflows/release.yaml +0 -150
- package/.versionrc.json +0 -19
- package/CHANGELOG.md +0 -21
- package/bin/rag-update.ts +0 -49
- package/eslint.config.js +0 -25
- package/src/config-loader.ts +0 -21
- package/src/core/chunk-processor.test.ts +0 -36
- package/src/core/chunk-processor.ts +0 -92
- package/src/core/embedder.ts +0 -189
- package/src/core/git-tracker.test.ts +0 -64
- package/src/core/git-tracker.ts +0 -202
- package/src/core/orchestrator.test.ts +0 -53
- package/src/core/orchestrator.ts +0 -97
- package/src/core/uploader.ts +0 -123
- package/src/core/utils.ts +0 -27
- package/src/helpers/create-chunker.test.ts +0 -31
- package/src/helpers/create-chunker.ts +0 -40
- package/src/index.test.ts +0 -33
- package/src/index.ts +0 -30
- package/src/interfaces/chunker.ts +0 -59
- package/src/interfaces/embedder.ts +0 -36
- package/src/interfaces/index.test.ts +0 -9
- package/src/interfaces/index.ts +0 -3
- package/src/interfaces/vector-store.ts +0 -71
- package/src/strategies/chunk/index.ts +0 -4
- package/src/strategies/chunk/markdown-headers.test.ts +0 -37
- package/src/strategies/chunk/markdown-headers.ts +0 -106
- package/src/strategies/chunk/semantic.test.ts +0 -21
- package/src/strategies/chunk/semantic.ts +0 -80
- package/src/strategies/chunk/token.test.ts +0 -41
- package/src/strategies/chunk/token.ts +0 -72
- package/src/strategies/chunk/whole-file.test.ts +0 -24
- package/src/strategies/chunk/whole-file.ts +0 -35
- package/tsconfig.json +0 -21
- package/typedoc.json +0 -11
- package/vitest.config.ts +0 -19
package/.github/workflows/release.yaml
DELETED
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
name: Release
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
branches:
|
|
6
|
-
- master
|
|
7
|
-
|
|
8
|
-
permissions:
|
|
9
|
-
contents: write
|
|
10
|
-
pull-requests: write
|
|
11
|
-
issues: write
|
|
12
|
-
packages: write
|
|
13
|
-
|
|
14
|
-
jobs:
|
|
15
|
-
release-please:
|
|
16
|
-
name: Release Please
|
|
17
|
-
runs-on: ubuntu-latest
|
|
18
|
-
|
|
19
|
-
outputs:
|
|
20
|
-
release_created: ${{ steps.release.outputs.release_created }}
|
|
21
|
-
tag_name: ${{ steps.release.outputs.tag_name }}
|
|
22
|
-
version: ${{ steps.release.outputs.version }}
|
|
23
|
-
|
|
24
|
-
steps:
|
|
25
|
-
- name: Run Release Please
|
|
26
|
-
id: release
|
|
27
|
-
uses: googleapis/release-please-action@v5
|
|
28
|
-
with:
|
|
29
|
-
token: ${{ secrets.GITHUB_TOKEN }}
|
|
30
|
-
release-type: node
|
|
31
|
-
package-name: '@vivantel/rag-core'
|
|
32
|
-
config-file: .github/config/release-please.json
|
|
33
|
-
manifest-file: .release-please-manifest.json
|
|
34
|
-
|
|
35
|
-
publish-npm:
|
|
36
|
-
name: Publish to npm
|
|
37
|
-
needs: release-please
|
|
38
|
-
if: ${{ needs.release-please.outputs.release_created == 'true' }}
|
|
39
|
-
runs-on: ubuntu-latest
|
|
40
|
-
|
|
41
|
-
permissions:
|
|
42
|
-
contents: read
|
|
43
|
-
id-token: write # для npm provenance
|
|
44
|
-
|
|
45
|
-
steps:
|
|
46
|
-
- name: Checkout code
|
|
47
|
-
uses: actions/checkout@v6
|
|
48
|
-
with:
|
|
49
|
-
ref: ${{ needs.release-please.outputs.tag_name }}
|
|
50
|
-
|
|
51
|
-
- name: Setup Node.js
|
|
52
|
-
uses: actions/setup-node@v6
|
|
53
|
-
with:
|
|
54
|
-
node-version: '20.x'
|
|
55
|
-
registry-url: 'https://registry.npmjs.org'
|
|
56
|
-
cache: 'npm'
|
|
57
|
-
|
|
58
|
-
- name: Install dependencies
|
|
59
|
-
run: npm ci
|
|
60
|
-
|
|
61
|
-
- name: Build
|
|
62
|
-
run: npm run build
|
|
63
|
-
|
|
64
|
-
- name: Run tests
|
|
65
|
-
run: npm test
|
|
66
|
-
|
|
67
|
-
- name: Check package size
|
|
68
|
-
run: npx package-size
|
|
69
|
-
|
|
70
|
-
- name: Publish to npm
|
|
71
|
-
run: npm publish --provenance --access public
|
|
72
|
-
env:
|
|
73
|
-
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
|
|
74
|
-
|
|
75
|
-
- name: Create GitHub Release
|
|
76
|
-
uses: softprops/action-gh-release@v1
|
|
77
|
-
with:
|
|
78
|
-
tag_name: ${{ needs.release-please.outputs.tag_name }}
|
|
79
|
-
name: Release ${{ needs.release-please.outputs.version }}
|
|
80
|
-
body_path: CHANGELOG.md
|
|
81
|
-
draft: false
|
|
82
|
-
prerelease: false
|
|
83
|
-
env:
|
|
84
|
-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
85
|
-
|
|
86
|
-
create-github-package:
|
|
87
|
-
name: Publish to GitHub Packages
|
|
88
|
-
needs: release-please
|
|
89
|
-
if: ${{ needs.release-please.outputs.release_created == 'true' }}
|
|
90
|
-
runs-on: ubuntu-latest
|
|
91
|
-
|
|
92
|
-
permissions:
|
|
93
|
-
contents: read
|
|
94
|
-
packages: write
|
|
95
|
-
|
|
96
|
-
steps:
|
|
97
|
-
- name: Checkout code
|
|
98
|
-
uses: actions/checkout@v6
|
|
99
|
-
with:
|
|
100
|
-
ref: ${{ needs.release-please.outputs.tag_name }}
|
|
101
|
-
|
|
102
|
-
- name: Setup Node.js
|
|
103
|
-
uses: actions/setup-node@v6
|
|
104
|
-
with:
|
|
105
|
-
node-version: '20.x'
|
|
106
|
-
registry-url: 'https://npm.pkg.github.com'
|
|
107
|
-
cache: 'npm'
|
|
108
|
-
|
|
109
|
-
- name: Install dependencies
|
|
110
|
-
run: npm ci
|
|
111
|
-
|
|
112
|
-
- name: Build
|
|
113
|
-
run: npm run build
|
|
114
|
-
|
|
115
|
-
- name: Publish to GitHub Packages
|
|
116
|
-
run: npm publish
|
|
117
|
-
env:
|
|
118
|
-
NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
119
|
-
|
|
120
|
-
update-docs:
|
|
121
|
-
name: Update Documentation
|
|
122
|
-
needs: release-please
|
|
123
|
-
if: ${{ needs.release-please.outputs.release_created == 'true' }}
|
|
124
|
-
runs-on: ubuntu-latest
|
|
125
|
-
|
|
126
|
-
steps:
|
|
127
|
-
- name: Checkout code
|
|
128
|
-
uses: actions/checkout@v6
|
|
129
|
-
with:
|
|
130
|
-
ref: ${{ needs.release-please.outputs.tag_name }}
|
|
131
|
-
|
|
132
|
-
- name: Setup Node.js
|
|
133
|
-
uses: actions/setup-node@v6
|
|
134
|
-
with:
|
|
135
|
-
node-version: '20.x'
|
|
136
|
-
cache: 'npm'
|
|
137
|
-
|
|
138
|
-
- name: Install dependencies
|
|
139
|
-
run: npm ci
|
|
140
|
-
|
|
141
|
-
- name: Generate API documentation
|
|
142
|
-
run: npm run docs:generate
|
|
143
|
-
|
|
144
|
-
- name: Deploy to GitHub Pages
|
|
145
|
-
uses: peaceiris/actions-gh-pages@v4
|
|
146
|
-
with:
|
|
147
|
-
github_token: ${{ secrets.GITHUB_TOKEN }}
|
|
148
|
-
publish_dir: ./docs/api
|
|
149
|
-
destination_dir: ${{ needs.release-please.outputs.version }}
|
|
150
|
-
keep_files: false
|
package/.versionrc.json
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"types": [
|
|
3
|
-
{"type": "feat", "section": "Features"},
|
|
4
|
-
{"type": "fix", "section": "Bug Fixes"},
|
|
5
|
-
{"type": "perf", "section": "Performance Improvements"},
|
|
6
|
-
{"type": "refactor", "section": "Code Refactoring"},
|
|
7
|
-
{"type": "docs", "section": "Documentation"},
|
|
8
|
-
{"type": "style", "hidden": true},
|
|
9
|
-
{"type": "chore", "hidden": true},
|
|
10
|
-
{"type": "test", "hidden": true}
|
|
11
|
-
],
|
|
12
|
-
"releaseCommitMessageFormat": "chore: release {{currentTag}}",
|
|
13
|
-
"skip": {
|
|
14
|
-
"bump": false,
|
|
15
|
-
"changelog": false,
|
|
16
|
-
"commit": false,
|
|
17
|
-
"tag": false
|
|
18
|
-
}
|
|
19
|
-
}
|
package/CHANGELOG.md
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
All notable changes to this project will be documented in this file.
|
|
4
|
-
|
|
5
|
-
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
-
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
-
|
|
8
|
-
## [Unreleased]
|
|
9
|
-
|
|
10
|
-
### Added
|
|
11
|
-
- Initial release
|
|
12
|
-
|
|
13
|
-
### Changed
|
|
14
|
-
|
|
15
|
-
### Deprecated
|
|
16
|
-
|
|
17
|
-
### Removed
|
|
18
|
-
|
|
19
|
-
### Fixed
|
|
20
|
-
|
|
21
|
-
### Security
|
package/bin/rag-update.ts
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
import { Command } from 'commander';
|
|
4
|
-
import { config } from 'dotenv';
|
|
5
|
-
import { loadConfig } from '../src/config-loader.js';
|
|
6
|
-
import { Orchestrator } from '../src/core/orchestrator.js';
|
|
7
|
-
|
|
8
|
-
config();
|
|
9
|
-
|
|
10
|
-
const program = new Command();
|
|
11
|
-
|
|
12
|
-
program
|
|
13
|
-
.name('rag-update')
|
|
14
|
-
.description('Update RAG index with latest changes')
|
|
15
|
-
.version('1.0.0')
|
|
16
|
-
.option('-c, --config <path>', 'Path to config file', './rag.config.ts')
|
|
17
|
-
.option('-f, --force', 'Force full rebuild', false)
|
|
18
|
-
.option('--skip-upload', 'Skip upload to vector store', false)
|
|
19
|
-
.option('--chunks-file <path>', 'Output path for chunks.json')
|
|
20
|
-
.option('--embeddings-file <path>', 'Output path for embeddings.json')
|
|
21
|
-
.parse();
|
|
22
|
-
|
|
23
|
-
async function main() {
|
|
24
|
-
const options = program.opts();
|
|
25
|
-
|
|
26
|
-
console.log('š RAG Update Tool\n');
|
|
27
|
-
|
|
28
|
-
try {
|
|
29
|
-
const config = await loadConfig(options.config);
|
|
30
|
-
|
|
31
|
-
const orchestrator = new Orchestrator({
|
|
32
|
-
...config,
|
|
33
|
-
options: {
|
|
34
|
-
...config.options,
|
|
35
|
-
force: options.force || config.options?.force,
|
|
36
|
-
skipUpload: options.skipUpload || config.options?.skipUpload,
|
|
37
|
-
chunksFile: options.chunksFile || config.options?.chunksFile,
|
|
38
|
-
embeddingsFile: options.embeddingsFile || config.options?.embeddingsFile
|
|
39
|
-
}
|
|
40
|
-
});
|
|
41
|
-
|
|
42
|
-
await orchestrator.run();
|
|
43
|
-
} catch (error) {
|
|
44
|
-
console.error('❌ Error:', error instanceof Error ? error.message : error);
|
|
45
|
-
process.exit(1);
|
|
46
|
-
}
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
main();
|
package/eslint.config.js
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import js from '@eslint/js';
|
|
2
|
-
import tseslint from 'typescript-eslint';
|
|
3
|
-
|
|
4
|
-
export default tseslint.config(
|
|
5
|
-
js.configs.recommended,
|
|
6
|
-
...tseslint.configs.recommended,
|
|
7
|
-
{
|
|
8
|
-
ignores: [
|
|
9
|
-
'dist/**',
|
|
10
|
-
'node_modules/**',
|
|
11
|
-
'coverage/**',
|
|
12
|
-
'*.config.js',
|
|
13
|
-
'*.config.ts'
|
|
14
|
-
]
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
files: ['src/**/*.ts'],
|
|
18
|
-
rules: {
|
|
19
|
-
'@typescript-eslint/no-explicit-any': 'warn',
|
|
20
|
-
'@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
|
|
21
|
-
'no-console': 'off',
|
|
22
|
-
'no-undef': 'off'
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
|
-
);
|
package/src/config-loader.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import { pathToFileURL } from "url";
|
|
2
|
-
import { RAGPipelineConfig } from "./core/orchestrator.js";
|
|
3
|
-
|
|
4
|
-
export async function loadConfig(
|
|
5
|
-
configPath: string,
|
|
6
|
-
): Promise<RAGPipelineConfig> {
|
|
7
|
-
// Clear cache for hot reload
|
|
8
|
-
delete require.cache[require.resolve(configPath)];
|
|
9
|
-
|
|
10
|
-
const configUrl = pathToFileURL(configPath).href;
|
|
11
|
-
const configModule = await import(configUrl);
|
|
12
|
-
const config = configModule.default;
|
|
13
|
-
|
|
14
|
-
if (!config.chunkers || !config.embedder || !config.vectorStore) {
|
|
15
|
-
throw new Error(
|
|
16
|
-
"Invalid config: missing chunkers, embedder, or vectorStore",
|
|
17
|
-
);
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
return config;
|
|
21
|
-
}
|
package/src/core/chunk-processor.test.ts
DELETED
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi } from "vitest";
|
|
2
|
-
import { ChunkProcessor } from "./chunk-processor.js";
|
|
3
|
-
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
-
|
|
5
|
-
describe("ChunkProcessor", () => {
|
|
6
|
-
const mockChunker: FileChunker = {
|
|
7
|
-
name: "test",
|
|
8
|
-
patterns: ["**/*.txt"],
|
|
9
|
-
chunk: vi.fn().mockResolvedValue([
|
|
10
|
-
{
|
|
11
|
-
content: "test content",
|
|
12
|
-
metadata: { type: "test" },
|
|
13
|
-
sourceFile: "test.txt",
|
|
14
|
-
commitHash: "abc123",
|
|
15
|
-
contentHash: "hash123",
|
|
16
|
-
},
|
|
17
|
-
]),
|
|
18
|
-
};
|
|
19
|
-
|
|
20
|
-
it("should be instantiable", () => {
|
|
21
|
-
const processor = new ChunkProcessor([mockChunker]);
|
|
22
|
-
expect(processor).toBeInstanceOf(ChunkProcessor);
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
it("should have processFile method", () => {
|
|
26
|
-
const processor = new ChunkProcessor([mockChunker]);
|
|
27
|
-
expect(processor.processFile).toBeDefined();
|
|
28
|
-
expect(typeof processor.processFile).toBe("function");
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
it("should have processFiles method", () => {
|
|
32
|
-
const processor = new ChunkProcessor([mockChunker]);
|
|
33
|
-
expect(processor.processFiles).toBeDefined();
|
|
34
|
-
expect(typeof processor.processFiles).toBe("function");
|
|
35
|
-
});
|
|
36
|
-
});
|
package/src/core/chunk-processor.ts
DELETED
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
import { Chunk, FileChunker } from "../interfaces/index.js";
|
|
2
|
-
import { createHash } from "crypto";
|
|
3
|
-
|
|
4
|
-
function computeContentHash(content: string): string {
|
|
5
|
-
return createHash("sha256").update(content).digest("hex").slice(0, 16);
|
|
6
|
-
}
|
|
7
|
-
|
|
8
|
-
export class ChunkProcessor {
|
|
9
|
-
private chunkers: Map<string, FileChunker>;
|
|
10
|
-
|
|
11
|
-
constructor(chunkers: FileChunker[]) {
|
|
12
|
-
this.chunkers = new Map(chunkers.map((c) => [c.name, c]));
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
async processFile(
|
|
16
|
-
filePath: string,
|
|
17
|
-
commitHash: string,
|
|
18
|
-
chunker: FileChunker,
|
|
19
|
-
): Promise<Chunk[]> {
|
|
20
|
-
try {
|
|
21
|
-
const chunks = await chunker.chunk(filePath, commitHash);
|
|
22
|
-
|
|
23
|
-
for (const chunk of chunks) {
|
|
24
|
-
chunk.contentHash = computeContentHash(chunk.content);
|
|
25
|
-
chunk.sourceFile = filePath;
|
|
26
|
-
chunk.commitHash = commitHash;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
return chunks;
|
|
30
|
-
} catch (error) {
|
|
31
|
-
console.error(` ❌ Error processing ${filePath}: ${error}`);
|
|
32
|
-
return [];
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
async processFiles(
|
|
37
|
-
files: string[],
|
|
38
|
-
fileState: Map<string, { commitHash: string; chunker: FileChunker }>,
|
|
39
|
-
): Promise<Chunk[]> {
|
|
40
|
-
const allChunks: Chunk[] = [];
|
|
41
|
-
|
|
42
|
-
for (let i = 0; i < files.length; i++) {
|
|
43
|
-
const filePath = files[i];
|
|
44
|
-
const info = fileState.get(filePath);
|
|
45
|
-
|
|
46
|
-
if (!info) {
|
|
47
|
-
console.log(` ⚠️ No chunker for: ${filePath}`);
|
|
48
|
-
continue;
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
console.log(` [${i + 1}/${files.length}] ${filePath}`);
|
|
52
|
-
|
|
53
|
-
const chunks = await this.processFile(
|
|
54
|
-
filePath,
|
|
55
|
-
info.commitHash,
|
|
56
|
-
info.chunker,
|
|
57
|
-
);
|
|
58
|
-
|
|
59
|
-
if (chunks.length > 0) {
|
|
60
|
-
allChunks.push(...chunks);
|
|
61
|
-
console.log(` ✅ Generated ${chunks.length} chunk(s)`);
|
|
62
|
-
} else {
|
|
63
|
-
console.log(` ⚠️ No chunks generated (skipped)`);
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
return allChunks;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
async saveChunksLocal(chunks: Chunk[], outputFile: string): Promise<void> {
|
|
71
|
-
const { dirname } = await import("path");
|
|
72
|
-
const { mkdir, writeFile, readFile } = await import("fs/promises");
|
|
73
|
-
|
|
74
|
-
await mkdir(dirname(outputFile), { recursive: true });
|
|
75
|
-
|
|
76
|
-
let existing: Chunk[] = [];
|
|
77
|
-
try {
|
|
78
|
-
const content = await readFile(outputFile, "utf-8");
|
|
79
|
-
existing = JSON.parse(content);
|
|
80
|
-
} catch {
|
|
81
|
-
// File doesn't exist
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
const processedFiles = new Set(chunks.map((c) => c.sourceFile));
|
|
85
|
-
const filtered = existing.filter((c) => !processedFiles.has(c.sourceFile));
|
|
86
|
-
|
|
87
|
-
const allChunks = [...filtered, ...chunks];
|
|
88
|
-
|
|
89
|
-
await writeFile(outputFile, JSON.stringify(allChunks, null, 2));
|
|
90
|
-
console.log(`\n💾 Saved ${allChunks.length} chunks to ${outputFile}`);
|
|
91
|
-
}
|
|
92
|
-
}
|
package/src/core/embedder.ts
DELETED
|
@@ -1,189 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
EmbeddingProvider,
|
|
3
|
-
EmbeddedChunk,
|
|
4
|
-
Chunk,
|
|
5
|
-
} from "../interfaces/index.js";
|
|
6
|
-
import { readFile, writeFile, mkdir } from "fs/promises";
|
|
7
|
-
import { dirname } from "path";
|
|
8
|
-
import { createHash } from "crypto";
|
|
9
|
-
|
|
10
|
-
function chunkContentHash(chunk: Chunk): string {
|
|
11
|
-
if (chunk.contentHash) return chunk.contentHash;
|
|
12
|
-
return createHash("sha256").update(chunk.content).digest("hex").slice(0, 16);
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
export class EmbedderProcessor {
|
|
16
|
-
private provider: EmbeddingProvider;
|
|
17
|
-
private rateLimitMs: number;
|
|
18
|
-
private batchSize: number;
|
|
19
|
-
|
|
20
|
-
constructor(
|
|
21
|
-
provider: EmbeddingProvider,
|
|
22
|
-
options: { rateLimitMs?: number; batchSize?: number } = {},
|
|
23
|
-
) {
|
|
24
|
-
this.provider = provider;
|
|
25
|
-
this.rateLimitMs = options.rateLimitMs ?? 500;
|
|
26
|
-
this.batchSize = options.batchSize ?? 10;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
private async sleep(ms: number): Promise<void> {
|
|
30
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
async embedChunk(chunk: Chunk): Promise<EmbeddedChunk> {
|
|
34
|
-
const embedding = await this.provider.embed(chunk.content);
|
|
35
|
-
|
|
36
|
-
return {
|
|
37
|
-
...chunk,
|
|
38
|
-
embedding,
|
|
39
|
-
embeddedAt: Date.now() / 1000,
|
|
40
|
-
};
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
async embedBatch(chunks: Chunk[]): Promise<EmbeddedChunk[]> {
|
|
44
|
-
const results: EmbeddedChunk[] = [];
|
|
45
|
-
|
|
46
|
-
if (this.provider.embedBatch && chunks.length >= this.batchSize) {
|
|
47
|
-
const texts = chunks.map((c) => c.content);
|
|
48
|
-
const embeddings = await this.provider.embedBatch(texts);
|
|
49
|
-
|
|
50
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
51
|
-
results.push({
|
|
52
|
-
...chunks[i],
|
|
53
|
-
embedding: embeddings[i],
|
|
54
|
-
embeddedAt: Date.now() / 1000,
|
|
55
|
-
});
|
|
56
|
-
}
|
|
57
|
-
} else {
|
|
58
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
59
|
-
const chunk = chunks[i];
|
|
60
|
-
const eventType =
|
|
61
|
-
(chunk.metadata.event_type as string) ||
|
|
62
|
-
(chunk.metadata.title as string) ||
|
|
63
|
-
chunk.sourceFile.split("/").pop() ||
|
|
64
|
-
"unknown";
|
|
65
|
-
|
|
66
|
-
console.log(` [${i + 1}/${chunks.length}] ${eventType}`);
|
|
67
|
-
|
|
68
|
-
const embedded = await this.embedChunk(chunk);
|
|
69
|
-
results.push(embedded);
|
|
70
|
-
|
|
71
|
-
if (this.rateLimitMs > 0 && i < chunks.length - 1) {
|
|
72
|
-
await this.sleep(this.rateLimitMs);
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
return results;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
async getChunksToEmbed(
|
|
81
|
-
chunksFile: string,
|
|
82
|
-
force: boolean = false,
|
|
83
|
-
): Promise<{
|
|
84
|
-
chunksToEmbed: Chunk[];
|
|
85
|
-
}> {
|
|
86
|
-
let chunks: Chunk[];
|
|
87
|
-
try {
|
|
88
|
-
const content = await readFile(chunksFile, "utf-8");
|
|
89
|
-
chunks = JSON.parse(content);
|
|
90
|
-
} catch {
|
|
91
|
-
throw new Error(`Chunks file not found: ${chunksFile}`);
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
console.log(`š Loaded ${chunks.length} chunks from ${chunksFile}`);
|
|
95
|
-
|
|
96
|
-
if (force) {
|
|
97
|
-
console.log(" ⚠️ Force mode: embedding all chunks");
|
|
98
|
-
return { chunksToEmbed: chunks };
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
let existingEmbeddings: EmbeddedChunk[] = [];
|
|
102
|
-
const embeddingsFile = chunksFile.replace("chunks", "embeddings");
|
|
103
|
-
try {
|
|
104
|
-
const content = await readFile(embeddingsFile, "utf-8");
|
|
105
|
-
existingEmbeddings = JSON.parse(content);
|
|
106
|
-
} catch {
|
|
107
|
-
// No existing embeddings
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
const existingState = new Map<string, EmbeddedChunk>();
|
|
111
|
-
for (const emb of existingEmbeddings) {
|
|
112
|
-
const hash = emb.contentHash || chunkContentHash(emb);
|
|
113
|
-
existingState.set(hash, emb);
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
console.log(`š Existing embeddings: ${existingState.size} chunks`);
|
|
117
|
-
|
|
118
|
-
const chunksToEmbed: Chunk[] = [];
|
|
119
|
-
for (const chunk of chunks) {
|
|
120
|
-
const chunkHash = chunkContentHash(chunk);
|
|
121
|
-
if (!existingState.has(chunkHash)) {
|
|
122
|
-
chunksToEmbed.push(chunk);
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
return { chunksToEmbed };
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
async saveEmbeddings(
|
|
130
|
-
newEmbeddings: EmbeddedChunk[],
|
|
131
|
-
chunksFile: string,
|
|
132
|
-
force: boolean = false,
|
|
133
|
-
): Promise<void> {
|
|
134
|
-
const embeddingsFile = chunksFile.replace("chunks", "embeddings");
|
|
135
|
-
await mkdir(dirname(embeddingsFile), { recursive: true });
|
|
136
|
-
|
|
137
|
-
const newByHash = new Map<string, EmbeddedChunk>();
|
|
138
|
-
for (const emb of newEmbeddings) {
|
|
139
|
-
const hash = emb.contentHash || chunkContentHash(emb);
|
|
140
|
-
newByHash.set(hash, emb);
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
let existing: EmbeddedChunk[] = [];
|
|
144
|
-
if (!force) {
|
|
145
|
-
try {
|
|
146
|
-
const content = await readFile(embeddingsFile, "utf-8");
|
|
147
|
-
existing = JSON.parse(content);
|
|
148
|
-
} catch {
|
|
149
|
-
// No existing embeddings
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
const final = force
|
|
154
|
-
? []
|
|
155
|
-
: existing.filter((e) => {
|
|
156
|
-
const hash = e.contentHash || chunkContentHash(e);
|
|
157
|
-
return !newByHash.has(hash);
|
|
158
|
-
});
|
|
159
|
-
|
|
160
|
-
final.push(...newEmbeddings);
|
|
161
|
-
|
|
162
|
-
await writeFile(embeddingsFile, JSON.stringify(final, null, 2));
|
|
163
|
-
console.log(`\n💾 Saved ${final.length} embeddings to ${embeddingsFile}`);
|
|
164
|
-
console.log(
|
|
165
|
-
` New: ${newEmbeddings.length}, Existing: ${final.length - newEmbeddings.length}`,
|
|
166
|
-
);
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
async run(
|
|
170
|
-
chunksFile: string,
|
|
171
|
-
force: boolean = false,
|
|
172
|
-
): Promise<EmbeddedChunk[]> {
|
|
173
|
-
console.log("🔢 Starting incremental embedding generation...");
|
|
174
|
-
|
|
175
|
-
const { chunksToEmbed } = await this.getChunksToEmbed(chunksFile, force);
|
|
176
|
-
|
|
177
|
-
if (chunksToEmbed.length === 0) {
|
|
178
|
-
console.log("\n✨ No chunks need embedding.");
|
|
179
|
-
return [];
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
console.log(`\nš Need to embed ${chunksToEmbed.length} chunks`);
|
|
183
|
-
|
|
184
|
-
const newEmbeddings = await this.embedBatch(chunksToEmbed);
|
|
185
|
-
await this.saveEmbeddings(newEmbeddings, chunksFile, force);
|
|
186
|
-
|
|
187
|
-
return newEmbeddings;
|
|
188
|
-
}
|
|
189
|
-
}
|
package/src/core/git-tracker.test.ts
DELETED
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
|
-
import { GitTracker } from "./git-tracker.js";
|
|
3
|
-
import { FileChunker } from "../interfaces/index.js";
|
|
4
|
-
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "fs";
|
|
5
|
-
import { join } from "path";
|
|
6
|
-
import { tmpdir } from "os";
|
|
7
|
-
import { execSync } from "child_process";
|
|
8
|
-
|
|
9
|
-
describe("GitTracker", () => {
|
|
10
|
-
let testDir: string;
|
|
11
|
-
let originalCwd: string;
|
|
12
|
-
|
|
13
|
-
const mockChunker: FileChunker = {
|
|
14
|
-
name: "test",
|
|
15
|
-
patterns: ["**/*.txt", "**/*.yaml", "**/*.json"],
|
|
16
|
-
chunk: vi.fn().mockResolvedValue([]),
|
|
17
|
-
};
|
|
18
|
-
|
|
19
|
-
beforeEach(() => {
|
|
20
|
-
testDir = mkdtempSync(join(tmpdir(), "git-test-"));
|
|
21
|
-
originalCwd = process.cwd();
|
|
22
|
-
process.chdir(testDir);
|
|
23
|
-
|
|
24
|
-
mkdirSync(join(testDir, "src", "events"), { recursive: true });
|
|
25
|
-
|
|
26
|
-
writeFileSync(join(testDir, "test.txt"), "test content");
|
|
27
|
-
writeFileSync(
|
|
28
|
-
join(testDir, "src", "events", "booking.yaml"),
|
|
29
|
-
"event_type: BookingCreated",
|
|
30
|
-
);
|
|
31
|
-
writeFileSync(join(testDir, "config.json"), '{"key": "value"}');
|
|
32
|
-
|
|
33
|
-
execSync("git init", { stdio: "ignore" });
|
|
34
|
-
execSync('git config user.email "test@example.com"', { stdio: "ignore" });
|
|
35
|
-
execSync('git config user.name "Test"', { stdio: "ignore" });
|
|
36
|
-
execSync("git add .", { stdio: "ignore" });
|
|
37
|
-
execSync('git commit -m "initial"', { stdio: "ignore" });
|
|
38
|
-
});
|
|
39
|
-
|
|
40
|
-
afterEach(() => {
|
|
41
|
-
process.chdir(originalCwd);
|
|
42
|
-
rmSync(testDir, { recursive: true, force: true });
|
|
43
|
-
});
|
|
44
|
-
|
|
45
|
-
it("should be instantiable", () => {
|
|
46
|
-
const tracker = new GitTracker([mockChunker]);
|
|
47
|
-
expect(tracker).toBeInstanceOf(GitTracker);
|
|
48
|
-
});
|
|
49
|
-
|
|
50
|
-
it("should getAllTrackedFiles", async () => {
|
|
51
|
-
const tracker = new GitTracker([mockChunker]);
|
|
52
|
-
const files = await tracker.getAllTrackedFiles();
|
|
53
|
-
|
|
54
|
-
expect(files.length).toBeGreaterThan(0);
|
|
55
|
-
expect(files.some((f) => f.includes("test.txt"))).toBe(true);
|
|
56
|
-
});
|
|
57
|
-
|
|
58
|
-
it("should getCurrentState", async () => {
|
|
59
|
-
const tracker = new GitTracker([mockChunker]);
|
|
60
|
-
const state = await tracker.getCurrentState();
|
|
61
|
-
|
|
62
|
-
expect(state.size).toBeGreaterThan(0);
|
|
63
|
-
});
|
|
64
|
-
});
|