@vivantel/rag-core 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config-loader.d.ts.map +1 -1
- package/dist/config-loader.js +0 -2
- package/dist/config-loader.js.map +1 -1
- package/dist/core/chunk-processor.d.ts.map +1 -1
- package/dist/core/chunk-processor.js +27 -20
- package/dist/core/chunk-processor.js.map +1 -1
- package/dist/core/embedder.d.ts.map +1 -1
- package/dist/core/embedder.js +10 -3
- package/dist/core/embedder.js.map +1 -1
- package/dist/core/git-tracker.d.ts.map +1 -1
- package/dist/core/git-tracker.js +9 -59
- package/dist/core/git-tracker.js.map +1 -1
- package/dist/core/orchestrator.d.ts.map +1 -1
- package/dist/core/orchestrator.js +22 -1
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/uploader.d.ts.map +1 -1
- package/dist/core/uploader.js +13 -4
- package/dist/core/uploader.js.map +1 -1
- package/dist/strategies/chunk/token.js +1 -1
- package/dist/strategies/chunk/token.js.map +1 -1
- package/package.json +5 -1
- package/.github/config/release-please.json +0 -38
- package/.github/dependabot.yaml +0 -28
- package/.github/workflows/ci.yaml +0 -119
- package/.github/workflows/publish.yaml +0 -155
- package/.github/workflows/release.yaml +0 -54
- package/.release-please-manifest.json +0 -3
- package/.versionrc.json +0 -19
- package/CHANGELOG.md +0 -51
- package/bin/rag-update.ts +0 -49
- package/eslint.config.js +0 -25
- package/src/config-loader.ts +0 -21
- package/src/core/chunk-processor.test.ts +0 -36
- package/src/core/chunk-processor.ts +0 -92
- package/src/core/embedder.ts +0 -189
- package/src/core/git-tracker.test.ts +0 -64
- package/src/core/git-tracker.ts +0 -202
- package/src/core/orchestrator.test.ts +0 -53
- package/src/core/orchestrator.ts +0 -97
- package/src/core/uploader.ts +0 -123
- package/src/core/utils.ts +0 -27
- package/src/helpers/create-chunker.test.ts +0 -31
- package/src/helpers/create-chunker.ts +0 -40
- package/src/index.test.ts +0 -33
- package/src/index.ts +0 -30
- package/src/interfaces/chunker.ts +0 -59
- package/src/interfaces/embedder.ts +0 -36
- package/src/interfaces/index.test.ts +0 -9
- package/src/interfaces/index.ts +0 -3
- package/src/interfaces/vector-store.ts +0 -71
- package/src/strategies/chunk/index.ts +0 -4
- package/src/strategies/chunk/markdown-headers.test.ts +0 -37
- package/src/strategies/chunk/markdown-headers.ts +0 -106
- package/src/strategies/chunk/semantic.test.ts +0 -21
- package/src/strategies/chunk/semantic.ts +0 -80
- package/src/strategies/chunk/token.test.ts +0 -41
- package/src/strategies/chunk/token.ts +0 -72
- package/src/strategies/chunk/whole-file.test.ts +0 -24
- package/src/strategies/chunk/whole-file.ts +0 -35
- package/tsconfig.json +0 -21
- package/typedoc.json +0 -11
- package/vitest.config.ts +0 -19
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
-
|
|
3
|
-
/** Options accepted by {@link semanticStrategy}. */
export interface SemanticStrategyOptions {
  /** Upper bound, in characters, for the joined content of one chunk. Defaults to 2000. */
  maxChars?: number;
  /** Chunks whose trimmed content is shorter than this are discarded. Defaults to 100. */
  minChars?: number;
}

/**
 * Build a chunking strategy that groups whole sentences into chunks.
 *
 * The text is split after `.`, `!` or `?` followed by whitespace, and
 * consecutive sentences are joined with a single space until adding the next
 * sentence would push the joined content past `maxChars`.
 *
 * Notes on behaviour:
 * - A single sentence longer than `maxChars` is never split; it is emitted as
 *   one oversized chunk.
 * - Content shorter than `minChars` is dropped rather than merged into a
 *   neighbour, so very short inputs produce no chunks.
 *   NOTE(review): confirm that discarding this text is intended.
 *
 * @param options - Size limits; see {@link SemanticStrategyOptions}.
 * @returns A strategy object named `"semantic"`.
 */
export function semanticStrategy(
  options: SemanticStrategyOptions = {},
): ChunkStrategy {
  const maxChars = options.maxChars ?? 2000;
  const minChars = options.minChars ?? 100;
  // Captured in the closure so the methods keep working when detached from
  // the returned object (e.g. passed as callbacks).
  const name = "semantic";
  const separator = " ";

  return {
    name,

    /**
     * Split `text` into sentence-aligned chunks.
     *
     * @param text - Raw text to chunk.
     * @param filePath - Optional origin of the text; recorded in the metadata
     *   and as `sourceFile` (`"unknown"` when missing).
     * @returns Chunks in document order; the final one carries `is_last: true`.
     */
    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      const chunks: Chunk[] = [];
      let pending: string[] = [];
      let pendingSize = 0;

      // Emit the buffered sentences as one chunk (if large enough) and reset.
      const flush = (isLast: boolean): void => {
        const content = pending.join(separator).trim();
        if (content.length > 0 && content.length >= minChars) {
          chunks.push({
            content,
            metadata: {
              strategy: name,
              sentence_count: pending.length,
              source_file: filePath,
              ...(isLast ? { is_last: true } : {}),
            },
            sourceFile: filePath || "unknown",
            commitHash: "",
          });
        }
        pending = [];
        pendingSize = 0;
      };

      // Split by sentences (simple approach).
      for (const sentence of text.split(/(?<=[.!?])\s+/)) {
        // Trailing whitespace after the last terminator yields an empty
        // segment; it is not a sentence and must not be counted as one.
        if (sentence.length === 0) {
          continue;
        }

        // Include the joining space so the joined content honours maxChars.
        const projected =
          pendingSize +
          (pending.length > 0 ? separator.length : 0) +
          sentence.length;
        if (pending.length > 0 && projected > maxChars) {
          flush(false);
        }

        pendingSize +=
          (pending.length > 0 ? separator.length : 0) + sentence.length;
        pending.push(sentence);
      }

      // Last chunk
      if (pending.length > 0) {
        flush(true);
      }

      return chunks;
    },

    /**
     * Summarise `text` without chunking it.
     *
     * @param text - Text to describe.
     * @param _filePath - Unused.
     * @returns The strategy name, the number of non-empty segments delimited by
     *   runs of `.`, `!` or `?`, and the character count.
     */
    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      return {
        strategy: name,
        // Ignore empty segments (e.g. the one after a trailing period) so the
        // count reflects actual sentences.
        sentence_count: text
          .split(/[.!?]+/)
          .filter((segment) => segment.trim().length > 0).length,
        char_count: text.length,
      };
    },
  };
}
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { tokenStrategy } from "./token.js";
|
|
3
|
-
|
|
4
|
-
// Suite is disabled via `describe.skip`, so none of the cases below execute.
describe.skip("tokenStrategy", () => {
  // Shared instance: 50-token windows with a 10-token overlap.
  const underTest = tokenStrategy({ maxTokens: 50, overlap: 10 });

  it("should have correct name", () => {
    const strategyName = underTest.name;
    expect(strategyName).toContain("token");
    expect(typeof strategyName).toBe("string");
  });

  it("should chunk text", async () => {
    const input = "This is a test sentence. ".repeat(100);
    const produced = await underTest.chunk(input, "test.txt");

    expect(Array.isArray(produced)).toBe(true);
    expect(produced.length).toBeGreaterThan(0);

    // Nothing further to inspect without a first chunk.
    if (produced.length === 0) {
      return;
    }

    const first = produced[0];
    expect(first.content).toBeDefined();
    expect(typeof first.content).toBe("string");
    expect(first.metadata).toBeDefined();
    expect(first.metadata.strategy).toBeDefined();
  });

  it("should extract metadata", () => {
    const sample = "Test content";
    const summary = underTest.extractMetadata?.(sample);

    // extractMetadata is optional on the strategy, so the result may be
    // undefined; in that case the test passes trivially.
    if (!summary) {
      expect(true).toBe(true);
      return;
    }

    expect(summary.strategy).toBe(underTest.name);
    expect(summary.char_count).toBeDefined();
    expect(summary.estimated_tokens).toBeDefined();
  });
});
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
-
|
|
3
|
-
/** Options accepted by {@link tokenStrategy}. */
export interface TokenStrategyOptions {
  /** Approximate maximum number of tokens per chunk. Defaults to 500. */
  maxTokens?: number;
  /** Approximate number of tokens shared between consecutive chunks. Defaults to 50. */
  overlap?: number;
}

/**
 * Split text by approximate token count.
 * Simple implementation: ~4 chars per token for English.
 * For production, use a proper tokenizer (tiktoken, etc.)
 *
 * Each window is `maxTokens * 4` characters wide. When a window ends before
 * the end of the text it is pulled back to just after the last `.` or newline
 * found at or before the window end (so a chunk can be one character longer
 * than the window when that character sits exactly at the window end). The
 * next window starts `overlap * 4` characters before the previous end.
 *
 * The loop always terminates: it stops once a window reaches the end of the
 * text, and the start position is forced to advance on every iteration even
 * when the overlap is as large as (or larger than) the chunk just produced.
 *
 * @param options - Window size and overlap; see {@link TokenStrategyOptions}.
 * @returns A strategy object named `token-<maxTokens>`.
 */
export function tokenStrategy(
  options: TokenStrategyOptions = {},
): ChunkStrategy {
  const maxTokens = options.maxTokens ?? 500;
  const overlap = options.overlap ?? 50;
  const charsPerToken = 4;
  // A window must cover at least one character, otherwise no progress is possible.
  const maxChars = Math.max(1, maxTokens * charsPerToken);
  // A negative overlap would silently skip text between chunks.
  const overlapChars = Math.max(0, overlap * charsPerToken);
  // Captured in the closure so the methods keep working when detached from
  // the returned object (e.g. passed as callbacks).
  const name = `token-${maxTokens}`;

  return {
    name,

    /**
     * Cut `text` into overlapping, roughly token-sized chunks.
     *
     * @param text - Raw text to chunk.
     * @param filePath - Optional origin of the text; recorded in the metadata
     *   and as `sourceFile` (`"unknown"` when missing).
     * @returns Chunks in document order. `start_char`/`end_char` are the
     *   offsets of the untrimmed slice within `text`.
     */
    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      const chunks: Chunk[] = [];
      let start = 0;

      while (start < text.length) {
        let end = Math.min(start + maxChars, text.length);

        // Try to break at sentence boundary
        if (end < text.length) {
          const lastPeriod = text.lastIndexOf(".", end);
          const lastNewline = text.lastIndexOf("\n", end);
          const breakPoint = Math.max(lastPeriod, lastNewline);
          if (breakPoint > start) {
            end = breakPoint + 1;
          }
        }

        const content = text.slice(start, end).trim();
        if (content) {
          chunks.push({
            content,
            metadata: {
              strategy: name,
              chunk_index: chunks.length,
              source_file: filePath,
              start_char: start,
              end_char: end,
            },
            sourceFile: filePath || "unknown",
            commitHash: "", // Will be filled by caller
          });
        }

        // The window reached the end of the text: stepping back by the
        // overlap here would re-emit the tail forever.
        if (end >= text.length) {
          break;
        }

        // Step back by the overlap, but never to or before the current start;
        // if the overlap swallows the whole chunk, continue without overlap.
        const nextStart = end - overlapChars;
        start = nextStart > start ? nextStart : end;
      }

      return chunks;
    },

    /**
     * Summarise `text` without chunking it.
     *
     * @param text - Text to describe.
     * @param _filePath - Unused.
     * @returns The strategy name, the character count and the token estimate
     *   (`ceil(length / 4)`).
     */
    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      return {
        strategy: name,
        char_count: text.length,
        estimated_tokens: Math.ceil(text.length / charsPerToken),
      };
    },
  };
}
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { wholeFileStrategy } from "./whole-file.js";
|
|
3
|
-
|
|
4
|
-
describe("wholeFileStrategy", () => {
  // One strategy instance is shared by every case in this suite.
  const underTest = wholeFileStrategy();

  it("should have correct name", () => {
    const { name } = underTest;
    expect(name).toBe("whole-file");
  });

  it("should return single chunk", async () => {
    const input = "Complete file content.";
    const produced = await underTest.chunk(input);

    // The whole input comes back as exactly one chunk, tagged with the strategy name.
    expect(produced).toHaveLength(1);
    const only = produced[0];
    expect(only.content).toBe(input);
    expect(only.metadata.strategy).toBe("whole-file");
  });

  it("should return empty array for empty text", async () => {
    const produced = await underTest.chunk("");
    expect(produced).toHaveLength(0);
  });
});
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
-
|
|
3
|
-
/**
 * Build a chunking strategy that treats an entire file as a single chunk.
 *
 * @returns A strategy object named `"whole-file"`.
 */
export function wholeFileStrategy(): ChunkStrategy {
  // Captured in the closure so the methods keep working when detached from
  // the returned object (e.g. passed as callbacks).
  const name = "whole-file";

  // Size statistics shared by chunk() and extractMetadata().
  const measure = (text: string): { char_count: number; line_count: number } => ({
    char_count: text.length,
    line_count: text.split("\n").length,
  });

  return {
    name,

    /**
     * Wrap `text` in one chunk.
     *
     * @param text - Raw file content; stored untrimmed.
     * @param filePath - Optional origin of the text; recorded in the metadata
     *   and as `sourceFile` (`"unknown"` when missing).
     * @returns A one-element array, or an empty array when `text` is empty or
     *   contains only whitespace.
     */
    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      if (!text || text.trim().length === 0) {
        return [];
      }

      return [
        {
          content: text,
          metadata: {
            strategy: name,
            source_file: filePath,
            ...measure(text),
          },
          sourceFile: filePath || "unknown",
          commitHash: "",
        },
      ];
    },

    /**
     * Summarise `text` without chunking it.
     *
     * @param text - Text to describe.
     * @param _filePath - Unused.
     * @returns The strategy name, character count and line count.
     */
    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      return {
        strategy: name,
        ...measure(text),
      };
    },
  };
}
|
package/tsconfig.json
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"compilerOptions": {
|
|
3
|
-
"target": "ES2022",
|
|
4
|
-
"module": "NodeNext",
|
|
5
|
-
"moduleResolution": "NodeNext",
|
|
6
|
-
"lib": ["ES2022"],
|
|
7
|
-
"outDir": "./dist",
|
|
8
|
-
"rootDir": "./src",
|
|
9
|
-
"declaration": true,
|
|
10
|
-
"declarationMap": true,
|
|
11
|
-
"sourceMap": true,
|
|
12
|
-
"strict": true,
|
|
13
|
-
"esModuleInterop": true,
|
|
14
|
-
"skipLibCheck": true,
|
|
15
|
-
"forceConsistentCasingInFileNames": true,
|
|
16
|
-
"resolveJsonModule": true,
|
|
17
|
-
"types": ["node"]
|
|
18
|
-
},
|
|
19
|
-
"include": ["src/**/*"],
|
|
20
|
-
"exclude": ["node_modules", "dist", "**/*.test.ts"]
|
|
21
|
-
}
|
package/typedoc.json
DELETED
package/vitest.config.ts
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import { defineConfig } from 'vitest/config';
|
|
2
|
-
|
|
3
|
-
// Vitest configuration for the package's unit tests.
export default defineConfig({
  test: {
    // Runtime
    environment: 'node',
    globals: true,
    testTimeout: 10000,

    // Scheduling
    maxWorkers: 4,
    isolate: false,

    // Paths never scanned for test files
    exclude: ['node_modules', '.git'],

    // Coverage is collected over the sources, excluding the test files themselves
    coverage: {
      provider: 'v8',
      include: ['src/**/*.ts'],
      exclude: ['src/**/*.test.ts'],
      reporter: ['text', 'json', 'html'],
    },
  },
});
|