@vivantel/rag-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/config/release-please.json +38 -0
- package/.github/dependabot.yaml +28 -0
- package/.github/workflows/ci.yaml +119 -0
- package/.github/workflows/publish.yaml +151 -0
- package/.github/workflows/release.yaml +150 -0
- package/.versionrc.json +19 -0
- package/CHANGELOG.md +21 -0
- package/README.md +62 -0
- package/bin/rag-update.ts +49 -0
- package/dist/config-loader.d.ts +3 -0
- package/dist/config-loader.d.ts.map +1 -0
- package/dist/config-loader.js +13 -0
- package/dist/config-loader.js.map +1 -0
- package/dist/core/chunk-processor.d.ts +12 -0
- package/dist/core/chunk-processor.d.ts.map +1 -0
- package/dist/core/chunk-processor.js +65 -0
- package/dist/core/chunk-processor.js.map +1 -0
- package/dist/core/embedder.d.ts +19 -0
- package/dist/core/embedder.d.ts.map +1 -0
- package/dist/core/embedder.js +139 -0
- package/dist/core/embedder.js.map +1 -0
- package/dist/core/git-tracker.d.ts +25 -0
- package/dist/core/git-tracker.d.ts.map +1 -0
- package/dist/core/git-tracker.js +164 -0
- package/dist/core/git-tracker.js.map +1 -0
- package/dist/core/orchestrator.d.ts +22 -0
- package/dist/core/orchestrator.d.ts.map +1 -0
- package/dist/core/orchestrator.js +57 -0
- package/dist/core/orchestrator.js.map +1 -0
- package/dist/core/uploader.d.ts +15 -0
- package/dist/core/uploader.d.ts.map +1 -0
- package/dist/core/uploader.js +79 -0
- package/dist/core/uploader.js.map +1 -0
- package/dist/core/utils.d.ts +6 -0
- package/dist/core/utils.d.ts.map +1 -0
- package/dist/core/utils.js +23 -0
- package/dist/core/utils.js.map +1 -0
- package/dist/helpers/create-chunker.d.ts +9 -0
- package/dist/helpers/create-chunker.d.ts.map +1 -0
- package/dist/helpers/create-chunker.js +24 -0
- package/dist/helpers/create-chunker.js.map +1 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/interfaces/chunker.d.ts +46 -0
- package/dist/interfaces/chunker.d.ts.map +1 -0
- package/dist/interfaces/chunker.js +5 -0
- package/dist/interfaces/chunker.js.map +1 -0
- package/dist/interfaces/embedder.d.ts +28 -0
- package/dist/interfaces/embedder.d.ts.map +1 -0
- package/dist/interfaces/embedder.js +5 -0
- package/dist/interfaces/embedder.js.map +1 -0
- package/dist/interfaces/index.d.ts +4 -0
- package/dist/interfaces/index.d.ts.map +1 -0
- package/dist/interfaces/index.js +4 -0
- package/dist/interfaces/index.js.map +1 -0
- package/dist/interfaces/vector-store.d.ts +53 -0
- package/dist/interfaces/vector-store.d.ts.map +1 -0
- package/dist/interfaces/vector-store.js +5 -0
- package/dist/interfaces/vector-store.js.map +1 -0
- package/dist/strategies/chunk/index.d.ts +5 -0
- package/dist/strategies/chunk/index.d.ts.map +1 -0
- package/dist/strategies/chunk/index.js +5 -0
- package/dist/strategies/chunk/index.js.map +1 -0
- package/dist/strategies/chunk/markdown-headers.d.ts +7 -0
- package/dist/strategies/chunk/markdown-headers.d.ts.map +1 -0
- package/dist/strategies/chunk/markdown-headers.js +89 -0
- package/dist/strategies/chunk/markdown-headers.js.map +1 -0
- package/dist/strategies/chunk/semantic.d.ts +7 -0
- package/dist/strategies/chunk/semantic.d.ts.map +1 -0
- package/dist/strategies/chunk/semantic.js +62 -0
- package/dist/strategies/chunk/semantic.js.map +1 -0
- package/dist/strategies/chunk/token.d.ts +12 -0
- package/dist/strategies/chunk/token.d.ts.map +1 -0
- package/dist/strategies/chunk/token.js +56 -0
- package/dist/strategies/chunk/token.js.map +1 -0
- package/dist/strategies/chunk/whole-file.d.ts +3 -0
- package/dist/strategies/chunk/whole-file.d.ts.map +1 -0
- package/dist/strategies/chunk/whole-file.js +31 -0
- package/dist/strategies/chunk/whole-file.js.map +1 -0
- package/eslint.config.js +25 -0
- package/package.json +102 -0
- package/src/config-loader.ts +21 -0
- package/src/core/chunk-processor.test.ts +36 -0
- package/src/core/chunk-processor.ts +92 -0
- package/src/core/embedder.ts +189 -0
- package/src/core/git-tracker.test.ts +64 -0
- package/src/core/git-tracker.ts +202 -0
- package/src/core/orchestrator.test.ts +53 -0
- package/src/core/orchestrator.ts +97 -0
- package/src/core/uploader.ts +123 -0
- package/src/core/utils.ts +27 -0
- package/src/helpers/create-chunker.test.ts +31 -0
- package/src/helpers/create-chunker.ts +40 -0
- package/src/index.test.ts +33 -0
- package/src/index.ts +30 -0
- package/src/interfaces/chunker.ts +59 -0
- package/src/interfaces/embedder.ts +36 -0
- package/src/interfaces/index.test.ts +9 -0
- package/src/interfaces/index.ts +3 -0
- package/src/interfaces/vector-store.ts +71 -0
- package/src/strategies/chunk/index.ts +4 -0
- package/src/strategies/chunk/markdown-headers.test.ts +37 -0
- package/src/strategies/chunk/markdown-headers.ts +106 -0
- package/src/strategies/chunk/semantic.test.ts +21 -0
- package/src/strategies/chunk/semantic.ts +80 -0
- package/src/strategies/chunk/token.test.ts +41 -0
- package/src/strategies/chunk/token.ts +72 -0
- package/src/strategies/chunk/whole-file.test.ts +24 -0
- package/src/strategies/chunk/whole-file.ts +35 -0
- package/tsconfig.json +21 -0
- package/typedoc.json +11 -0
- package/vitest.config.ts +19 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding provider interfaces
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { Chunk } from "./chunker.js";
|
|
6
|
+
|
|
7
|
+
/**
 * Contract every embedding backend implements.
 *
 * Only `embed` is mandatory; `embedBatch` and `healthCheck` are optional
 * capabilities, so callers must check for their presence before use.
 */
export interface EmbeddingProvider {
  /** Identifier of the provider, for example 'github-models' or 'openai'. */
  readonly name: string;

  /** Number of dimensions of the embedding vectors. */
  readonly dimensions: number;

  /** Upper bound on tokens per request, when the provider declares one. */
  readonly maxTokens?: number;

  /** Turns a single text into its embedding vector. */
  embed(text: string): Promise<number[]>;

  /** Optional bulk variant of `embed`, offered for performance. */
  embedBatch?(texts: string[]): Promise<number[][]>;

  /** Optional availability probe (e.g. whether the API key is valid). */
  healthCheck?(): Promise<boolean>;
}
|
|
26
|
+
|
|
27
|
+
/** Settings handed to the embedding stage. */
export interface EmbeddingConfig {
  /** Provider instance that produces the embeddings. */
  provider: EmbeddingProvider;

  /**
   * Optional batch size.
   * NOTE(review): presumably the number of texts sent per batch — confirm in the embedder.
   */
  batchSize?: number;

  /**
   * Optional rate-limit value in milliseconds (per the `Ms` suffix).
   * NOTE(review): exact semantics (delay between requests?) are defined by the consumer — confirm.
   */
  rateLimitMs?: number;
}
|
|
32
|
+
|
|
33
|
+
/** A `Chunk` enriched with its computed embedding. */
export interface EmbeddedChunk extends Chunk {
  /** Embedding vector computed for this chunk. */
  embedding: number[];

  /**
   * Moment the embedding was produced, as a number.
   * NOTE(review): presumably epoch milliseconds — confirm against the embedder.
   */
  embeddedAt: number;
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector store interfaces
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/** One record as written to a vector store. */
export interface VectorDocument {
  /** Unique identifier; optional, auto-generated when not provided. */
  id?: string;

  /** The original text content of the document. */
  content: string;

  /** Arbitrary key/value metadata, used for filtering. */
  metadata: Record<string, unknown>;

  /** Embedding vector of the content. */
  embedding: number[];

  /** Path of the source file; used to track updates. */
  sourceFile: string;

  /** Git commit hash; used for change detection. */
  commitHash: string;

  /** Hash of the content; used for change detection. */
  contentHash: string;

  /** Name of the target collection, for stores that hold several. */
  collection?: string;
}
|
|
30
|
+
|
|
31
|
+
/** A single hit returned by `VectorStore.search`. */
export interface VectorSearchResult {
  /** Identifier of the matched document. */
  id: string;

  /** Text content of the matched document. */
  content: string;

  /** Metadata of the matched document. */
  metadata: Record<string, unknown>;

  /**
   * Similarity between the query and this document.
   * NOTE(review): metric and value range are store-specific — confirm per implementation.
   */
  similarity: number;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Contract every vector-store backend implements.
 *
 * `deleteCollection` and `getStats` are optional capabilities; callers must
 * check for their presence before invoking them.
 */
export interface VectorStore {
  /** Name of the store. */
  readonly name: string;

  /** Prepares the store for use (create tables, indexes, etc.). */
  initialize(): Promise<void>;

  /** Inserts the given documents, or updates them when they already exist. */
  upsert(documents: VectorDocument[]): Promise<void>;

  /** Removes every document that originates from one of the given source files. */
  deleteBySourceFile(sourceFiles: string[]): Promise<void>;

  /** Returns the current state as a map of sourceFile → commitHash, for change detection. */
  getCurrentState(collection?: string): Promise<Map<string, string>>;

  /** Searches by embedding vector, optionally within one collection. */
  search(
    queryEmbedding: number[],
    topK: number,
    collection?: string,
  ): Promise<VectorSearchResult[]>;

  /** Optional: removes an entire collection. */
  deleteCollection?(collection: string): Promise<void>;

  /** Optional: reports store statistics. */
  getStats?(): Promise<{ documentCount: number; collections: string[] }>;
}
|
|
67
|
+
|
|
68
|
+
/** Settings handed to the upload stage. */
export interface VectorStoreConfig {
  /** Store instance that receives the documents. */
  provider: VectorStore;

  /** Optional collection name to operate on. */
  collection?: string;
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { markdownHeadersStrategy } from "./markdown-headers.js";
|
|
3
|
+
|
|
4
|
+
describe("markdownHeadersStrategy", () => {
  // A small minimum keeps the short fixture sections from being filtered out.
  const strategy = markdownHeadersStrategy({ minChunkSize: 10 });

  it("should have correct name", () => {
    expect(strategy.name).toBe("markdown-headers");
  });

  it("should split by headers", async () => {
    // Built with join() so the fixture cannot pick up accidental indentation,
    // which would stop the lines from being recognised as headers.
    const text = [
      "# Header 1",
      "Content for header 1.",
      "",
      "## Header 2",
      "Content for header 2.",
      "",
      "### Header 3",
      "Content for header 3.",
    ].join("\n");

    const chunks = await strategy.chunk(text);

    // Exact expectations: a loop over `chunks` alone would also pass when
    // no chunk is produced at all.
    expect(chunks).toHaveLength(3);
    expect(chunks.map((chunk) => chunk.metadata.header)).toEqual([
      "Header 1",
      "Header 2",
      "Header 3",
    ]);
    expect(chunks.map((chunk) => chunk.metadata.header_level)).toEqual([
      1, 2, 3,
    ]);
    expect(chunks[0].content).toBe("# Header 1\nContent for header 1.");
  });

  it("should handle text without headers", async () => {
    const text = "Plain text without any markdown headers.";
    const chunks = await strategy.chunk(text);

    // Header-less text becomes one chunk with an empty header at level 0.
    expect(chunks).toHaveLength(1);
    expect(chunks[0].content).toBe(text);
    expect(chunks[0].metadata.header).toBe("");
    expect(chunks[0].metadata.header_level).toBe(0);
  });
});
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
+
|
|
3
|
+
/** Tuning knobs for `markdownHeadersStrategy`. */
export interface MarkdownHeadersOptions {
  /** Sections whose trimmed text is shorter than this many characters are not emitted. Defaults to 100. */
  minChunkSize?: number;

  /** Character size above which a growing section is cut into a separate chunk. Defaults to 8000. */
  maxChunkSize?: number;
}
|
|
7
|
+
|
|
8
|
+
/**
 * Chunking strategy that splits Markdown at ATX headers (`#` … `######`).
 *
 * Each chunk starts at a header line and runs until the next header. Text
 * before the first header forms a chunk with an empty header and level 0.
 *
 * Behaviour worth knowing:
 * - Both LF and CRLF line endings are accepted; chunk content is joined with LF.
 * - `#` lines inside fenced code blocks (``` or ~~~) are not treated as headers.
 * - Sections whose trimmed text is shorter than `minChunkSize` are dropped.
 * - A section that grows beyond `maxChunkSize` characters AND more than 10
 *   lines is cut early and flagged with `truncated: true`; the remainder
 *   continues under the same header.
 * - The returned methods do not rely on `this`, so they may be destructured
 *   or passed as callbacks.
 *
 * @param options Size limits; see `MarkdownHeadersOptions` (defaults 100 / 8000).
 * @returns A `ChunkStrategy` named "markdown-headers".
 */
export function markdownHeadersStrategy(
  options: MarkdownHeadersOptions = {},
): ChunkStrategy {
  const strategyName = "markdown-headers";
  const minChunkSize = options.minChunkSize ?? 100;
  const maxChunkSize = options.maxChunkSize ?? 8000;
  // An oversize section is only cut once it spans more than this many lines.
  const minLinesBeforeSplit = 10;
  const headerPattern = /^(#{1,6})\s+(.+)$/;
  // Opening/closing fence: up to three spaces of indent, then 3+ backticks or tildes.
  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;

  return {
    name: strategyName,

    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      const chunks: Chunk[] = [];

      let pendingLines: string[] = [];
      let pendingChars = 0; // sum of line lengths, excluding the joining newlines
      let currentHeader = "";
      let currentHeaderLevel = 0;
      let openFence = ""; // marker of the fence we are inside, "" when outside

      // Emits the pending lines as one chunk (when they qualify) and resets the buffer.
      const emitPending = (
        extra: Record<string, unknown>,
        enforceMinSize: boolean,
      ): void => {
        const content = pendingLines.join("\n").trim();
        const bigEnough = !enforceMinSize || content.length >= minChunkSize;
        // Never emit an empty chunk: there is nothing to embed.
        if (content.length > 0 && bigEnough) {
          chunks.push({
            content,
            metadata: {
              strategy: strategyName,
              header: currentHeader,
              header_level: currentHeaderLevel,
              source_file: filePath,
              ...extra,
            },
            sourceFile: filePath || "unknown",
            commitHash: "",
          });
        }
        pendingLines = [];
        pendingChars = 0;
      };

      // Split on LF or CRLF: a trailing "\r" would otherwise stop the header
      // pattern from matching, because "." does not match "\r".
      for (const line of text.split(/\r?\n/)) {
        const fenceMatch = fencePattern.exec(line);
        if (fenceMatch) {
          const marker = fenceMatch[1];
          if (openFence === "") {
            openFence = marker;
          } else if (
            marker.charAt(0) === openFence.charAt(0) &&
            marker.length >= openFence.length &&
            line.trim() === marker
          ) {
            // A closing fence uses the same character, is at least as long,
            // and carries no info string.
            openFence = "";
          }
        }

        const headerMatch = openFence === "" ? headerPattern.exec(line) : null;

        if (headerMatch) {
          // A new header closes the previous section.
          emitPending({}, true);
          currentHeaderLevel = headerMatch[1].length;
          currentHeader = headerMatch[2];
          pendingLines = [line];
          pendingChars = line.length;
        } else {
          pendingLines.push(line);
          pendingChars += line.length;
        }

        // Prevent chunks from getting too large. The joined length is tracked
        // incrementally instead of re-joining the buffer on every line.
        const joinedLength = pendingChars + pendingLines.length - 1;
        if (
          joinedLength > maxChunkSize &&
          pendingLines.length > minLinesBeforeSplit
        ) {
          emitPending({ truncated: true }, false);
        }
      }

      // Whatever is left forms the final chunk.
      emitPending({ is_last: true }, true);

      return chunks;
    },

    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      // Multiline match: finds the first header-looking line anywhere in the text.
      const headerMatch = text.match(/^(#{1,6})\s+(.+)$/m);
      return {
        strategy: strategyName,
        has_headers: !!headerMatch,
        first_header: headerMatch?.[2],
        line_count: text.split("\n").length,
      };
    },
  };
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { semanticStrategy } from "./semantic.js";
|
|
3
|
+
|
|
4
|
+
describe("semanticStrategy", () => {
  // Limits chosen so the whole fixture fits into a single chunk.
  const strategy = semanticStrategy({ maxChars: 100, minChars: 10 });

  it("should have correct name", () => {
    expect(strategy.name).toBe("semantic");
  });

  it("should split by sentences", async () => {
    const text = "First sentence. Second sentence! Third sentence? Fourth.";
    const chunks = await strategy.chunk(text);

    // Exact expectations: looping over `chunks` alone would also pass when
    // the strategy returns nothing.
    expect(chunks).toHaveLength(1);
    expect(chunks[0].content).toBe(text);
    expect(chunks[0].metadata.strategy).toBe("semantic");
    expect(chunks[0].metadata.sentence_count).toBe(4);
    expect(chunks[0].metadata.is_last).toBe(true);
  });
});
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
+
|
|
3
|
+
/** Tuning knobs for `semanticStrategy`. */
export interface SemanticStrategyOptions {
  /** Character budget per chunk; a chunk is closed before a sentence would exceed it. Defaults to 2000. */
  maxChars?: number;

  /** Chunks whose trimmed text is shorter than this many characters are not emitted. Defaults to 100. */
  minChars?: number;
}
|
|
7
|
+
|
|
8
|
+
/**
 * Chunking strategy that groups whole sentences into chunks of at most
 * `maxChars` characters.
 *
 * Sentences are detected with a simple rule: whitespace that follows `.`,
 * `!` or `?` ends a sentence. Sentences are re-joined with single spaces.
 *
 * Behaviour worth knowing:
 * - The joining spaces count toward `maxChars`, so an emitted chunk never
 *   exceeds the budget unless a single sentence is itself longer than it
 *   (such a sentence is emitted on its own, unsplit).
 * - Chunks whose trimmed text is shorter than `minChars` are dropped.
 * - Empty "sentences" (e.g. caused by trailing whitespace) are ignored and
 *   do not inflate `sentence_count`.
 * - The returned methods do not rely on `this`, so they may be destructured
 *   or passed as callbacks.
 *
 * @param options Size limits; see `SemanticStrategyOptions` (defaults 2000 / 100).
 * @returns A `ChunkStrategy` named "semantic".
 */
export function semanticStrategy(
  options: SemanticStrategyOptions = {},
): ChunkStrategy {
  const strategyName = "semantic";
  const maxChars = options.maxChars ?? 2000;
  const minChars = options.minChars ?? 100;
  const sentenceBoundary = /(?<=[.!?])\s+/;

  // Single source of truth for sentence detection, shared by chunk() and
  // extractMetadata() so both report consistent counts.
  const splitSentences = (text: string): string[] =>
    text.split(sentenceBoundary).filter((part) => part.trim().length > 0);

  return {
    name: strategyName,

    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      const chunks: Chunk[] = [];

      let pending: string[] = [];
      let pendingSize = 0; // length of pending.join(" ")

      // Emits the pending sentences as one chunk (when they qualify) and resets the buffer.
      const emitPending = (extra: Record<string, unknown>): void => {
        const content = pending.join(" ").trim();
        if (content.length > 0 && content.length >= minChars) {
          chunks.push({
            content,
            metadata: {
              strategy: strategyName,
              sentence_count: pending.length,
              source_file: filePath,
              ...extra,
            },
            sourceFile: filePath || "unknown",
            commitHash: "",
          });
        }
        pending = [];
        pendingSize = 0;
      };

      for (const sentence of splitSentences(text)) {
        // Close the current chunk when adding this sentence (plus the space
        // that joins it) would push the chunk past the budget.
        if (
          pending.length > 0 &&
          pendingSize + 1 + sentence.length > maxChars
        ) {
          emitPending({});
        }

        pendingSize += (pending.length > 0 ? 1 : 0) + sentence.length;
        pending.push(sentence);
      }

      // Whatever is left forms the final chunk.
      emitPending({ is_last: true });

      return chunks;
    },

    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      return {
        strategy: strategyName,
        sentence_count: splitSentences(text).length,
        char_count: text.length,
      };
    },
  };
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { tokenStrategy } from "./token.js";
|
|
3
|
+
|
|
4
|
+
// NOTE(review): this suite is skipped. Presumably chunk() was not guaranteed
// to terminate for the fixture below — confirm, and switch back to `describe`
// once termination is verified.
describe.skip("tokenStrategy", () => {
  const strategy = tokenStrategy({ maxTokens: 50, overlap: 10 });

  it("should have correct name", () => {
    // The strategy name embeds the configured token budget.
    expect(strategy.name).toBe("token-50");
  });

  it("should chunk text", async () => {
    const text = "This is a test sentence. ".repeat(100);
    const chunks = await strategy.chunk(text, "test.txt");

    expect(Array.isArray(chunks)).toBe(true);
    expect(chunks.length).toBeGreaterThan(0);

    // Assert on every chunk unconditionally; guarding these checks behind
    // `if (chunks.length > 0)` would let an empty result slip through.
    for (const chunk of chunks) {
      expect(typeof chunk.content).toBe("string");
      expect(chunk.content.length).toBeGreaterThan(0);
      expect(chunk.metadata.strategy).toBe(strategy.name);
      expect(chunk.sourceFile).toBe("test.txt");
    }
  });

  it("should extract metadata", () => {
    const text = "Test content";
    const metadata = strategy.extractMetadata?.(text);

    // 12 characters at ~4 characters per token round up to 3 tokens.
    expect(metadata).toEqual({
      strategy: strategy.name,
      char_count: 12,
      estimated_tokens: 3,
    });
  });
});
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
+
|
|
3
|
+
/** Tuning knobs for `tokenStrategy`. */
export interface TokenStrategyOptions {
  /** Approximate token budget per chunk. Defaults to 500. */
  maxTokens?: number;

  /** Approximate number of tokens shared between consecutive chunks. Defaults to 50. */
  overlap?: number;
}
|
|
7
|
+
|
|
8
|
+
/**
 * Split text by approximate token count.
 * Simple implementation: ~4 chars per token for English.
 * For production, use a proper tokenizer (tiktoken, etc.)
 *
 * The text is walked with a sliding window of `maxTokens * 4` characters.
 * Each window is shortened to end just after the last `.` or newline inside
 * it, when such a boundary exists far enough into the window. Consecutive
 * windows overlap by `overlap * 4` characters.
 *
 * Termination is guaranteed:
 * - the loop stops as soon as a window reaches the end of the text;
 * - the window size is clamped to at least one character;
 * - the overlap is clamped to less than the window size;
 * - every window starts strictly after the previous one.
 *
 * The returned methods do not rely on `this`, so they may be destructured or
 * passed as callbacks.
 *
 * @param options Budget and overlap; see `TokenStrategyOptions` (defaults 500 / 50).
 * @returns A `ChunkStrategy` named `token-<maxTokens>`.
 */
export function tokenStrategy(
  options: TokenStrategyOptions = {},
): ChunkStrategy {
  const maxTokens = options.maxTokens ?? 500;
  const overlap = options.overlap ?? 50;
  const charsPerToken = 4;
  const strategyName = `token-${maxTokens}`;

  // Clamp so that a window always holds at least one character and the
  // overlap can never swallow a whole window.
  const maxChars = Math.max(1, Math.floor(maxTokens * charsPerToken));
  const overlapChars = Math.min(
    Math.max(0, Math.floor(overlap * charsPerToken)),
    maxChars - 1,
  );

  return {
    name: strategyName,

    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      const chunks: Chunk[] = [];
      let start = 0;

      while (start < text.length) {
        let end = Math.min(start + maxChars, text.length);

        // Try to break at sentence boundary
        if (end < text.length) {
          // Search up to the last character INSIDE the window (end is exclusive).
          const lastPeriod = text.lastIndexOf(".", end - 1);
          const lastNewline = text.lastIndexOf("\n", end - 1);
          const breakPoint = Math.max(lastPeriod, lastNewline);
          // Only accept a boundary beyond the overlap region; otherwise the
          // next window would not start after the current one.
          if (breakPoint > start + overlapChars) {
            end = breakPoint + 1;
          }
        }

        const content = text.slice(start, end).trim();
        if (content) {
          chunks.push({
            content,
            metadata: {
              strategy: strategyName,
              chunk_index: chunks.length,
              source_file: filePath,
              start_char: start,
              end_char: end,
            },
            sourceFile: filePath || "unknown",
            commitHash: "", // Will be filled by caller
          });
        }

        // The window that reaches the end of the text is the last one.
        // Stepping back by the overlap here would repeat it forever.
        if (end >= text.length) {
          break;
        }

        // Step back by the overlap, but always move forward overall.
        start = Math.max(end - overlapChars, start + 1);
      }

      return chunks;
    },

    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      return {
        strategy: strategyName,
        char_count: text.length,
        estimated_tokens: Math.ceil(text.length / charsPerToken),
      };
    },
  };
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { wholeFileStrategy } from "./whole-file.js";
|
|
3
|
+
|
|
4
|
+
describe("wholeFileStrategy", () => {
  // The strategy takes no options, so one shared instance serves every case.
  const strategy = wholeFileStrategy();

  it("should have correct name", () => {
    expect(strategy.name).toBe("whole-file");
  });

  it("should return single chunk", async () => {
    const fileText = "Complete file content.";
    const result = await strategy.chunk(fileText);

    // The whole input comes back verbatim as the only chunk.
    expect(result).toHaveLength(1);
    const [only] = result;
    expect(only.content).toBe(fileText);
    expect(only.metadata.strategy).toBe("whole-file");
  });

  it("should return empty array for empty text", async () => {
    const result = await strategy.chunk("");
    expect(result).toHaveLength(0);
  });
});
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { ChunkStrategy, Chunk } from "../../interfaces/index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Chunking strategy that keeps a file as one single chunk.
 *
 * Empty or whitespace-only text yields no chunk at all. Otherwise the text is
 * returned unmodified (not trimmed) together with its character and line
 * counts. The returned methods do not rely on `this`, so they may be
 * destructured or passed as callbacks.
 *
 * @returns A `ChunkStrategy` named "whole-file".
 */
export function wholeFileStrategy(): ChunkStrategy {
  const strategyName = "whole-file";

  // Size figures shared by chunk() and extractMetadata().
  const measure = (text: string): { char_count: number; line_count: number } => ({
    char_count: text.length,
    line_count: text.split("\n").length,
  });

  return {
    name: strategyName,

    async chunk(text: string, filePath?: string): Promise<Chunk[]> {
      // Nothing worth indexing in an empty or blank file.
      if (!text || text.trim().length === 0) {
        return [];
      }

      return [
        {
          content: text,
          metadata: {
            strategy: strategyName,
            source_file: filePath,
            ...measure(text),
          },
          sourceFile: filePath || "unknown",
          commitHash: "",
        },
      ];
    },

    extractMetadata(text: string, _filePath?: string): Record<string, unknown> {
      return {
        strategy: strategyName,
        ...measure(text),
      };
    },
  };
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2022",
|
|
4
|
+
"module": "NodeNext",
|
|
5
|
+
"moduleResolution": "NodeNext",
|
|
6
|
+
"lib": ["ES2022"],
|
|
7
|
+
"outDir": "./dist",
|
|
8
|
+
"rootDir": "./src",
|
|
9
|
+
"declaration": true,
|
|
10
|
+
"declarationMap": true,
|
|
11
|
+
"sourceMap": true,
|
|
12
|
+
"strict": true,
|
|
13
|
+
"esModuleInterop": true,
|
|
14
|
+
"skipLibCheck": true,
|
|
15
|
+
"forceConsistentCasingInFileNames": true,
|
|
16
|
+
"resolveJsonModule": true,
|
|
17
|
+
"types": ["node"]
|
|
18
|
+
},
|
|
19
|
+
"include": ["src/**/*"],
|
|
20
|
+
"exclude": ["node_modules", "dist", "**/*.test.ts"]
|
|
21
|
+
}
|
package/typedoc.json
ADDED
package/vitest.config.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { defineConfig } from 'vitest/config';
|
|
2
|
+
|
|
3
|
+
// Vitest configuration for the package's unit tests.
export default defineConfig({
  test: {
    // Runtime: Node environment with the Vitest globals enabled.
    environment: 'node',
    globals: true,
    testTimeout: 10000,

    // Worker setup: at most four workers, with isolation switched off.
    maxWorkers: 4,
    isolate: false,

    // Paths left out of test discovery.
    exclude: ['node_modules', '.git'],

    // Coverage: V8 provider over all sources except the test files themselves.
    coverage: {
      provider: 'v8',
      reporter: ['text', 'json', 'html'],
      include: ['src/**/*.ts'],
      exclude: ['src/**/*.test.ts'],
    },
  },
});
|