@vibe-agent-toolkit/rag 0.1.0-rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +424 -0
  2. package/dist/chunking/chunk-by-tokens.d.ts +22 -0
  3. package/dist/chunking/chunk-by-tokens.d.ts.map +1 -0
  4. package/dist/chunking/chunk-by-tokens.js +68 -0
  5. package/dist/chunking/chunk-by-tokens.js.map +1 -0
  6. package/dist/chunking/chunk-resource.d.ts +46 -0
  7. package/dist/chunking/chunk-resource.d.ts.map +1 -0
  8. package/dist/chunking/chunk-resource.js +131 -0
  9. package/dist/chunking/chunk-resource.js.map +1 -0
  10. package/dist/chunking/index.d.ts +10 -0
  11. package/dist/chunking/index.d.ts.map +1 -0
  12. package/dist/chunking/index.js +9 -0
  13. package/dist/chunking/index.js.map +1 -0
  14. package/dist/chunking/types.d.ts +42 -0
  15. package/dist/chunking/types.d.ts.map +1 -0
  16. package/dist/chunking/types.js +5 -0
  17. package/dist/chunking/types.js.map +1 -0
  18. package/dist/chunking/utils.d.ts +41 -0
  19. package/dist/chunking/utils.d.ts.map +1 -0
  20. package/dist/chunking/utils.js +62 -0
  21. package/dist/chunking/utils.js.map +1 -0
  22. package/dist/embedding-providers/index.d.ts +8 -0
  23. package/dist/embedding-providers/index.d.ts.map +1 -0
  24. package/dist/embedding-providers/index.js +8 -0
  25. package/dist/embedding-providers/index.js.map +1 -0
  26. package/dist/embedding-providers/openai-embedding-provider.d.ts +64 -0
  27. package/dist/embedding-providers/openai-embedding-provider.d.ts.map +1 -0
  28. package/dist/embedding-providers/openai-embedding-provider.js +92 -0
  29. package/dist/embedding-providers/openai-embedding-provider.js.map +1 -0
  30. package/dist/embedding-providers/transformers-embedding-provider.d.ts +62 -0
  31. package/dist/embedding-providers/transformers-embedding-provider.d.ts.map +1 -0
  32. package/dist/embedding-providers/transformers-embedding-provider.js +75 -0
  33. package/dist/embedding-providers/transformers-embedding-provider.js.map +1 -0
  34. package/dist/index.d.ts +12 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +16 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/interfaces/embedding.d.ts +33 -0
  39. package/dist/interfaces/embedding.d.ts.map +1 -0
  40. package/dist/interfaces/embedding.js +7 -0
  41. package/dist/interfaces/embedding.js.map +1 -0
  42. package/dist/interfaces/index.d.ts +9 -0
  43. package/dist/interfaces/index.d.ts.map +1 -0
  44. package/dist/interfaces/index.js +7 -0
  45. package/dist/interfaces/index.js.map +1 -0
  46. package/dist/interfaces/provider.d.ts +150 -0
  47. package/dist/interfaces/provider.d.ts.map +1 -0
  48. package/dist/interfaces/provider.js +8 -0
  49. package/dist/interfaces/provider.js.map +1 -0
  50. package/dist/interfaces/token-counter.d.ts +29 -0
  51. package/dist/interfaces/token-counter.d.ts.map +1 -0
  52. package/dist/interfaces/token-counter.js +7 -0
  53. package/dist/interfaces/token-counter.js.map +1 -0
  54. package/dist/schemas/admin.d.ts +82 -0
  55. package/dist/schemas/admin.d.ts.map +1 -0
  56. package/dist/schemas/admin.js +34 -0
  57. package/dist/schemas/admin.js.map +1 -0
  58. package/dist/schemas/chunk.d.ts +75 -0
  59. package/dist/schemas/chunk.d.ts.map +1 -0
  60. package/dist/schemas/chunk.js +39 -0
  61. package/dist/schemas/chunk.js.map +1 -0
  62. package/dist/schemas/index.d.ts +9 -0
  63. package/dist/schemas/index.d.ts.map +1 -0
  64. package/dist/schemas/index.js +9 -0
  65. package/dist/schemas/index.js.map +1 -0
  66. package/dist/schemas/json-schema.d.ts +86 -0
  67. package/dist/schemas/json-schema.d.ts.map +1 -0
  68. package/dist/schemas/json-schema.js +55 -0
  69. package/dist/schemas/json-schema.js.map +1 -0
  70. package/dist/schemas/query.d.ts +262 -0
  71. package/dist/schemas/query.d.ts.map +1 -0
  72. package/dist/schemas/query.js +56 -0
  73. package/dist/schemas/query.js.map +1 -0
  74. package/dist/token-counters/approximate-token-counter.d.ts +32 -0
  75. package/dist/token-counters/approximate-token-counter.d.ts.map +1 -0
  76. package/dist/token-counters/approximate-token-counter.js +40 -0
  77. package/dist/token-counters/approximate-token-counter.js.map +1 -0
  78. package/dist/token-counters/fast-token-counter.d.ts +33 -0
  79. package/dist/token-counters/fast-token-counter.d.ts.map +1 -0
  80. package/dist/token-counters/fast-token-counter.js +40 -0
  81. package/dist/token-counters/fast-token-counter.js.map +1 -0
  82. package/dist/token-counters/index.d.ts +8 -0
  83. package/dist/token-counters/index.d.ts.map +1 -0
  84. package/dist/token-counters/index.js +8 -0
  85. package/dist/token-counters/index.js.map +1 -0
  86. package/package.json +53 -0
package/README.md ADDED
@@ -0,0 +1,424 @@
1
+ # @vibe-agent-toolkit/rag
2
+
3
+ Abstract RAG (Retrieval-Augmented Generation) interfaces and shared implementations for vibe-agent-toolkit.
4
+
5
+ ## Overview
6
+
7
+ This package provides the core interfaces and schemas for RAG functionality in VAT (Vibe Agent Toolkit). It defines contracts that RAG implementations must follow, ensuring portability and consistency across different vector database backends.
8
+
9
+ **What's included:**
10
+ - **Interfaces**: `RAGQueryProvider`, `RAGAdminProvider`, `EmbeddingProvider`, `TokenCounter`
11
+ - **Schemas**: Zod schemas with TypeScript types and JSON Schema exports
12
+ - **Token counters**: `FastTokenCounter` (bytes/4 heuristic), `ApproximateTokenCounter` (gpt-tokenizer)
13
+ - **Embedding providers**: `TransformersEmbeddingProvider` (local, transformers.js), `OpenAIEmbeddingProvider` (cloud, OpenAI API)
14
+ - **Chunking utilities**: Hybrid heading-based + token-aware chunking with ResourceRegistry integration
15
+
16
+ **What's NOT included:**
17
+ - Vector database implementations (see `@vibe-agent-toolkit/rag-lancedb`)
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ bun add @vibe-agent-toolkit/rag
23
+ ```
24
+
25
+ ## Packages
26
+
27
+ - `@vibe-agent-toolkit/rag` - This package (interfaces + shared implementations)
28
+ - `@vibe-agent-toolkit/rag-lancedb` - LanceDB implementation (coming soon)
29
+ - Future: `@vibe-agent-toolkit/rag-pinecone`, `@vibe-agent-toolkit/rag-weaviate`, etc.
30
+
31
+ ## Usage
32
+
33
+ ### Using RAG Provider Interfaces
34
+
35
+ ```typescript
36
+ import type { RAGQueryProvider, RAGQuery } from '@vibe-agent-toolkit/rag';
37
+
38
+ // Get a RAG provider implementation (from rag-lancedb or other package)
39
+ const rag: RAGQueryProvider = ...; // Implementation
40
+
41
+ // Query the RAG database
42
+ const result = await rag.query({
43
+ text: 'How do I validate schemas?',
44
+ limit: 5,
45
+ filters: {
46
+ tags: ['validation'],
47
+ type: 'documentation'
48
+ }
49
+ });
50
+
51
+ // Use results
52
+ for (const chunk of result.chunks) {
53
+ console.log(`[${chunk.headingPath}] ${chunk.content}`);
54
+ }
55
+ ```
56
+
57
+ ### Using Schemas for Validation
58
+
59
+ ```typescript
60
+ import { RAGQuerySchema, RAGChunkSchema } from '@vibe-agent-toolkit/rag';
61
+
62
+ // Validate a query
63
+ const queryResult = RAGQuerySchema.safeParse(userInput);
64
+ if (!queryResult.success) {
65
+ console.error('Invalid query:', queryResult.error);
66
+ }
67
+
68
+ // Validate a chunk
69
+ const chunkResult = RAGChunkSchema.safeParse(data);
70
+ if (chunkResult.success) {
71
+ const chunk = chunkResult.data; // Typed as RAGChunk
72
+ }
73
+ ```
74
+
75
+ ### Using JSON Schemas
76
+
77
+ ```typescript
78
+ import { jsonSchemas } from '@vibe-agent-toolkit/rag';
79
+
80
+ // Get JSON Schema for RAGChunk
81
+ const schema = jsonSchemas.RAGChunk;
82
+
83
+ // Use for documentation, validation, code generation, etc.
84
+ console.log(JSON.stringify(schema, null, 2));
85
+ ```
86
+
87
+ ## Token Counters
88
+
89
+ Token counters are used for accurate chunking and embedding token limit management.
90
+
91
+ ### Available Implementations
92
+
93
+ #### FastTokenCounter
94
+
95
+ Fast but inaccurate token estimation using a bytes/4 heuristic.
96
+
97
+ ```typescript
98
+ import { FastTokenCounter } from '@vibe-agent-toolkit/rag';
99
+
100
+ const counter = new FastTokenCounter();
101
+ const tokens = counter.count('Hello world'); // ~3 tokens (bytes/4)
102
+ ```
103
+
104
+ **Characteristics:**
105
+ - **Speed**: Very fast (< 1ms for long text)
106
+ - **Accuracy**: ~75% accurate for English text
107
+ - **Recommended padding factor**: 0.8 (80% of target)
108
+ - **Use case**: Quick validation, ResourceRegistry estimation
109
+
110
+ #### ApproximateTokenCounter
111
+
112
+ Accurate token counting using the gpt-tokenizer library.
113
+
114
+ ```typescript
115
+ import { ApproximateTokenCounter } from '@vibe-agent-toolkit/rag';
116
+
117
+ const counter = new ApproximateTokenCounter();
118
+ const tokens = counter.count('Hello world'); // 2 tokens (accurate)
119
+ ```
120
+
121
+ **Characteristics:**
122
+ - **Speed**: Fast (< 10ms for long text)
123
+ - **Accuracy**: ~95% accurate (GPT-3.5/GPT-4 tokenization)
124
+ - **Recommended padding factor**: 0.9 (90% of target)
125
+ - **Use case**: RAG chunking, embedding preparation
126
+
127
+ ### Choosing a Token Counter
128
+
129
+ | Counter | Speed | Accuracy | Padding Factor | Use Case |
130
+ |---------|-------|----------|----------------|----------|
131
+ | FastTokenCounter | Very Fast | ~75% | 0.8 | Quick estimation |
132
+ | ApproximateTokenCounter | Fast | ~95% | 0.9 | RAG chunking |
133
+
134
+ ### Padding Factor
135
+
136
+ The padding factor provides a safety margin to avoid exceeding embedding model token limits:
137
+
138
+ ```typescript
139
+ const targetChunkSize = 512; // tokens
140
+ const paddingFactor = 0.9; // 90%
141
+ const effectiveTarget = targetChunkSize * paddingFactor; // 460.8 (≈ 460 tokens)
142
+
143
+ // Chunk to effective target to avoid splits from estimation error
144
+ ```
145
+
146
+ **Why padding matters:**
147
+ - Token estimation may be imperfect
148
+ - Targeting exact limit might exceed it, forcing inefficient splits
149
+ - Lower accuracy = lower padding factor (more safety margin)
150
+
151
+ ## Embedding Providers
152
+
153
+ Embedding providers convert text to vector embeddings for semantic search.
154
+
155
+ ### Available Implementations
156
+
157
+ #### TransformersEmbeddingProvider (Default)
158
+
159
+ Local embedding generation using transformers.js - no API key required.
160
+
161
+ ```typescript
162
+ import { TransformersEmbeddingProvider } from '@vibe-agent-toolkit/rag';
163
+
164
+ const provider = new TransformersEmbeddingProvider();
165
+ // Default model: Xenova/all-MiniLM-L6-v2 (384 dimensions)
166
+
167
+ const embedding = await provider.embed('Search query text');
168
+ console.log(embedding.length); // 384
169
+
170
+ // Batch embedding for efficiency
171
+ const embeddings = await provider.embedBatch(['text1', 'text2', 'text3']);
172
+ ```
173
+
174
+ **Characteristics:**
175
+ - **Speed**: Fast (local inference)
176
+ - **Quality**: Good (suitable for most use cases)
177
+ - **Cost**: Free (no API calls)
178
+ - **API Key**: Not required
179
+ - **Dimensions**: 384 (all-MiniLM-L6-v2)
180
+ - **Use case**: Default choice for most projects
181
+
182
+ **First run**: Downloads model (~20MB for all-MiniLM-L6-v2)
183
+
184
+ #### OpenAIEmbeddingProvider (Optional)
185
+
186
+ Cloud-based embedding using OpenAI API - requires API key.
187
+
188
+ ```typescript
189
+ import { OpenAIEmbeddingProvider } from '@vibe-agent-toolkit/rag';
190
+
191
+ const provider = new OpenAIEmbeddingProvider({
192
+ apiKey: process.env.OPENAI_API_KEY!,
193
+ model: 'text-embedding-3-small', // or 'text-embedding-3-large'
194
+ });
195
+
196
+ const embedding = await provider.embed('Search query text');
197
+ console.log(embedding.length); // 1536
198
+
199
+ // Custom dimensions (text-embedding-3-* models only)
200
+ const customProvider = new OpenAIEmbeddingProvider({
201
+ apiKey: process.env.OPENAI_API_KEY!,
202
+ model: 'text-embedding-3-small',
203
+ dimensions: 512, // Reduce dimensions for faster search
204
+ });
205
+ ```
206
+
207
+ **Characteristics:**
208
+ - **Speed**: Medium (network latency)
209
+ - **Quality**: Excellent (state-of-the-art)
210
+ - **Cost**: Paid (per token)
211
+ - **API Key**: Required
212
+ - **Dimensions**: 1536 (small) or 3072 (large)
213
+ - **Use case**: Production agents requiring highest quality
214
+
215
+ **Installation**: `bun add openai` (optional dependency)
216
+
217
+ ### Choosing an Embedding Provider
218
+
219
+ | Provider | Speed | Quality | Cost | Dimensions | Use Case |
220
+ |----------|-------|---------|------|------------|----------|
221
+ | TransformersEmbeddingProvider | Fast | Good | Free | 384 | Default choice |
222
+ | OpenAIEmbeddingProvider | Medium | Excellent | Paid | 1536-3072 | Production, high quality |
223
+
224
+ ### Model Selection Guidelines
225
+
226
+ **Use TransformersEmbeddingProvider when:**
227
+ - Building locally or in development
228
+ - Budget-conscious or high-volume scenarios
229
+ - Good quality is sufficient (most use cases)
230
+ - Want to avoid API dependencies
231
+
232
+ **Use OpenAIEmbeddingProvider when:**
233
+ - Deploying production agents with a budget for API usage
234
+ - Need highest quality search results
235
+ - Working with complex or nuanced queries
236
+ - Want proven, well-tested models
237
+
238
+ ## Chunking
239
+
240
+ Chunking utilities split documents into semantic chunks for embedding and retrieval.
241
+
242
+ ### Strategy
243
+
244
+ **Hybrid Approach:**
245
+ 1. **Heading boundaries** - Primary split points (respects markdown structure)
246
+ 2. **Token-aware splitting** - Splits large sections by paragraphs to fit token limits
247
+ 3. **Padding factor** - Safety margin to avoid exceeding model limits
248
+ 4. **Context linking** - previousChunkId/nextChunkId for context expansion
249
+
250
+ ### Usage
251
+
252
+ ```typescript
253
+ import { chunkResource, enrichChunks } from '@vibe-agent-toolkit/rag';
254
+ import { ApproximateTokenCounter } from '@vibe-agent-toolkit/rag';
255
+ import { ResourceRegistry } from '@vibe-agent-toolkit/resources';
256
+
257
+ // 1. Get resource from ResourceRegistry
258
+ const registry = new ResourceRegistry();
259
+ await registry.crawl({ baseDir: './docs' });
260
+ const metadata = registry.getResourceById('resource-id');
261
+
262
+ // 2. Read file content and parse frontmatter (not included in ResourceMetadata)
263
+ const content = await fs.readFile(metadata.filePath, 'utf-8');
264
+ const frontmatter = /* parse frontmatter */;
265
+ const resource = { ...metadata, content, frontmatter };
266
+
267
+ // 3. Configure chunking
268
+ const config = {
269
+ targetChunkSize: 512, // Ideal chunk size
270
+ modelTokenLimit: 8191, // Hard limit (embedding model)
271
+ paddingFactor: 0.9, // 90% of target (safety margin)
272
+ tokenCounter: new ApproximateTokenCounter(),
273
+ };
274
+
275
+ // 4. Chunk the resource
276
+ const result = chunkResource(resource, config);
277
+ console.log(`Created ${result.stats.totalChunks} chunks`);
278
+ console.log(`Average tokens: ${result.stats.averageTokens}`);
279
+
280
+ // 5. Generate an embedding for each chunk, then enrich the chunks with them
281
+ const embeddings = await embeddingProvider.embedBatch(
282
+ result.chunks.map(c => c.content)
283
+ );
284
+
285
+ const ragChunks = enrichChunks(
286
+ result.chunks,
287
+ resource,
288
+ embeddings,
289
+ 'text-embedding-3-small'
290
+ );
291
+ ```
292
+
293
+ ### Configuration
294
+
295
+ | Option | Description | Example |
296
+ |--------|-------------|---------|
297
+ | `targetChunkSize` | Ideal chunk size in tokens | 512 |
298
+ | `modelTokenLimit` | Hard limit (embedding model) | 8191 (OpenAI) |
299
+ | `paddingFactor` | Safety margin (0.8-1.0) | 0.9 (ApproximateTokenCounter) |
300
+ | `tokenCounter` | Token counter to use | ApproximateTokenCounter |
301
+ | `minChunkSize` | Minimum chunk size (optional) | 50 |
302
+
303
+ ### Padding Factor Guidelines
304
+
305
+ See [Token Counters](#token-counters) for padding factor recommendations:
306
+ - **FastTokenCounter**: 0.8 (80% of target)
307
+ - **ApproximateTokenCounter**: 0.9 (90% of target)
308
+
309
+ Lower accuracy = lower padding factor (more safety margin)
310
+
311
+ ### Utilities
312
+
313
+ ```typescript
314
+ import {
315
+ chunkByTokens,
316
+ splitByParagraphs,
317
+ splitBySentences,
318
+ generateContentHash,
319
+ generateChunkId,
320
+ calculateEffectiveTarget,
321
+ } from '@vibe-agent-toolkit/rag';
322
+
323
+ // Split text by token count
324
+ const chunks = chunkByTokens('long text...', config);
325
+
326
+ // Split by paragraphs
327
+ const paragraphs = splitByParagraphs(text);
328
+
329
+ // Generate content hash for change detection
330
+ const hash = generateContentHash(content);
331
+
332
+ // Calculate effective target with padding
333
+ const effectiveTarget = calculateEffectiveTarget(512, 0.9); // 460
334
+ ```
335
+
336
+ ## API Reference
337
+
338
+ ### Interfaces
339
+
340
+ #### RAGQueryProvider
341
+
342
+ Read-only provider interface for querying RAG databases.
343
+
344
+ ```typescript
345
+ interface RAGQueryProvider {
346
+ query(query: RAGQuery): Promise<RAGResult>;
347
+ getStats(): Promise<RAGStats>;
348
+ }
349
+ ```
350
+
351
+ #### RAGAdminProvider
352
+
353
+ Read/write provider interface for building and managing RAG databases.
354
+
355
+ ```typescript
356
+ interface RAGAdminProvider extends RAGQueryProvider {
357
+ indexResources(resources: ResourceMetadata[]): Promise<IndexResult>;
358
+ updateResource(resourceId: string): Promise<void>;
359
+ deleteResource(resourceId: string): Promise<void>;
360
+ clear(): Promise<void>;
361
+ close(): Promise<void>;
362
+ }
363
+ ```
364
+
365
+ #### EmbeddingProvider
366
+
367
+ Interface for embedding providers (transformers.js, OpenAI, etc.)
368
+
369
+ ```typescript
370
+ interface EmbeddingProvider {
371
+ name: string;
372
+ model: string;
373
+ dimensions: number;
374
+ embed(text: string): Promise<number[]>;
375
+ embedBatch(texts: string[]): Promise<number[][]>;
376
+ }
377
+ ```
378
+
379
+ #### TokenCounter
380
+
381
+ Interface for token counting implementations.
382
+
383
+ ```typescript
384
+ interface TokenCounter {
385
+ name: string;
386
+ count(text: string): number;
387
+ countBatch(texts: string[]): number[];
388
+ }
389
+ ```
390
+
391
+ ### Schemas
392
+
393
+ All schemas are defined with Zod and exported as both TypeScript types and JSON Schemas.
394
+
395
+ - `RAGChunkSchema` / `RAGChunk` - Structure of a chunk in the RAG database
396
+ - `RAGQuerySchema` / `RAGQuery` - Structure of a query
397
+ - `RAGResultSchema` / `RAGResult` - Structure of query results
398
+ - `RAGStatsSchema` / `RAGStats` - Database statistics
399
+ - `IndexResultSchema` / `IndexResult` - Result from indexing operation
400
+
401
+ ## Architecture
402
+
403
+ **Key principles:**
404
+ - **Interface-first**: Define contracts before implementations
405
+ - **Pluggable components**: All providers are swappable
406
+ - **Read/write separation**: Query providers for agents, admin providers for build tools
407
+ - **Rich metadata**: Enable powerful filtered searches
408
+
409
+ ## Development
410
+
411
+ ```bash
412
+ # Build
413
+ bun run build
414
+
415
+ # Test
416
+ bun test
417
+
418
+ # Type check
419
+ bun run typecheck
420
+ ```
421
+
422
+ ## License
423
+
424
+ MIT
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Token-aware chunking
3
+ *
4
+ * Splits text by token count, respecting paragraph boundaries.
5
+ */
6
+ import type { ChunkingConfig, RawChunk } from './types.js';
7
+ /**
8
+ * Chunk text by token count
9
+ *
10
+ * Returns the text as a single chunk when it fits the effective target size (targetChunkSize * paddingFactor); otherwise packs whole paragraphs into chunks up to that size.
11
+ * Splits only at paragraph boundaries: the implementation has no sentence-level fallback and throws when a single paragraph exceeds modelTokenLimit.
12
+ *
13
+ * @param text - Text to chunk; whitespace-only text yields an empty array
14
+ * @param config - Chunking configuration (targetChunkSize, modelTokenLimit, paddingFactor, tokenCounter)
15
+ * @param metadata - Optional metadata to attach to chunks (copied onto every returned chunk)
16
+ * @returns Array of raw chunks. Throws Error when the text needs splitting and a paragraph's token count exceeds modelTokenLimit.
17
+ */
18
+ export declare function chunkByTokens(text: string, config: ChunkingConfig, metadata?: {
19
+ headingPath?: string;
20
+ headingLevel?: number;
21
+ }): RawChunk[];
22
+ //# sourceMappingURL=chunk-by-tokens.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk-by-tokens.d.ts","sourceRoot":"","sources":["../../src/chunking/chunk-by-tokens.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAG3D;;;;;;;;;;GAUG;AACH,wBAAgB,aAAa,CAC3B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,cAAc,EACtB,QAAQ,CAAC,EAAE;IAAE,WAAW,CAAC,EAAE,MAAM,CAAC;IAAC,YAAY,CAAC,EAAE,MAAM,CAAA;CAAE,GACzD,QAAQ,EAAE,CA4DZ"}
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Token-aware chunking
3
+ *
4
+ * Splits text by token count, respecting paragraph boundaries.
5
+ */
6
+ import { calculateEffectiveTarget, splitByParagraphs } from './utils.js';
7
+ /**
8
+ * Chunk text by token count
9
+ *
10
+ * Returns the text as a single chunk when it fits the effective target size (targetChunkSize * paddingFactor); otherwise packs whole paragraphs into chunks up to that size.
11
+ * Splits only at paragraph boundaries: there is no sentence-level fallback here. A paragraph larger than the effective target but within modelTokenLimit becomes its own oversized chunk; one larger than modelTokenLimit throws.
12
+ *
13
+ * @param text - Text to chunk; whitespace-only text yields an empty array
14
+ * @param config - Chunking configuration (reads targetChunkSize, modelTokenLimit, paddingFactor, tokenCounter)
15
+ * @param metadata - Optional metadata to attach to chunks (spread into every chunk object)
16
+ * @returns Array of raw chunks. Throws Error when the text needs splitting and a paragraph's token count exceeds modelTokenLimit.
17
+ */
18
+ export function chunkByTokens(text, config, metadata) {
19
+ if (text.trim().length === 0) {
20
+ return [];
21
+ }
22
+ const { targetChunkSize, modelTokenLimit, paddingFactor, tokenCounter } = config;
23
+ const effectiveTarget = calculateEffectiveTarget(targetChunkSize, paddingFactor);
24
+ // Fast path: the whole text fits the effective target, so return it as one chunk (content is not trimmed on this path)
25
+ const totalTokens = tokenCounter.count(text);
26
+ if (totalTokens <= effectiveTarget) {
27
+ return [
28
+ {
29
+ content: text,
30
+ ...metadata,
31
+ },
32
+ ];
33
+ }
34
+ // Otherwise split by paragraphs and greedily pack them into chunks
35
+ const paragraphs = splitByParagraphs(text);
36
+ const chunks = [];
37
+ let currentChunk = '';
38
+ let currentTokens = 0;
39
+ for (const paragraph of paragraphs) {
40
+ const paragraphTokens = tokenCounter.count(paragraph);
41
+ // A single paragraph over the hard model limit cannot be emitted as a chunk: fail loudly
42
+ if (paragraphTokens > modelTokenLimit) {
43
+ throw new Error(`Paragraph exceeds model token limit (${paragraphTokens} > ${modelTokenLimit}). ` +
44
+ 'Consider splitting by sentences or reducing content.');
45
+ }
46
+ // Flush the current chunk if adding this paragraph would exceed the target; the currentTokens > 0 guard lets an oversized paragraph start a chunk by itself
47
+ if (currentTokens > 0 && currentTokens + paragraphTokens > effectiveTarget) {
48
+ chunks.push({
49
+ content: currentChunk.trim(),
50
+ ...metadata,
51
+ });
52
+ currentChunk = '';
53
+ currentTokens = 0;
54
+ }
55
+ // Append the paragraph, joined with a blank line; the separator is not included in the running token tally
56
+ currentChunk += (currentChunk.length > 0 ? '\n\n' : '') + paragraph;
57
+ currentTokens += paragraphTokens;
58
+ }
59
+ // Flush whatever remains as the final chunk
60
+ if (currentChunk.trim().length > 0) {
61
+ chunks.push({
62
+ content: currentChunk.trim(),
63
+ ...metadata,
64
+ });
65
+ }
66
+ return chunks;
67
+ }
68
+ //# sourceMappingURL=chunk-by-tokens.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk-by-tokens.js","sourceRoot":"","sources":["../../src/chunking/chunk-by-tokens.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,EAAE,wBAAwB,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAEzE;;;;;;;;;;GAUG;AACH,MAAM,UAAU,aAAa,CAC3B,IAAY,EACZ,MAAsB,EACtB,QAA0D;IAE1D,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,EAAE,eAAe,EAAE,eAAe,EAAE,aAAa,EAAE,YAAY,EAAE,GAAG,MAAM,CAAC;IACjF,MAAM,eAAe,GAAG,wBAAwB,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;IAEjF,4BAA4B;IAC5B,MAAM,WAAW,GAAG,YAAY,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7C,IAAI,WAAW,IAAI,eAAe,EAAE,CAAC;QACnC,OAAO;YACL;gBACE,OAAO,EAAE,IAAI;gBACb,GAAG,QAAQ;aACZ;SACF,CAAC;IACJ,CAAC;IAED,4BAA4B;IAC5B,MAAM,UAAU,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAC3C,MAAM,MAAM,GAAe,EAAE,CAAC;IAC9B,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,eAAe,GAAG,YAAY,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QAEtD,gDAAgD;QAChD,IAAI,eAAe,GAAG,eAAe,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CACb,wCAAwC,eAAe,MAAM,eAAe,KAAK;gBAC/E,sDAAsD,CACzD,CAAC;QACJ,CAAC;QAED,gEAAgE;QAChE,IAAI,aAAa,GAAG,CAAC,IAAI,aAAa,GAAG,eAAe,GAAG,eAAe,EAAE,CAAC;YAC3E,MAAM,CAAC,IAAI,CAAC;gBACV,OAAO,EAAE,YAAY,CAAC,IAAI,EAAE;gBAC5B,GAAG,QAAQ;aACZ,CAAC,CAAC;YACH,YAAY,GAAG,EAAE,CAAC;YAClB,aAAa,GAAG,CAAC,CAAC;QACpB,CAAC;QAED,iCAAiC;QACjC,YAAY,IAAI,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC;QACpE,aAAa,IAAI,eAAe,CAAC;IACnC,CAAC;IAED,kBAAkB;IAClB,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACnC,MAAM,CAAC,IAAI,CAAC;YACV,OAAO,EAAE,YAAY,CAAC,IAAI,EAAE;YAC5B,GAAG,QAAQ;SACZ,CAAC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,46 @@
1
+ /**
2
+ * Resource chunking
3
+ *
4
+ * Chunks ResourceMetadata using hybrid heading-based + token-aware strategy.
5
+ */
6
+ import type { ResourceMetadata } from '@vibe-agent-toolkit/resources';
7
+ import type { RAGChunk } from '../schemas/chunk.js';
8
+ import type { ChunkingConfig, ChunkingResult, RawChunk } from './types.js';
9
+ /**
10
+ * Extended resource metadata for chunking
11
+ *
12
+ * Extends ResourceMetadata with the two fields chunking needs in addition to it: content and frontmatter.
13
+ * Typically obtained by reading the file after getting ResourceMetadata.
14
+ */
15
+ export interface ChunkableResource extends ResourceMetadata {
16
+ /** File content (markdown text), supplied by the caller in addition to the ResourceMetadata fields */
17
+ content: string;
18
+ /** Parsed frontmatter from the file, as an untyped key/value record */
19
+ frontmatter: Record<string, unknown>;
20
+ }
21
+ /**
22
+ * Chunk a resource using hybrid strategy
23
+ *
24
+ * Strategy:
25
+ * 1. Use heading boundaries as primary splits (from ResourceRegistry)
26
+ * 2. For large sections exceeding target size, split by tokens (paragraphs)
27
+ * 3. Link chunks for context expansion (previousChunkId, nextChunkId)
28
+ * NOTE(review): enrichChunks is also documented as adding the links between chunks; confirm which function sets previousChunkId/nextChunkId.
29
+ * @param resource - Chunkable resource (ResourceMetadata plus content and frontmatter)
30
+ * @param config - Chunking configuration
31
+ * @returns Chunking result with raw chunks and statistics
32
+ */
33
+ export declare function chunkResource(resource: ChunkableResource, config: ChunkingConfig): ChunkingResult;
34
+ /**
35
+ * Enrich raw chunks with full RAGChunk metadata
36
+ *
37
+ * Adds resource metadata, embeddings, chunk IDs, and links between chunks.
38
+ *
39
+ * @param rawChunks - Raw chunks from chunkResource
40
+ * @param resource - Source chunkable resource with frontmatter
41
+ * @param embeddings - One embedding vector per chunk; presumably index-aligned with rawChunks (TODO confirm against the implementation)
42
+ * @param embeddingModel - Name of the model used to produce the embeddings
43
+ * @returns Array of complete RAGChunks
44
+ */
45
+ export declare function enrichChunks(rawChunks: RawChunk[], resource: ChunkableResource, embeddings: number[][], embeddingModel: string): RAGChunk[];
46
+ //# sourceMappingURL=chunk-resource.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk-resource.d.ts","sourceRoot":"","sources":["../../src/chunking/chunk-resource.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AAEtE,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAGpD,OAAO,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAG3E;;;;;GAKG;AACH,MAAM,WAAW,iBAAkB,SAAQ,gBAAgB;IACzD,mCAAmC;IACnC,OAAO,EAAE,MAAM,CAAC;IAChB,uCAAuC;IACvC,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACtC;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,EAAE,cAAc,GACrB,cAAc,CAwDhB;AAgCD;;;;;;;;;;GAUG;AACH,wBAAgB,YAAY,CAC1B,SAAS,EAAE,QAAQ,EAAE,EACrB,QAAQ,EAAE,iBAAiB,EAC3B,UAAU,EAAE,MAAM,EAAE,EAAE,EACtB,cAAc,EAAE,MAAM,GACrB,QAAQ,EAAE,CA6BZ"}