voctar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/src/chunking/index.d.ts +48 -0
- package/dist/src/chunking/index.d.ts.map +1 -0
- package/dist/src/chunking/index.js +123 -0
- package/dist/src/chunking/index.js.map +1 -0
- package/dist/src/chunking/strategies/fixed.d.ts +14 -0
- package/dist/src/chunking/strategies/fixed.d.ts.map +1 -0
- package/dist/src/chunking/strategies/fixed.js +111 -0
- package/dist/src/chunking/strategies/fixed.js.map +1 -0
- package/dist/src/chunking/strategies/paragraph.d.ts +6 -0
- package/dist/src/chunking/strategies/paragraph.d.ts.map +1 -0
- package/dist/src/chunking/strategies/paragraph.js +84 -0
- package/dist/src/chunking/strategies/paragraph.js.map +1 -0
- package/dist/src/chunking/strategies/recursive.d.ts +17 -0
- package/dist/src/chunking/strategies/recursive.d.ts.map +1 -0
- package/dist/src/chunking/strategies/recursive.js +192 -0
- package/dist/src/chunking/strategies/recursive.js.map +1 -0
- package/dist/src/chunking/strategies/semantic.d.ts +96 -0
- package/dist/src/chunking/strategies/semantic.d.ts.map +1 -0
- package/dist/src/chunking/strategies/semantic.js +587 -0
- package/dist/src/chunking/strategies/semantic.js.map +1 -0
- package/dist/src/chunking/strategies/sentence.d.ts +7 -0
- package/dist/src/chunking/strategies/sentence.d.ts.map +1 -0
- package/dist/src/chunking/strategies/sentence.js +116 -0
- package/dist/src/chunking/strategies/sentence.js.map +1 -0
- package/dist/src/chunking/types.d.ts +45 -0
- package/dist/src/chunking/types.d.ts.map +1 -0
- package/dist/src/chunking/types.js +4 -0
- package/dist/src/chunking/types.js.map +1 -0
- package/dist/src/chunking/utils/tokenizer.d.ts +10 -0
- package/dist/src/chunking/utils/tokenizer.d.ts.map +1 -0
- package/dist/src/chunking/utils/tokenizer.js +50 -0
- package/dist/src/chunking/utils/tokenizer.js.map +1 -0
- package/dist/src/providers/embeddings/index.d.ts +3 -0
- package/dist/src/providers/embeddings/index.d.ts.map +1 -0
- package/dist/src/providers/embeddings/index.js +7 -0
- package/dist/src/providers/embeddings/index.js.map +1 -0
- package/dist/src/providers/embeddings/openai.d.ts +21 -0
- package/dist/src/providers/embeddings/openai.d.ts.map +1 -0
- package/dist/src/providers/embeddings/openai.js +86 -0
- package/dist/src/providers/embeddings/openai.js.map +1 -0
- package/dist/src/providers/index.d.ts +3 -0
- package/dist/src/providers/index.d.ts.map +1 -0
- package/dist/src/providers/index.js +20 -0
- package/dist/src/providers/index.js.map +1 -0
- package/dist/src/providers/stores/index.d.ts +6 -0
- package/dist/src/providers/stores/index.d.ts.map +1 -0
- package/dist/src/providers/stores/index.js +11 -0
- package/dist/src/providers/stores/index.js.map +1 -0
- package/dist/src/providers/stores/memory.d.ts +18 -0
- package/dist/src/providers/stores/memory.d.ts.map +1 -0
- package/dist/src/providers/stores/memory.js +169 -0
- package/dist/src/providers/stores/memory.js.map +1 -0
- package/dist/src/providers/stores/qdrant.d.ts +28 -0
- package/dist/src/providers/stores/qdrant.d.ts.map +1 -0
- package/dist/src/providers/stores/qdrant.js +223 -0
- package/dist/src/providers/stores/qdrant.js.map +1 -0
- package/dist/src/providers/stores/sqlite.d.ts +38 -0
- package/dist/src/providers/stores/sqlite.d.ts.map +1 -0
- package/dist/src/providers/stores/sqlite.js +306 -0
- package/dist/src/providers/stores/sqlite.js.map +1 -0
- package/dist/src/types.d.ts +111 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +32 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/vector.d.ts +74 -0
- package/dist/src/vector.d.ts.map +1 -0
- package/dist/src/vector.js +505 -0
- package/dist/src/vector.js.map +1 -0
- package/docs/API.md +361 -0
- package/docs/CHUNKING.md +280 -0
- package/docs/CUSTOM_PROVIDERS.md +101 -0
- package/docs/README.md +11 -0
- package/docs/STORAGE_BACKENDS.md +189 -0
- package/docs/assets/vectar.png +0 -0
- package/package.json +46 -0
package/docs/API.md
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
# Voctar API Reference
|
|
2
|
+
|
|
3
|
+
This document covers the full public API exported by `voctar`.
|
|
4
|
+
|
|
5
|
+
## Package Exports
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
import {
|
|
9
|
+
Voctar,
|
|
10
|
+
Vector, // alias of Voctar
|
|
11
|
+
chunking,
|
|
12
|
+
ChunkingService,
|
|
13
|
+
// types
|
|
14
|
+
type VectorConfig,
|
|
15
|
+
type EmbedOptions,
|
|
16
|
+
type SearchOptions,
|
|
17
|
+
type SearchResult,
|
|
18
|
+
type VectorDocument,
|
|
19
|
+
type CollectionConfig,
|
|
20
|
+
type EmbeddingProvider,
|
|
21
|
+
type VectorStoreProvider,
|
|
22
|
+
// errors
|
|
23
|
+
VectorEmbeddingError,
|
|
24
|
+
VectorSearchError,
|
|
25
|
+
VectorStoreError,
|
|
26
|
+
} from 'voctar';
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## `class Voctar` (aka `Vector`)
|
|
30
|
+
|
|
31
|
+
### Constructor
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
new Voctar(config?: VectorConfig)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Creates a Voctar client with embedding and store providers.
|
|
38
|
+
|
|
39
|
+
Notes:
|
|
40
|
+
|
|
41
|
+
- If `store` is omitted, Voctar defaults to SQLite at `./vector.db`.
|
|
42
|
+
- `autoChunk` defaults to `true`.
|
|
43
|
+
- `defaultChunkStrategy` defaults to `'recursive'`.
|
|
44
|
+
- `defaultChunkSize` defaults to `1000`.
|
|
45
|
+
- `defaultChunkOverlap` defaults to `200`.
|
|
46
|
+
|
|
47
|
+
### `embed(collection, text, options?)`
|
|
48
|
+
|
|
49
|
+
```typescript
|
|
50
|
+
embed(
|
|
51
|
+
collection: string,
|
|
52
|
+
text: string,
|
|
53
|
+
options?: EmbedOptions
|
|
54
|
+
): Promise<{ documentId: string; chunkIds: string[] }>
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Embeds one document into a collection.
|
|
58
|
+
|
|
59
|
+
Behavior:
|
|
60
|
+
|
|
61
|
+
- Validates collection name and text.
|
|
62
|
+
- Ensures collection exists.
|
|
63
|
+
- Auto-chunks when needed (based on token limits and chunk settings).
|
|
64
|
+
- If `options.documentId` is provided and already exists, old chunks are removed first.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
|
|
68
|
+
- `documentId`: parent document id
|
|
69
|
+
- `chunkIds`: final stored ids
|
|
70
|
+
|
|
71
|
+
### `embedBatch(collection, documents, user_id?)`
|
|
72
|
+
|
|
73
|
+
```typescript
|
|
74
|
+
embedBatch(
|
|
75
|
+
collection: string,
|
|
76
|
+
documents: VectorDocument[],
|
|
77
|
+
user_id?: string
|
|
78
|
+
): Promise<string[]>
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Embeds multiple documents in one batch.
|
|
82
|
+
|
|
83
|
+
Behavior:
|
|
84
|
+
|
|
85
|
+
- Ensures collection exists.
|
|
86
|
+
- Re-chunks oversized documents when they exceed model token limits.
|
|
87
|
+
- Upserts all vectors to the configured store.
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
|
|
91
|
+
- Array of stored point ids.
|
|
92
|
+
|
|
93
|
+
### `search(collection, query, options?)`
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
search(
|
|
97
|
+
collection: string,
|
|
98
|
+
query: string,
|
|
99
|
+
options?: SearchOptions
|
|
100
|
+
): Promise<SearchResult[]>
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Performs semantic similarity search.
|
|
104
|
+
|
|
105
|
+
Behavior:
|
|
106
|
+
|
|
107
|
+
- Embeds the query.
|
|
108
|
+
- Searches the configured vector store.
|
|
109
|
+
- Returns normalized results with ISO `createdAt`.
|
|
110
|
+
- Includes internal system metadata only when `includeSystem` is `true`.
|
|
111
|
+
|
|
112
|
+
### `upsert(collection, documentId, text, options?)`
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
upsert(
|
|
116
|
+
collection: string,
|
|
117
|
+
documentId: string,
|
|
118
|
+
text: string,
|
|
119
|
+
options?: EmbedOptions
|
|
120
|
+
): Promise<void>
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Replaces a document:
|
|
124
|
+
|
|
125
|
+
1. deletes previous document/chunks by `documentId`,
|
|
126
|
+
2. re-embeds the new text with the same `documentId`.
|
|
127
|
+
|
|
128
|
+
### `delete(collection, documentId | documentIds)`
|
|
129
|
+
|
|
130
|
+
```typescript
|
|
131
|
+
delete(collection: string, documentId: string | string[]): Promise<void>
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Deletes one or many documents and their chunks.
|
|
135
|
+
|
|
136
|
+
Behavior:
|
|
137
|
+
|
|
138
|
+
- Deletes direct ids.
|
|
139
|
+
- Also resolves chunk ids through `system._documentId` filter and deletes them.
|
|
140
|
+
|
|
141
|
+
### `deleteCollection(collection)`
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
deleteCollection(collection: string): Promise<void>
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Deletes an entire collection from the configured store.
|
|
148
|
+
|
|
149
|
+
### `ensureCollection(collection, config?)`
|
|
150
|
+
|
|
151
|
+
```typescript
|
|
152
|
+
ensureCollection(
|
|
153
|
+
collection: string,
|
|
154
|
+
config?: CollectionConfig
|
|
155
|
+
): Promise<void>
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Ensures the collection exists, applying the optional collection configuration when provided.
|
|
159
|
+
|
|
160
|
+
### `getEmbeddingProvider()`
|
|
161
|
+
|
|
162
|
+
```typescript
|
|
163
|
+
getEmbeddingProvider(): EmbeddingProvider
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Returns the active embedding provider instance.
|
|
167
|
+
|
|
168
|
+
### `getVectorStoreProvider()`
|
|
169
|
+
|
|
170
|
+
```typescript
|
|
171
|
+
getVectorStoreProvider(): VectorStoreProvider
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Returns the active vector store provider instance.
|
|
175
|
+
|
|
176
|
+
### Static Helpers
|
|
177
|
+
|
|
178
|
+
#### `Voctar.getChunkId(documentId, chunkIndex)`
|
|
179
|
+
|
|
180
|
+
```typescript
|
|
181
|
+
static getChunkId(documentId: string, chunkIndex: number): string
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Builds a chunk id in the format `documentId#chunkIndex`.
|
|
185
|
+
|
|
186
|
+
#### `Voctar.parseChunkId(chunkId)`
|
|
187
|
+
|
|
188
|
+
```typescript
|
|
189
|
+
static parseChunkId(
|
|
190
|
+
chunkId: string
|
|
191
|
+
): { documentId: string; chunkIndex: number } | null
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Parses a chunk id into its parent document id and chunk index, or returns `null` if the id cannot be parsed as a chunk id.
|
|
195
|
+
|
|
196
|
+
#### `Voctar.isChunkId(id)`
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
static isChunkId(id: string): boolean
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Checks whether an id contains the chunk separator format.
|
|
203
|
+
|
|
204
|
+
## Core Config and Types
|
|
205
|
+
|
|
206
|
+
### `VectorConfig`
|
|
207
|
+
|
|
208
|
+
```typescript
|
|
209
|
+
interface VectorConfig {
|
|
210
|
+
embedding?: RuntimeEmbeddingConfig;
|
|
211
|
+
store?: RuntimeStoreConfig;
|
|
212
|
+
embeddingProvider?: EmbeddingProvider; // deprecated: prefer `embedding: { type: 'custom', provider }`
|
|
213
|
+
vectorStoreProvider?: VectorStoreProvider; // deprecated: prefer `store: { type: 'custom', provider }`
|
|
214
|
+
defaultChunkSize?: number;
|
|
215
|
+
defaultChunkStrategy?: 'fixed' | 'recursive' | 'semantic' | 'sentence' | 'paragraph';
|
|
216
|
+
defaultChunkOverlap?: number;
|
|
217
|
+
autoChunk?: boolean;
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### `EmbedOptions`
|
|
222
|
+
|
|
223
|
+
```typescript
|
|
224
|
+
interface EmbedOptions {
|
|
225
|
+
documentId?: string;
|
|
226
|
+
metadata?: Record<string, any>;
|
|
227
|
+
chunkSize?: number;
|
|
228
|
+
chunkStrategy?: 'fixed' | 'recursive' | 'semantic' | 'sentence' | 'paragraph';
|
|
229
|
+
chunkOverlap?: number;
|
|
230
|
+
autoChunk?: boolean;
|
|
231
|
+
user_id?: string;
|
|
232
|
+
}
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### `SearchOptions`
|
|
236
|
+
|
|
237
|
+
```typescript
|
|
238
|
+
interface SearchOptions {
|
|
239
|
+
limit?: number;
|
|
240
|
+
scoreThreshold?: number;
|
|
241
|
+
filter?: Record<string, any>;
|
|
242
|
+
includeSystem?: boolean;
|
|
243
|
+
}
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
Filter behavior:
|
|
247
|
+
|
|
248
|
+
- keys without `.` are matched under `metadata.*`
|
|
249
|
+
- keys with `.` are used as-is
|
|
250
|
+
- scalar = equality
|
|
251
|
+
- array = OR match for that field
|
|
252
|
+
|
|
253
|
+
### `SearchResult`
|
|
254
|
+
|
|
255
|
+
```typescript
|
|
256
|
+
interface SearchResult {
|
|
257
|
+
id: string;
|
|
258
|
+
text: string;
|
|
259
|
+
score: number;
|
|
260
|
+
createdAt: string; // ISO 8601
|
|
261
|
+
metadata?: Record<string, any>;
|
|
262
|
+
system?: Record<string, any>; // only when includeSystem=true
|
|
263
|
+
}
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### `RuntimeEmbeddingConfig`
|
|
267
|
+
|
|
268
|
+
```typescript
|
|
269
|
+
type RuntimeEmbeddingConfig =
|
|
270
|
+
| {
|
|
271
|
+
type: 'openai';
|
|
272
|
+
apiKey: string;
|
|
273
|
+
model?: string;
|
|
274
|
+
dimension?: number;
|
|
275
|
+
maxRetries?: number;
|
|
276
|
+
}
|
|
277
|
+
| {
|
|
278
|
+
type: 'custom';
|
|
279
|
+
provider: EmbeddingProvider;
|
|
280
|
+
};
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### `RuntimeStoreConfig`
|
|
284
|
+
|
|
285
|
+
```typescript
|
|
286
|
+
type RuntimeStoreConfig =
|
|
287
|
+
| { type: 'sqlite'; path?: string; inMemory?: boolean }
|
|
288
|
+
| { type: 'qdrant'; url: string; port?: number; apiKey?: string; timeout?: number; checkCompatibility?: boolean }
|
|
289
|
+
| { type: 'memory' }
|
|
290
|
+
| { type: 'custom'; provider: VectorStoreProvider };
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### `CollectionConfig`
|
|
294
|
+
|
|
295
|
+
```typescript
|
|
296
|
+
interface CollectionConfig {
|
|
297
|
+
dimension?: number;
|
|
298
|
+
distance?: 'cosine' | 'euclidean' | 'dot';
|
|
299
|
+
}
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### `VectorDocument`
|
|
303
|
+
|
|
304
|
+
```typescript
|
|
305
|
+
interface VectorDocument {
|
|
306
|
+
id: string;
|
|
307
|
+
text: string;
|
|
308
|
+
metadata?: Record<string, any>;
|
|
309
|
+
}
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## Provider Interfaces
|
|
313
|
+
|
|
314
|
+
### `EmbeddingProvider`
|
|
315
|
+
|
|
316
|
+
```typescript
|
|
317
|
+
interface EmbeddingProvider {
|
|
318
|
+
embed(text: string): Promise<number[]>;
|
|
319
|
+
embedBatch(texts: string[]): Promise<number[][]>;
|
|
320
|
+
getDimension(): number;
|
|
321
|
+
getModelName(): string;
|
|
322
|
+
getTokenLimit(): number;
|
|
323
|
+
}
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
### `VectorStoreProvider`
|
|
327
|
+
|
|
328
|
+
```typescript
|
|
329
|
+
interface VectorStoreProvider {
|
|
330
|
+
ensureCollection(name: string, dimension: number, config?: CollectionConfig): Promise<void>;
|
|
331
|
+
upsert(collection: string, points: VectorPoint[]): Promise<void>;
|
|
332
|
+
search(collection: string, vector: number[], options: SearchOptions): Promise<SearchResult[]>;
|
|
333
|
+
delete(collection: string, ids: string[]): Promise<void>;
|
|
334
|
+
deleteCollection(collection: string): Promise<void>;
|
|
335
|
+
getIdsByFilter(collection: string, filter: Record<string, any>, limit?: number): Promise<string[]>;
|
|
336
|
+
}
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
## Error Types
|
|
340
|
+
|
|
341
|
+
### `VectorEmbeddingError`
|
|
342
|
+
|
|
343
|
+
- Raised on embedding/chunking/query-embedding failures.
|
|
344
|
+
- Includes optional `cause` and optional `data`.
|
|
345
|
+
|
|
346
|
+
### `VectorSearchError`
|
|
347
|
+
|
|
348
|
+
- Raised by vector store providers when a search operation fails.
|
|
349
|
+
|
|
350
|
+
### `VectorStoreError`
|
|
351
|
+
|
|
352
|
+
- Raised on store operations (collection creation, delete, upsert, etc.).
|
|
353
|
+
|
|
354
|
+
## Chunking API
|
|
355
|
+
|
|
356
|
+
Voctar also exports chunking utilities:
|
|
357
|
+
|
|
358
|
+
- `chunking` singleton service
|
|
359
|
+
- `ChunkingService` class
|
|
360
|
+
|
|
361
|
+
For full chunking-specific API and behavior, see [`CHUNKING.md`](./CHUNKING.md).
|
package/docs/CHUNKING.md
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
# Chunking Guide
|
|
2
|
+
|
|
3
|
+
This guide is the single source of truth for chunking in Voctar. It covers:
|
|
4
|
+
|
|
5
|
+
- chunk ID format and metadata model,
|
|
6
|
+
- strategy behavior and options,
|
|
7
|
+
- semantic chunking behavior,
|
|
8
|
+
- usage examples.
|
|
9
|
+
|
|
10
|
+
## Chunk Strategies
|
|
11
|
+
|
|
12
|
+
Choose the chunking strategy that best fits the type of content you are embedding:
|
|
13
|
+
|
|
14
|
+
```typescript
|
|
15
|
+
// Code - fixed size
|
|
16
|
+
await vector.embed('code', sourceCode, {
|
|
17
|
+
chunkStrategy: 'fixed',
|
|
18
|
+
chunkSize: 2000,
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
// Articles - recursive (default, splits on natural boundaries)
|
|
22
|
+
await vector.embed('articles', article, {
|
|
23
|
+
chunkStrategy: 'recursive',
|
|
24
|
+
chunkSize: 1000,
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
// Narrative - sentence-based
|
|
28
|
+
await vector.embed('stories', story, {
|
|
29
|
+
chunkStrategy: 'sentence',
|
|
30
|
+
chunkSize: 800,
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
// Structured docs - paragraph-based
|
|
34
|
+
await vector.embed('docs', documentation, {
|
|
35
|
+
chunkStrategy: 'paragraph',
|
|
36
|
+
chunkSize: 1500,
|
|
37
|
+
});
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## How Chunks Are Stored
|
|
41
|
+
|
|
42
|
+
### Chunk ID Format
|
|
43
|
+
|
|
44
|
+
Chunks use a predictable ID format: `documentId#chunkIndex`
|
|
45
|
+
|
|
46
|
+
```typescript
|
|
47
|
+
// Document ID: "user-manual-v2"
|
|
48
|
+
// Chunks get IDs like:
|
|
49
|
+
"user-manual-v2#0" // First chunk
|
|
50
|
+
"user-manual-v2#1" // Second chunk
|
|
51
|
+
"user-manual-v2#2" // Third chunk
|
|
52
|
+
// ... etc
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Benefits
|
|
56
|
+
|
|
57
|
+
1. **Easy Identification**: You can tell which document a chunk belongs to
|
|
58
|
+
2. **Ordered Retrieval**: Chunk index preserves the order
|
|
59
|
+
3. **Simple Deletion**: Delete all chunks by ID pattern
|
|
60
|
+
4. **No External Tracking**: No need for separate mapping tables
|
|
61
|
+
|
|
62
|
+
### Metadata Structure
|
|
63
|
+
|
|
64
|
+
Every chunk stores rich metadata:
|
|
65
|
+
|
|
66
|
+
```typescript
|
|
67
|
+
{
|
|
68
|
+
// User metadata
|
|
69
|
+
title: "User Manual",
|
|
70
|
+
author: "Tech Team",
|
|
71
|
+
version: "2.0",
|
|
72
|
+
|
|
73
|
+
// Chunk position info
|
|
74
|
+
documentId: "user-manual-v2",
|
|
75
|
+
chunkIndex: 0, // 0-based index
|
|
76
|
+
totalChunks: 15, // Total chunks in document
|
|
77
|
+
startChar: 0, // Start position in original text
|
|
78
|
+
endChar: 1000, // End position in original text
|
|
79
|
+
|
|
80
|
+
// System metadata
|
|
81
|
+
_isChunk: true, // Indicates this is a chunk
|
|
82
|
+
_documentId: "user-manual-v2", // Parent document ID
|
|
83
|
+
_chunkId: "user-manual-v2#0", // Full chunk ID
|
|
84
|
+
|
|
85
|
+
// Original text
|
|
86
|
+
text: "Chapter 1: Introduction..."
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Examples
|
|
91
|
+
|
|
92
|
+
### Basic Chunking
|
|
93
|
+
|
|
94
|
+
```typescript
|
|
95
|
+
import { Voctar } from 'voctar';
|
|
96
|
+
|
|
97
|
+
const vector = new Voctar({
|
|
98
|
+
embedding: {
|
|
99
|
+
type: 'openai',
|
|
100
|
+
apiKey: '<your-api-key>',
|
|
101
|
+
},
|
|
102
|
+
store: {
|
|
103
|
+
type: 'sqlite',
|
|
104
|
+
path: 'data/vector.db',
|
|
105
|
+
},
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
const longText = '...10,000 characters...';
|
|
109
|
+
|
|
110
|
+
// Embed with auto-chunking
|
|
111
|
+
const result = await vector.embed('docs', longText, {
|
|
112
|
+
documentId: 'article-123',
|
|
113
|
+
metadata: { author: 'Alice' }
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
console.log(result.documentId); // "article-123"
|
|
117
|
+
console.log(result.chunkIds); // ["article-123#0", "article-123#1", ...]
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Parsing Chunk IDs
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
import { Voctar } from 'voctar';
|
|
124
|
+
|
|
125
|
+
const chunkId = "article-123#5";
|
|
126
|
+
|
|
127
|
+
// Parse chunk ID
|
|
128
|
+
const parsed = Voctar.parseChunkId(chunkId);
|
|
129
|
+
console.log(parsed);
|
|
130
|
+
// { documentId: "article-123", chunkIndex: 5 }
|
|
131
|
+
|
|
132
|
+
// Check if ID is a chunk
|
|
133
|
+
console.log(Voctar.isChunkId(chunkId)); // true
|
|
134
|
+
console.log(Voctar.isChunkId("article-123")); // false
|
|
135
|
+
|
|
136
|
+
// Generate chunk ID
|
|
137
|
+
const id = Voctar.getChunkId("article-123", 5);
|
|
138
|
+
console.log(id); // "article-123#5"
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Search with Chunk Context
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
const results = await vector.search('docs', 'installation steps');
|
|
145
|
+
|
|
146
|
+
results.forEach(result => {
|
|
147
|
+
// Check if result is from a chunked document
|
|
148
|
+
if (Voctar.isChunkId(result.id)) {
|
|
149
|
+
const { documentId, chunkIndex } = Voctar.parseChunkId(result.id)!;
|
|
150
|
+
|
|
151
|
+
console.log(`Found in document: ${documentId}`);
|
|
152
|
+
console.log(`Chunk ${chunkIndex + 1} of ${result.metadata?.totalChunks}`);
|
|
153
|
+
console.log(`Text position: ${result.metadata?.startChar}-${result.metadata?.endChar}`);
|
|
154
|
+
} else {
|
|
155
|
+
console.log(`Non-chunked document: ${result.id}`);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
console.log(result.text);
|
|
159
|
+
});
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Reconstructing Original Document
|
|
163
|
+
|
|
164
|
+
```typescript
|
|
165
|
+
// Search for all chunks of a document
|
|
166
|
+
const results = await vector.search('docs', 'anything', {
|
|
167
|
+
filter: { 'system._documentId': 'article-123' },
|
|
168
|
+
limit: 1000,
|
|
169
|
+
});
|
|
170
|
+
|
|
171
|
+
// Sort chunks by index
|
|
172
|
+
const sortedChunks = results
|
|
173
|
+
.map(r => ({
|
|
174
|
+
...r,
|
|
175
|
+
...Voctar.parseChunkId(r.id)!
|
|
176
|
+
}))
|
|
177
|
+
.sort((a, b) => a.chunkIndex - b.chunkIndex);
|
|
178
|
+
|
|
179
|
+
// Reconstruct (approximate: with chunk overlap enabled, text shared by adjacent chunks appears twice)
|
|
180
|
+
const reconstructed = sortedChunks
|
|
181
|
+
.map(chunk => chunk.text)
|
|
182
|
+
.join('\n\n');
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Deletion
|
|
186
|
+
|
|
187
|
+
```typescript
|
|
188
|
+
// Delete document and all its chunks
|
|
189
|
+
await vector.delete('docs', 'article-123');
|
|
190
|
+
|
|
191
|
+
// This deletes:
|
|
192
|
+
// - article-123 (if it wasn't chunked)
|
|
193
|
+
// - article-123#0, article-123#1, article-123#2, ... (all chunks)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### Upsert (Update)
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
// Original document
|
|
200
|
+
await vector.embed('docs', originalText, {
|
|
201
|
+
documentId: 'article-123',
|
|
202
|
+
metadata: { version: 1 }
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
// Update the entire document
|
|
206
|
+
await vector.upsert('docs', 'article-123', updatedText, {
|
|
207
|
+
metadata: { version: 2, updated: Date.now() }
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
// All old chunks are deleted
|
|
211
|
+
// New chunks are created with same documentId
|
|
212
|
+
// Chunk IDs reset: article-123#0, article-123#1, ...
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Implementation Notes
|
|
216
|
+
|
|
217
|
+
### Max Chunks Per Document
|
|
218
|
+
|
|
219
|
+
Each document is currently limited to 1000 chunks so that all of its chunks can be deleted efficiently.
|
|
220
|
+
|
|
221
|
+
If your documents might exceed this:
|
|
222
|
+
|
|
223
|
+
```typescript
|
|
224
|
+
// Increase chunk size
|
|
225
|
+
await vector.embed('docs', veryLongText, {
|
|
226
|
+
chunkSize: 2000, // Larger chunks = fewer chunks
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
// Or split into multiple documents
|
|
230
|
+
const parts = splitIntoSections(veryLongText);
|
|
231
|
+
for (const [index, part] of parts.entries()) {
|
|
232
|
+
await vector.embed('docs', part, {
|
|
233
|
+
documentId: `article-123-part${index}`,
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### Chunk ID Character Limit
|
|
239
|
+
|
|
240
|
+
Keep document IDs reasonably short (fewer than 100 characters) to avoid hitting ID length limits in vector stores.
|
|
241
|
+
|
|
242
|
+
### Metadata Filtering
|
|
243
|
+
|
|
244
|
+
Filter searches by chunk metadata:
|
|
245
|
+
|
|
246
|
+
```typescript
|
|
247
|
+
// Find only first chunks
|
|
248
|
+
const results = await vector.search('docs', 'query', {
|
|
249
|
+
filter: { chunkIndex: 0 }
|
|
250
|
+
});
|
|
251
|
+
|
|
252
|
+
// Find chunks from specific document
|
|
253
|
+
const documentChunks = await vector.search('docs', 'query', {
|
|
254
|
+
filter: { 'system._documentId': 'article-123' }
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
// Combine filters
|
|
258
|
+
const combinedResults = await vector.search('docs', 'query', {
|
|
259
|
+
filter: {
|
|
260
|
+
'system._documentId': 'article-123',
|
|
261
|
+
author: 'Alice',
|
|
262
|
+
}
|
|
263
|
+
});
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## Best Practices
|
|
267
|
+
|
|
268
|
+
1. **Use meaningful document IDs**: They become part of chunk IDs
|
|
269
|
+
2. **Add rich metadata**: Makes filtering and retrieval easier
|
|
270
|
+
3. **Choose appropriate chunk size**: Balance between context and precision
|
|
271
|
+
4. **Use chunk overlap**: Prevents losing context at boundaries
|
|
272
|
+
5. **Track versions**: Include version info in metadata for updates
|
|
273
|
+
|
|
274
|
+
## Future Improvements
|
|
275
|
+
|
|
276
|
+
- [ ] Metadata-based deletion (query by `_documentId` and delete all matches)
|
|
277
|
+
- [ ] Chunk merging for adjacent results
|
|
278
|
+
- [ ] Automatic chunk retrieval by proximity
|
|
279
|
+
- [ ] Chunk caching for faster document reconstruction
|
|
280
|
+
|