@workglow/dataset 0.0.86

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +1134 -0
  3. package/dist/browser.js +1053 -0
  4. package/dist/browser.js.map +20 -0
  5. package/dist/bun.js +1054 -0
  6. package/dist/bun.js.map +20 -0
  7. package/dist/common-server.d.ts +7 -0
  8. package/dist/common-server.d.ts.map +1 -0
  9. package/dist/common.d.ts +17 -0
  10. package/dist/common.d.ts.map +1 -0
  11. package/dist/document/Document.d.ts +50 -0
  12. package/dist/document/Document.d.ts.map +1 -0
  13. package/dist/document/DocumentDataset.d.ts +79 -0
  14. package/dist/document/DocumentDataset.d.ts.map +1 -0
  15. package/dist/document/DocumentDatasetRegistry.d.ts +29 -0
  16. package/dist/document/DocumentDatasetRegistry.d.ts.map +1 -0
  17. package/dist/document/DocumentNode.d.ts +31 -0
  18. package/dist/document/DocumentNode.d.ts.map +1 -0
  19. package/dist/document/DocumentSchema.d.ts +1668 -0
  20. package/dist/document/DocumentSchema.d.ts.map +1 -0
  21. package/dist/document/DocumentStorageSchema.d.ts +43 -0
  22. package/dist/document/DocumentStorageSchema.d.ts.map +1 -0
  23. package/dist/document/StructuralParser.d.ts +30 -0
  24. package/dist/document/StructuralParser.d.ts.map +1 -0
  25. package/dist/document-chunk/DocumentChunkDataset.d.ts +79 -0
  26. package/dist/document-chunk/DocumentChunkDataset.d.ts.map +1 -0
  27. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +29 -0
  28. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +1 -0
  29. package/dist/document-chunk/DocumentChunkSchema.d.ts +55 -0
  30. package/dist/document-chunk/DocumentChunkSchema.d.ts.map +1 -0
  31. package/dist/node.js +1053 -0
  32. package/dist/node.js.map +20 -0
  33. package/dist/types.d.ts +7 -0
  34. package/dist/types.d.ts.map +1 -0
  35. package/dist/util/DatasetSchema.d.ts +85 -0
  36. package/dist/util/DatasetSchema.d.ts.map +1 -0
  37. package/package.json +54 -0
  38. package/src/document-chunk/README.md +362 -0
@@ -0,0 +1,362 @@
1
+ # Document Chunk Dataset
2
+
3
+ Document-specific schema and utilities for storing document chunk embeddings. Uses the general-purpose vector storage from `@workglow/storage` with a predefined schema for document chunks in RAG (Retrieval-Augmented Generation) pipelines.
4
+
5
+ ## Features
6
+
7
+ - **Predefined Schema**: `DocumentChunkSchema` with fields for chunk_id, doc_id, vector, and metadata
8
+ - **Registry Pattern**: Register and retrieve chunk storage instances globally
9
+ - **Type Safety**: Full TypeScript type definitions for document chunks
10
+ - **Storage Agnostic**: Works with any vector storage backend (InMemory, SQLite, PostgreSQL)
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ bun install @workglow/dataset @workglow/storage
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ### Basic Usage with InMemoryVectorStorage
21
+
22
+ ```typescript
23
+ import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
24
+ import { InMemoryVectorStorage } from "@workglow/storage";
25
+
26
+ // Create storage using the DocumentChunkSchema
27
+ const repo = new InMemoryVectorStorage(
28
+ DocumentChunkSchema,
29
+ DocumentChunkPrimaryKey,
30
+ [], // indexes (optional)
31
+ 384 // vector dimensions
32
+ );
33
+ await repo.setupDatabase();
34
+
35
+ // Store a document chunk with its embedding
36
+ await repo.put({
37
+ chunk_id: "chunk-001",
38
+ doc_id: "doc-001",
39
+ vector: new Float32Array([0.1, 0.2, 0.3 /* ... 384 dims */]),
40
+ metadata: { text: "Hello world", source: "example.txt" },
41
+ });
42
+
43
+ // Search for similar chunks
44
+ const results = await repo.similaritySearch(new Float32Array([0.15, 0.25, 0.35 /* ... */]), {
45
+ topK: 5,
46
+ scoreThreshold: 0.7,
47
+ });
48
+ ```
49
+
50
+ ### Quantized Vectors (Reduced Storage)
51
+
52
+ ```typescript
53
+ import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
54
+ import { InMemoryVectorStorage } from "@workglow/storage";
55
+
56
+ // Use Int8Array for 4x smaller storage (int8 scalar quantization)
57
+ const repo = new InMemoryVectorStorage(
58
+ DocumentChunkSchema,
59
+ DocumentChunkPrimaryKey,
60
+ [],
61
+ 384,
62
+ Int8Array // Specify vector type
63
+ );
64
+ await repo.setupDatabase();
65
+
66
+ // Store quantized vectors
67
+ await repo.put({
68
+ chunk_id: "chunk-001",
69
+ doc_id: "doc-001",
70
+ vector: new Int8Array([127, -128, 64 /* ... */]),
71
+ metadata: { category: "ai" },
72
+ });
73
+
74
+ // Search with quantized query
75
+ const results = await repo.similaritySearch(new Int8Array([100, -50, 75 /* ... */]), { topK: 5 });
76
+ ```
77
+
78
+ ### SQLite Storage (Local Persistence)
79
+
80
+ ```typescript
81
+ import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
82
+ import { SqliteVectorStorage } from "@workglow/storage";
83
+
84
+ const repo = new SqliteVectorStorage(
85
+ "./vectors.db", // database path
86
+ "chunks", // table name
87
+ DocumentChunkSchema,
88
+ DocumentChunkPrimaryKey,
89
+ [], // indexes
90
+ 768 // vector dimension
91
+ );
92
+ await repo.setupDatabase();
93
+
94
+ // Bulk insert using inherited tabular methods
95
+ await repo.putMany([
96
+ { chunk_id: "1", doc_id: "doc1", vector: new Float32Array([...]), metadata: { text: "..." } },
97
+ { chunk_id: "2", doc_id: "doc1", vector: new Float32Array([...]), metadata: { text: "..." } },
98
+ ]);
99
+ ```
100
+
101
+ ### PostgreSQL with pgvector
102
+
103
+ ```typescript
104
+ import { Pool } from "pg";
105
+ import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
106
+ import { PostgresVectorStorage } from "@workglow/storage";
107
+
108
+ const pool = new Pool({ connectionString: "postgresql://..." });
109
+ const repo = new PostgresVectorStorage(
110
+ pool,
111
+ "chunks",
112
+ DocumentChunkSchema,
113
+ DocumentChunkPrimaryKey,
114
+ [],
115
+ 384 // vector dimension
116
+ );
117
+ await repo.setupDatabase();
118
+
119
+ // Native pgvector similarity search with filter
120
+ const results = await repo.similaritySearch(queryVector, {
121
+ topK: 10,
122
+ filter: { category: "ai" },
123
+ scoreThreshold: 0.5,
124
+ });
125
+
126
+ // Hybrid search (vector + full-text)
127
+ const hybridResults = await repo.hybridSearch(queryVector, {
128
+ textQuery: "machine learning",
129
+ topK: 10,
130
+ vectorWeight: 0.7,
131
+ filter: { category: "ai" },
132
+ });
133
+ ```
134
+
135
+ ## Schema Definition
136
+
137
+ ### DocumentChunkSchema
138
+
139
+ The predefined schema for document chunks:
140
+
141
+ ```typescript
142
+ import { TypedArraySchema } from "@workglow/util";
143
+
144
+ export const DocumentChunkSchema = {
145
+ type: "object",
146
+ properties: {
147
+ chunk_id: { type: "string" },
148
+ doc_id: { type: "string" },
149
+ vector: TypedArraySchema(), // Automatically detected as vector column
150
+ metadata: {
151
+ type: "object",
152
+ format: "metadata", // Marked for filtering support
153
+ additionalProperties: true,
154
+ },
155
+ },
156
+ additionalProperties: false,
157
+ } as const;
158
+
159
+ export const DocumentChunkPrimaryKey = ["chunk_id"] as const;
160
+ ```
161
+
162
+ ### DocumentChunk Type
163
+
164
+ TypeScript interface for document chunks:
165
+
166
+ ```typescript
167
+ interface DocumentChunk<
168
+ Metadata extends Record<string, unknown> = Record<string, unknown>,
169
+ Vector extends TypedArray = Float32Array,
170
+ > {
171
+ chunk_id: string; // Unique identifier for the chunk
172
+ doc_id: string; // Parent document identifier
173
+ vector: Vector; // Embedding vector
174
+ metadata: Metadata; // Custom metadata (text content, entities, etc.)
175
+ }
176
+ ```
177
+
178
+ ## API Reference
179
+
180
+ ### IChunkVectorStorage Interface
181
+
182
+ Extends `ITabularStorage` with vector-specific methods:
183
+
184
+ ```typescript
185
+ interface IChunkVectorStorage<Schema, PrimaryKeyNames, Entity> extends ITabularStorage<
186
+ Schema,
187
+ PrimaryKeyNames,
188
+ Entity
189
+ > {
190
+ // Get the vector dimension
191
+ getVectorDimensions(): number;
192
+
193
+ // Vector similarity search
194
+ similaritySearch(
195
+ query: TypedArray,
196
+ options?: VectorSearchOptions
197
+ ): Promise<(Entity & { score: number })[]>;
198
+
199
+ // Hybrid search (optional - not all implementations support it)
200
+ hybridSearch?(
201
+ query: TypedArray,
202
+ options: HybridSearchOptions
203
+ ): Promise<(Entity & { score: number })[]>;
204
+ }
205
+ ```
206
+
207
+ ### Inherited Tabular Methods
208
+
209
+ From `ITabularStorage`:
210
+
211
+ ```typescript
212
+ // Setup
213
+ setupDatabase(): Promise<void>;
214
+
215
+ // CRUD Operations
216
+ put(entity: Entity): Promise<void>;
217
+ putMany(entities: Entity[]): Promise<void>;
218
+ get(key: PrimaryKey): Promise<Entity | undefined>;
219
+ getAll(): Promise<Entity[] | undefined>;
220
+ delete(key: PrimaryKey): Promise<void>;
221
+ deleteMany(keys: PrimaryKey[]): Promise<void>;
222
+
223
+ // Utility
224
+ size(): Promise<number>;
225
+ clear(): Promise<void>;
226
+ destroy(): void;
227
+ ```
228
+
229
+ ### Search Options
230
+
231
+ ```typescript
232
+ interface VectorSearchOptions<Metadata = Record<string, unknown>> {
233
+ readonly topK?: number; // Number of results (default: 10)
234
+ readonly filter?: Partial<Metadata>; // Filter by metadata fields
235
+ readonly scoreThreshold?: number; // Minimum score 0-1 (default: 0)
236
+ }
237
+
238
+ interface HybridSearchOptions<Metadata> extends VectorSearchOptions<Metadata> {
239
+ readonly textQuery: string; // Full-text query keywords
240
+ readonly vectorWeight?: number; // Vector weight 0-1 (default: 0.7)
241
+ }
242
+ ```
243
+
244
+ ## Global Registry
245
+
246
+ Register and retrieve chunk vector storage instances globally:
247
+
248
+ ```typescript
249
+ import {
250
+ DocumentChunkSchema,
251
+ DocumentChunkPrimaryKey,
252
+ registerChunkVectorRepository,
253
+ getDocumentChunkDataset,
254
+ getGlobalDocumentChunkDataset,
255
+ } from "@workglow/dataset";
256
+ import { InMemoryVectorStorage } from "@workglow/storage";
257
+
258
+ // Create and register a storage instance
259
+ const repo = new InMemoryVectorStorage(DocumentChunkSchema, DocumentChunkPrimaryKey, [], 384);
260
+ await repo.setupDatabase();
261
+
262
+ registerChunkVectorRepository("my-chunks", repo);
263
+
264
+ // Retrieve by ID
265
+ const retrievedRepo = getDocumentChunkDataset("my-chunks");
266
+
267
+ // Get all registered storage instances
268
+ const allRepos = getGlobalDocumentChunkDataset();
269
+ ```
270
+
271
+ ## Quantization Benefits
272
+
273
+ Quantized vectors reduce storage and can improve performance:
274
+
275
+ | Vector Type | Bytes/Dim | Storage vs Float32 | Use Case |
276
+ | ------------ | --------- | ------------------ | ------------------------------------ |
277
+ | Float32Array | 4 | 100% (baseline) | Standard embeddings |
278
+ | Float64Array | 8 | 200% | High precision needed |
279
+ | Float16Array | 2 | 50% | Great precision/size tradeoff |
280
+ | Int16Array | 2 | 50% | Good precision/size tradeoff |
281
+ | Int8Array    | 1         | 25%                | Int8 scalar quantization, max compression |
282
+ | Uint8Array | 1 | 25% | Quantized embeddings [0-255] |
283
+
284
+ **Example:** A 768-dimensional embedding:
285
+
286
+ - Float32: 3,072 bytes
287
+ - Int8: 768 bytes (75% reduction!)
288
+
289
+ ## Performance Considerations
290
+
291
+ ### InMemory
292
+
293
+ - **Best for:** Testing, small datasets (<10K vectors), development
294
+ - **Pros:** Fastest, no dependencies, supports all vector types
295
+ - **Cons:** No persistence, memory limited
296
+
297
+ ### SQLite
298
+
299
+ - **Best for:** Local apps, medium datasets (<100K vectors)
300
+ - **Pros:** Persistent, single file, no server
301
+ - **Cons:** No native vector indexing (linear scan), slower for large datasets
302
+
303
+ ### PostgreSQL + pgvector
304
+
305
+ - **Best for:** Production, large datasets (>100K vectors)
306
+ - **Pros:** Native HNSW/IVFFlat indexing, efficient similarity search, scalable
307
+ - **Cons:** Requires PostgreSQL server and pgvector extension
308
+ - **Setup:** `CREATE EXTENSION vector;`
309
+
310
+ ## Integration with DocumentDataset
311
+
312
+ Document chunk storage works alongside `DocumentDataset` for hierarchical document management:
313
+
314
+ ```typescript
315
+ import {
316
+ DocumentDataset,
317
+ DocumentStorageSchema,
318
+ DocumentChunkSchema,
319
+ DocumentChunkPrimaryKey,
320
+ } from "@workglow/dataset";
321
+ import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
322
+
323
+ // Initialize storage backends
324
+ const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]);
325
+ await tabularStorage.setupDatabase();
326
+
327
+ const vectorStorage = new InMemoryVectorStorage(
328
+ DocumentChunkSchema,
329
+ DocumentChunkPrimaryKey,
330
+ [],
331
+ 384
332
+ );
333
+ await vectorStorage.setupDatabase();
334
+
335
+ // Create document dataset with both storages
336
+ const docDataset = new DocumentDataset(tabularStorage, vectorStorage);
337
+
338
+ // Store document structure in tabular, chunks in vector
339
+ await docDataset.upsert(document);
340
+
341
+ // Search chunks by vector similarity
342
+ const results = await docDataset.search(queryVector, { topK: 5 });
343
+ ```
344
+
345
+ ### Chunk Metadata for Hierarchical Documents
346
+
347
+ When using hierarchical chunking, chunk metadata typically includes:
348
+
349
+ ```typescript
350
+ metadata: {
351
+ text: string; // Chunk text content
352
+ leafNodeId?: string; // Reference to document tree node
353
+ depth?: number; // Hierarchy depth
354
+ nodePath?: string[]; // Node IDs from root to leaf
355
+ summary?: string; // Summary of the chunk content
356
+ entities?: Entity[]; // Named entities extracted from the chunk
357
+ }
358
+ ```
359
+
360
+ ## License
361
+
362
+ Apache 2.0