@workglow/dataset 0.0.86
- package/LICENSE +201 -0
- package/README.md +1134 -0
- package/dist/browser.js +1053 -0
- package/dist/browser.js.map +20 -0
- package/dist/bun.js +1054 -0
- package/dist/bun.js.map +20 -0
- package/dist/common-server.d.ts +7 -0
- package/dist/common-server.d.ts.map +1 -0
- package/dist/common.d.ts +17 -0
- package/dist/common.d.ts.map +1 -0
- package/dist/document/Document.d.ts +50 -0
- package/dist/document/Document.d.ts.map +1 -0
- package/dist/document/DocumentDataset.d.ts +79 -0
- package/dist/document/DocumentDataset.d.ts.map +1 -0
- package/dist/document/DocumentDatasetRegistry.d.ts +29 -0
- package/dist/document/DocumentDatasetRegistry.d.ts.map +1 -0
- package/dist/document/DocumentNode.d.ts +31 -0
- package/dist/document/DocumentNode.d.ts.map +1 -0
- package/dist/document/DocumentSchema.d.ts +1668 -0
- package/dist/document/DocumentSchema.d.ts.map +1 -0
- package/dist/document/DocumentStorageSchema.d.ts +43 -0
- package/dist/document/DocumentStorageSchema.d.ts.map +1 -0
- package/dist/document/StructuralParser.d.ts +30 -0
- package/dist/document/StructuralParser.d.ts.map +1 -0
- package/dist/document-chunk/DocumentChunkDataset.d.ts +79 -0
- package/dist/document-chunk/DocumentChunkDataset.d.ts.map +1 -0
- package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +29 -0
- package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +1 -0
- package/dist/document-chunk/DocumentChunkSchema.d.ts +55 -0
- package/dist/document-chunk/DocumentChunkSchema.d.ts.map +1 -0
- package/dist/node.js +1053 -0
- package/dist/node.js.map +20 -0
- package/dist/types.d.ts +7 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/util/DatasetSchema.d.ts +85 -0
- package/dist/util/DatasetSchema.d.ts.map +1 -0
- package/package.json +54 -0
- package/src/document-chunk/README.md +362 -0

@@ -0,0 +1,362 @@

# Document Chunk Dataset

Document-specific schema and utilities for storing document chunk embeddings. Uses the general-purpose vector storage from `@workglow/storage` with a predefined schema for document chunks in RAG (Retrieval-Augmented Generation) pipelines.

## Features

- **Predefined Schema**: `DocumentChunkSchema` with fields for chunk_id, doc_id, vector, and metadata
- **Registry Pattern**: Register and retrieve chunk storage instances globally
- **Type Safety**: Full TypeScript type definitions for document chunks
- **Storage Agnostic**: Works with any vector storage backend (InMemory, SQLite, PostgreSQL)

## Installation

```bash
bun install @workglow/dataset @workglow/storage
```

## Usage

### Basic Usage with InMemoryVectorStorage

```typescript
import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
import { InMemoryVectorStorage } from "@workglow/storage";

// Create storage using the DocumentChunkSchema
const repo = new InMemoryVectorStorage(
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
  [], // indexes (optional)
  384 // vector dimensions
);
await repo.setupDatabase();

// Store a document chunk with its embedding
await repo.put({
  chunk_id: "chunk-001",
  doc_id: "doc-001",
  vector: new Float32Array([0.1, 0.2, 0.3 /* ... 384 dims */]),
  metadata: { text: "Hello world", source: "example.txt" },
});

// Search for similar chunks
const results = await repo.similaritySearch(new Float32Array([0.15, 0.25, 0.35 /* ... */]), {
  topK: 5,
  scoreThreshold: 0.7,
});
```

### Quantized Vectors (Reduced Storage)

```typescript
import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
import { InMemoryVectorStorage } from "@workglow/storage";

// Use Int8Array for 4x smaller storage (int8 scalar quantization)
const repo = new InMemoryVectorStorage(
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
  [],
  384,
  Int8Array // Specify vector type
);
await repo.setupDatabase();

// Store quantized vectors
await repo.put({
  chunk_id: "chunk-001",
  doc_id: "doc-001",
  vector: new Int8Array([127, -128, 64 /* ... */]),
  metadata: { category: "ai" },
});

// Search with quantized query
const results = await repo.similaritySearch(new Int8Array([100, -50, 75 /* ... */]), { topK: 5 });
```

### SQLite Storage (Local Persistence)

```typescript
import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
import { SqliteVectorStorage } from "@workglow/storage";

const repo = new SqliteVectorStorage(
  "./vectors.db", // database path
  "chunks", // table name
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
  [], // indexes
  768 // vector dimensions
);
await repo.setupDatabase();

// Bulk insert using inherited tabular methods
await repo.putMany([
  { chunk_id: "1", doc_id: "doc1", vector: new Float32Array([/* ... */]), metadata: { text: "..." } },
  { chunk_id: "2", doc_id: "doc1", vector: new Float32Array([/* ... */]), metadata: { text: "..." } },
]);
```

### PostgreSQL with pgvector

```typescript
import { Pool } from "pg";
import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
import { PostgresVectorStorage } from "@workglow/storage";

const pool = new Pool({ connectionString: "postgresql://..." });
const repo = new PostgresVectorStorage(
  pool,
  "chunks",
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
  [],
  384 // vector dimension
);
await repo.setupDatabase();

// Native pgvector similarity search with filter
const results = await repo.similaritySearch(queryVector, {
  topK: 10,
  filter: { category: "ai" },
  scoreThreshold: 0.5,
});

// Hybrid search (vector + full-text)
const hybridResults = await repo.hybridSearch(queryVector, {
  textQuery: "machine learning",
  topK: 10,
  vectorWeight: 0.7,
  filter: { category: "ai" },
});
```

## Schema Definition

### DocumentChunkSchema

The predefined schema for document chunks:

```typescript
import { TypedArraySchema } from "@workglow/util";

export const DocumentChunkSchema = {
  type: "object",
  properties: {
    chunk_id: { type: "string" },
    doc_id: { type: "string" },
    vector: TypedArraySchema(), // Automatically detected as vector column
    metadata: {
      type: "object",
      format: "metadata", // Marked for filtering support
      additionalProperties: true,
    },
  },
  additionalProperties: false,
} as const;

export const DocumentChunkPrimaryKey = ["chunk_id"] as const;
```

### DocumentChunk Type

TypeScript interface for document chunks:

```typescript
interface DocumentChunk<
  Metadata extends Record<string, unknown> = Record<string, unknown>,
  Vector extends TypedArray = Float32Array,
> {
  chunk_id: string; // Unique identifier for the chunk
  doc_id: string; // Parent document identifier
  vector: Vector; // Embedding vector
  metadata: Metadata; // Custom metadata (text content, entities, etc.)
}
```
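
Because `DocumentChunk` is generic over its metadata and vector types, you can pin both for your pipeline. A minimal sketch, assuming the `DocumentChunk` type is exported from `@workglow/dataset` (the `RagMetadata` shape is illustrative):

```typescript
import { DocumentChunk } from "@workglow/dataset"; // assumed export

// Illustrative metadata shape for a RAG pipeline.
interface RagMetadata extends Record<string, unknown> {
  text: string;
  source: string;
}

// Float32Array embeddings with a known metadata shape.
type RagChunk = DocumentChunk<RagMetadata, Float32Array>;

const chunk: RagChunk = {
  chunk_id: "chunk-001",
  doc_id: "doc-001",
  vector: new Float32Array(384),
  metadata: { text: "Hello world", source: "example.txt" },
};
```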

## API Reference

### IChunkVectorStorage Interface

Extends `ITabularStorage` with vector-specific methods:

```typescript
interface IChunkVectorStorage<Schema, PrimaryKeyNames, Entity>
  extends ITabularStorage<Schema, PrimaryKeyNames, Entity> {
  // Get the vector dimension
  getVectorDimensions(): number;

  // Vector similarity search
  similaritySearch(
    query: TypedArray,
    options?: VectorSearchOptions
  ): Promise<(Entity & { score: number })[]>;

  // Hybrid search (optional - not all implementations support it)
  hybridSearch?(
    query: TypedArray,
    options: HybridSearchOptions
  ): Promise<(Entity & { score: number })[]>;
}
```

### Inherited Tabular Methods

From `ITabularStorage`:

```typescript
// Setup
setupDatabase(): Promise<void>;

// CRUD Operations
put(entity: Entity): Promise<void>;
putMany(entities: Entity[]): Promise<void>;
get(key: PrimaryKey): Promise<Entity | undefined>;
getAll(): Promise<Entity[] | undefined>;
delete(key: PrimaryKey): Promise<void>;
deleteMany(keys: PrimaryKey[]): Promise<void>;

// Utility
size(): Promise<number>;
clear(): Promise<void>;
destroy(): void;
```
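
Keys follow the primary-key definition, so with `DocumentChunkPrimaryKey = ["chunk_id"]` a key carries that one field. A short sketch of the CRUD surface, assuming keys are passed as plain objects (continuing the `repo` from the examples above):

```typescript
// Fetch a chunk by its primary key.
const chunk = await repo.get({ chunk_id: "chunk-001" });
console.log(chunk?.metadata);

// Count and delete.
console.log(await repo.size()); // number of stored chunks
await repo.delete({ chunk_id: "chunk-001" });
await repo.clear(); // remove all chunks
```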

### Search Options

```typescript
interface VectorSearchOptions<Metadata = Record<string, unknown>> {
  readonly topK?: number; // Number of results (default: 10)
  readonly filter?: Partial<Metadata>; // Filter by metadata fields
  readonly scoreThreshold?: number; // Minimum score 0-1 (default: 0)
}

interface HybridSearchOptions<Metadata> extends VectorSearchOptions<Metadata> {
  readonly textQuery: string; // Full-text query keywords
  readonly vectorWeight?: number; // Vector weight 0-1 (default: 0.7)
}
```
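
For intuition, a `vectorWeight` between 0 and 1 is conventionally a linear blend of the two signals. A minimal sketch of that convention (an assumption about the scoring, not necessarily how `hybridSearch` computes it internally):

```typescript
// Hypothetical illustration of blending vector and full-text scores.
function blendScores(vectorScore: number, textScore: number, vectorWeight = 0.7): number {
  return vectorWeight * vectorScore + (1 - vectorWeight) * textScore;
}
```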

## Global Registry

Register and retrieve chunk vector storage instances globally:

```typescript
import {
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
  registerChunkVectorRepository,
  getDocumentChunkDataset,
  getGlobalDocumentChunkDataset,
} from "@workglow/dataset";
import { InMemoryVectorStorage } from "@workglow/storage";

// Create and register a storage instance
const repo = new InMemoryVectorStorage(DocumentChunkSchema, DocumentChunkPrimaryKey, [], 384);
await repo.setupDatabase();

registerChunkVectorRepository("my-chunks", repo);

// Retrieve by ID
const retrievedRepo = getDocumentChunkDataset("my-chunks");

// Get all registered storage instances
const allRepos = getGlobalDocumentChunkDataset();
```

## Quantization Benefits

Quantized vectors reduce storage and can improve performance:

| Vector Type  | Bytes/Dim | Storage vs Float32 | Use Case                                  |
| ------------ | --------- | ------------------ | ----------------------------------------- |
| Float32Array | 4         | 100% (baseline)    | Standard embeddings                        |
| Float64Array | 8         | 200%               | High precision needed                      |
| Float16Array | 2         | 50%                | Great precision/size tradeoff              |
| Int16Array   | 2         | 50%                | Good precision/size tradeoff               |
| Int8Array    | 1         | 25%                | Int8 scalar quantization, max compression  |
| Uint8Array   | 1         | 25%                | Quantized embeddings [0-255]               |

**Example:** A 768-dimensional embedding:

- Float32: 3,072 bytes
- Int8: 768 bytes (75% reduction!)
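
Quantization itself is left to the caller. A minimal sketch of symmetric int8 scalar quantization, under the assumption that you quantize stored vectors and queries the same way (`quantizeToInt8` is a hypothetical helper, not part of `@workglow/dataset`):

```typescript
// Map each component from [-maxAbs, maxAbs] onto [-127, 127].
// Hypothetical helper, not an API of @workglow/dataset or @workglow/storage.
function quantizeToInt8(vector: Float32Array): Int8Array {
  let maxAbs = 0;
  for (const v of vector) maxAbs = Math.max(maxAbs, Math.abs(v));
  const scale = maxAbs === 0 ? 1 : 127 / maxAbs;
  const out = new Int8Array(vector.length);
  for (let i = 0; i < vector.length; i++) {
    out[i] = Math.round(vector[i] * scale);
  }
  return out;
}

// Usage: quantize once, then store via repo.put({ ..., vector: quantized }).
const quantized = quantizeToInt8(new Float32Array([0.1, -0.8, 0.3]));
```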

## Performance Considerations

### InMemory

- **Best for:** Testing, small datasets (<10K vectors), development
- **Pros:** Fastest, no dependencies, supports all vector types
- **Cons:** No persistence, memory limited

### SQLite

- **Best for:** Local apps, medium datasets (<100K vectors)
- **Pros:** Persistent, single file, no server
- **Cons:** No native vector indexing (linear scan), slower for large datasets

### PostgreSQL + pgvector

- **Best for:** Production, large datasets (>100K vectors)
- **Pros:** Native HNSW/IVFFlat indexing, efficient similarity search, scalable
- **Cons:** Requires PostgreSQL server and pgvector extension
- **Setup:** `CREATE EXTENSION vector;`

## Integration with DocumentDataset

Document chunk storage works alongside `DocumentDataset` for hierarchical document management:

```typescript
import {
  DocumentDataset,
  DocumentStorageSchema,
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
} from "@workglow/dataset";
import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";

// Initialize storage backends
const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]);
await tabularStorage.setupDatabase();

const vectorStorage = new InMemoryVectorStorage(
  DocumentChunkSchema,
  DocumentChunkPrimaryKey,
  [],
  384
);
await vectorStorage.setupDatabase();

// Create document dataset with both storages
const docDataset = new DocumentDataset(tabularStorage, vectorStorage);

// Store document structure in tabular, chunks in vector
await docDataset.upsert(document);

// Search chunks by vector similarity
const results = await docDataset.search(queryVector, { topK: 5 });
```

### Chunk Metadata for Hierarchical Documents

When using hierarchical chunking, chunk metadata typically includes:

```typescript
metadata: {
  text: string; // Chunk text content
  leafNodeId?: string; // Reference to document tree node
  depth?: number; // Hierarchy depth
  nodePath?: string[]; // Node IDs from root to leaf
  summary?: string; // Summary of the chunk content
  entities?: Entity[]; // Named entities extracted from the chunk
}
```
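
Because the schema's `metadata` field allows additional properties, these fields can be written directly on each chunk. A short sketch with illustrative values (reusing the `repo` from earlier examples):

```typescript
await repo.put({
  chunk_id: "chunk-042",
  doc_id: "doc-001",
  vector: new Float32Array(384),
  metadata: {
    text: "Section body text...",
    leafNodeId: "node-7",
    depth: 2,
    nodePath: ["root", "node-3", "node-7"],
  },
});
```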

## License

Apache 2.0