@workglow/dataset 0.0.110 → 0.0.114

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +504 -968
  2. package/dist/browser.js +376 -490
  3. package/dist/browser.js.map +13 -13
  4. package/dist/bun.js +376 -490
  5. package/dist/bun.js.map +13 -13
  6. package/dist/chunk/ChunkSchema.d.ts +206 -0
  7. package/dist/chunk/ChunkSchema.d.ts.map +1 -0
  8. package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
  9. package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
  10. package/dist/common.d.ts +5 -5
  11. package/dist/common.d.ts.map +1 -1
  12. package/dist/document/Document.d.ts +7 -6
  13. package/dist/document/Document.d.ts.map +1 -1
  14. package/dist/document/DocumentSchema.d.ts +0 -465
  15. package/dist/document/DocumentSchema.d.ts.map +1 -1
  16. package/dist/knowledge-base/KnowledgeBase.d.ts +122 -0
  17. package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
  18. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +24 -0
  19. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
  20. package/dist/knowledge-base/createKnowledgeBase.d.ts +28 -0
  21. package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
  22. package/dist/node.js +376 -490
  23. package/dist/node.js.map +13 -13
  24. package/dist/util/DatasetSchema.d.ts +9 -49
  25. package/dist/util/DatasetSchema.d.ts.map +1 -1
  26. package/package.json +5 -5
  27. package/dist/document/DocumentDataset.d.ts +0 -79
  28. package/dist/document/DocumentDataset.d.ts.map +0 -1
  29. package/dist/document/DocumentDatasetRegistry.d.ts +0 -29
  30. package/dist/document/DocumentDatasetRegistry.d.ts.map +0 -1
  31. package/dist/document-chunk/DocumentChunkDataset.d.ts +0 -79
  32. package/dist/document-chunk/DocumentChunkDataset.d.ts.map +0 -1
  33. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +0 -29
  34. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +0 -1
  35. package/dist/document-chunk/DocumentChunkSchema.d.ts +0 -56
  36. package/dist/document-chunk/DocumentChunkSchema.d.ts.map +0 -1
  37. package/src/document-chunk/README.md +0 -362
package/README.md CHANGED
@@ -1,1132 +1,668 @@
1
- # @workglow/storage
1
+ # @workglow/dataset
2
2
 
3
- Modular storage solutions for Workglow.AI platform with multiple backend implementations. Provides consistent interfaces for key-value storage, tabular data storage, and job queue persistence.
3
+ Document management, hierarchical chunking, and knowledge base infrastructure for RAG pipelines.
4
4
 
5
- - [Quick Start](#quick-start)
5
+ - [Overview](#overview)
6
6
  - [Installation](#installation)
7
- - [Core Concepts](#core-concepts)
8
- - [Type Safety](#type-safety)
9
- - [Environment Compatibility](#environment-compatibility)
10
- - [Import Patterns](#import-patterns)
11
- - [Storage Types](#storage-types)
12
- - [Key-Value Storage](#key-value-storage)
13
- - [Basic Usage](#basic-usage)
14
- - [Environment-Specific Examples](#environment-specific-examples)
15
- - [Bulk Operations](#bulk-operations)
16
- - [Event Handling](#event-handling)
17
- - [Tabular Storage](#tabular-storage)
18
- - [Schema Definition](#schema-definition)
19
- - [CRUD Operations](#crud-operations)
20
- - [Bulk Operations](#bulk-operations-1)
21
- - [Searching and Filtering](#searching-and-filtering)
22
- - [Environment-Specific Tabular Storage](#environment-specific-tabular-storage)
23
- - [Queue Storage](#queue-storage)
24
- - [Basic Job Queue Operations](#basic-job-queue-operations)
25
- - [Job Management](#job-management)
26
- - [Environment-Specific Usage](#environment-specific-usage)
27
- - [Browser Environment](#browser-environment)
28
- - [Node.js Environment](#nodejs-environment)
29
- - [Bun Environment](#bun-environment)
30
- - [Advanced Features](#advanced-features)
31
- - [Repository Registry](#repository-registry)
32
- - [Event-Driven Architecture](#event-driven-architecture)
33
- - [Compound Primary Keys](#compound-primary-keys)
34
- - [Custom File Layout (KV on filesystem)](#custom-file-layout-kv-on-filesystem)
7
+ - [Quick Start](#quick-start)
8
+ - [Architecture](#architecture)
9
+ - [Documents](#documents)
10
+ - [Document Tree Structure](#document-tree-structure)
11
+ - [Parsing](#parsing)
12
+ - [Node Enrichment](#node-enrichment)
13
+ - [Chunks](#chunks)
14
+ - [ChunkRecord](#chunkrecord)
15
+ - [Chunk Vector Storage](#chunk-vector-storage)
16
+ - [KnowledgeBase](#knowledgebase)
17
+ - [Creating a KnowledgeBase](#creating-a-knowledgebase)
18
+ - [Document CRUD](#document-crud)
19
+ - [Chunk Operations](#chunk-operations)
20
+ - [Search](#search)
21
+ - [Tree Traversal](#tree-traversal)
22
+ - [Lifecycle Management](#lifecycle-management)
23
+ - [Registry](#registry)
24
+ - [Data Flow](#data-flow)
25
+ - [Ingestion Pipeline](#ingestion-pipeline)
26
+ - [Retrieval Pipeline](#retrieval-pipeline)
35
27
  - [API Reference](#api-reference)
36
- - [IKvStorage\<Key, Value\>](#ikvrepositorykey-value)
37
- - [ITabularStorage\<Schema, PrimaryKeyNames\>](#itabularrepositoryschema-primarykeynames)
38
- - [IQueueStorage\<Input, Output\>](#iqueuestorageinput-output)
39
- - [Examples](#examples)
40
- - [User Management System](#user-management-system)
41
- - [Configuration Management](#configuration-management)
42
- - [Testing](#testing)
43
- - [Writing Tests for Your Storage Usage](#writing-tests-for-your-storage-usage)
28
+ - [Document](#document)
29
+ - [KnowledgeBase](#knowledgebase-1)
30
+ - [createKnowledgeBase](#createknowledgebase)
31
+ - [StructuralParser](#structuralparser)
32
+ - [Type Helpers](#type-helpers)
44
33
  - [License](#license)
45
34
 
46
- ## Quick Start
47
-
48
- ```typescript
49
- // Key-Value Storage (simple data)
50
- import { InMemoryKvStorage } from "@workglow/storage";
35
+ ## Overview
51
36
 
52
- const kvStore = new InMemoryKvStorage<string, { name: string; age: number }>();
53
- await kvStore.put("user:123", { name: "Alice", age: 30 });
54
- const kvUser = await kvStore.get("user:123"); // { name: "Alice", age: 30 }
55
- ```
37
+ This package provides the data layer for RAG (Retrieval-Augmented Generation) workflows. It ties together three concerns:
56
38
 
57
- ```typescript
58
- // Tabular Storage (structured data with schemas)
59
- import { InMemoryTabularStorage } from "@workglow/storage";
60
- import { JsonSchema } from "@workglow/util";
61
-
62
- const userSchema = {
63
- type: "object",
64
- properties: {
65
- id: { type: "string" },
66
- name: { type: "string" },
67
- email: { type: "string" },
68
- age: { type: "number" },
69
- },
70
- required: ["id", "name", "email", "age"],
71
- additionalProperties: false,
72
- } as const satisfies JsonSchema;
73
-
74
- const userRepo = new InMemoryTabularStorage<typeof userSchema, ["id"]>(
75
- userSchema,
76
- ["id"], // primary key
77
- ["email"] // additional indexes
78
- );
39
+ 1. **Documents** — hierarchical tree representation of parsed text (sections, paragraphs, sentences)
40
+ 2. **Chunks** — flat records derived from the document tree, each tracking its position via node paths
41
+ 3. **KnowledgeBase** — unified interface that owns both document storage (tabular) and chunk storage (vector), with cascading lifecycle management
79
42
 
80
- await userRepo.put({ id: "123", name: "Alice", email: "alice@example.com", age: 30 });
81
- const user = await userRepo.get({ id: "123" });
43
+ ```
44
+ Markdown / Plain Text
45
+
46
+
47
+ ┌──────────────┐
48
+ │ Document │ Hierarchical tree (sections, paragraphs)
49
+ │ (tabular) │ Stored as serialized JSON
50
+ └──────┬───────┘
51
+ │ chunking
52
+
53
+ ┌──────────────┐
54
+ │ Chunks │ Flat records with tree linkage (nodePath, depth)
55
+ │ (vector) │ Stored with embedding vectors
56
+ └──────┬───────┘
57
+ │ search
58
+
59
+ ┌──────────────┐
60
+ │ Results │ Ranked by similarity score
61
+ └──────────────┘
82
62
  ```
83
63
 
84
64
  ## Installation
85
65
 
86
66
  ```bash
87
- # Using bun (recommended)
88
- bun install @workglow/storage
89
-
90
- # Using npm
91
- npm install @workglow/storage
92
-
93
- # Using yarn
94
- yarn add @workglow/storage
67
+ bun install @workglow/dataset
95
68
  ```
96
69
 
97
- ## Core Concepts
98
-
99
- ### Type Safety
70
+ Peer dependencies: `@workglow/storage`, `@workglow/util`.
100
71
 
101
- All storage implementations are fully typed using TypeScript and JSON Schema for runtime validation:
72
+ ## Quick Start
102
73
 
103
74
  ```typescript
104
- import { JsonSchema, FromSchema } from "@workglow/util";
75
+ import {
76
+ createKnowledgeBase,
77
+ Document,
78
+ StructuralParser,
79
+ } from "@workglow/dataset";
80
+
81
+ // 1. Create a knowledge base
82
+ const kb = await createKnowledgeBase({
83
+ name: "my-kb",
84
+ vectorDimensions: 384,
85
+ });
105
86
 
106
- // Define your data structure
107
- const ProductSchema = {
108
- type: "object",
109
- properties: {
110
- id: { type: "string" },
111
- name: { type: "string" },
112
- price: { type: "number" },
113
- category: { type: "string" },
114
- inStock: { type: "boolean" },
87
+ // 2. Parse a document
88
+ const root = await StructuralParser.parseMarkdown("doc1", markdown, "My Doc");
89
+ const doc = new Document(root, { title: "My Doc" });
90
+
91
+ // 3. Store the document
92
+ const inserted = await kb.upsertDocument(doc);
93
+
94
+ // 4. Store chunk embeddings
95
+ await kb.upsertChunk({
96
+ doc_id: inserted.doc_id!,
97
+ vector: new Float32Array([0.1, 0.2, ...]),
98
+ metadata: {
99
+ chunkId: "chunk_1",
100
+ doc_id: inserted.doc_id!,
101
+ text: "The chunk text...",
102
+ nodePath: [root.nodeId, sectionNodeId],
103
+ depth: 2,
115
104
  },
116
- required: ["id", "name", "price", "category", "inStock"],
117
- additionalProperties: false,
118
- } as const satisfies JsonSchema;
119
-
120
- // TypeScript automatically infers:
121
- // Entity = FromSchema<typeof ProductSchema>
122
- // PrimaryKey = { id: string }
123
- ```
124
-
125
- ### Environment Compatibility
126
-
127
- | Storage Type | Node.js | Bun | Browser | Persistence |
128
- | ------------ | ------- | --- | ------- | ----------- |
129
- | InMemory | ✅ | ✅ | ✅ | ❌ |
130
- | IndexedDB | ❌ | ❌ | ✅ | ✅ |
131
- | SQLite | ✅ | ✅ | ❌ | ✅ |
132
- | PostgreSQL | ✅ | ✅ | ❌ | ✅ |
133
- | Supabase | ✅ | ✅ | ✅ | ✅ |
134
- | FileSystem | ✅ | ✅ | ❌ | ✅ |
135
-
136
- ### Import Patterns
105
+ });
137
106
 
138
- The package uses conditional exports, so importing from `@workglow/storage` automatically selects the right build for your runtime (browser, Node.js, or Bun).
107
+ // 5. Search (via task pipeline)
108
+ import { Workflow } from "@workglow/task-graph";
139
109
 
140
- ```typescript
141
- // Import from the top-level package; it resolves to the correct target per environment
142
- import { InMemoryKvStorage, SqliteTabularStorage } from "@workglow/storage";
110
+ const result = await new Workflow()
111
+ .chunkRetrieval({
112
+ knowledgeBase: "my-kb",
113
+ query: "your search query",
114
+ model: "your-embedding-model",
115
+ topK: 5,
116
+ })
117
+ .run();
143
118
  ```
144
119
 
145
- ## Storage Types
146
-
147
- ### Key-Value Storage
148
-
149
- Simple key-value storage for unstructured or semi-structured data.
150
-
151
- #### Basic Usage
152
-
153
- ```typescript
154
- import { InMemoryKvStorage, FsFolderJsonKvRepository } from "@workglow/storage";
120
+ ## Architecture
155
121
 
156
- // In-memory (for testing/caching)
157
- const cache = new InMemoryKvStorage<string, any>();
158
- await cache.put("config", { theme: "dark", language: "en" });
122
+ The package is organized around three layers:
159
123
 
160
- // File-based JSON (persistent)
161
- const settings = new FsFolderJsonKvRepository("./data/settings");
162
- await settings.put("user:preferences", { notifications: true });
163
124
  ```
164
-
165
- #### Environment-Specific Examples
166
-
167
- ```typescript
168
- // Browser (using IndexedDB)
169
- import { IndexedDbKvRepository } from "@workglow/storage";
170
- const browserStore = new IndexedDbKvRepository("my-app-storage");
171
-
172
- // Node.js/Bun (using SQLite)
173
- import { SqliteKvRepository } from "@workglow/storage";
174
- // Pass a file path or a Database instance (see @workglow/sqlite)
175
- const sqliteStore = new SqliteKvRepository("./data.db", "config_table");
176
-
177
- // PostgreSQL (Node.js/Bun)
178
- import { PostgresKvRepository } from "@workglow/storage";
179
- import { Pool } from "pg";
180
- const pool = new Pool({ connectionString: "postgresql://..." });
181
- const pgStore = new PostgresKvRepository(pool, "settings");
182
-
183
- // Supabase (Node.js/Bun)
184
- import { SupabaseKvRepository } from "@workglow/storage";
185
- import { createClient } from "@supabase/supabase-js";
186
- const supabase = createClient("https://your-project.supabase.co", "your-anon-key");
187
- const supabaseStore = new SupabaseKvRepository(supabase, "settings");
125
+ ┌───────────────────────────────────────────────────┐
126
+ │ KnowledgeBase │
127
+ │ Unified API for documents + chunks + search │
128
+ ├────────────────────┬──────────────────────────────┤
129
+ │ DocumentTabular │ ChunkVector │
130
+ │ Storage │ Storage │
131
+ │ (ITabularStorage) │ (IVectorStorage) │
132
+ │ │ │
133
+ │ Stores serialized │ Stores embeddings + │
134
+ │ document trees │ ChunkRecord metadata │
135
+ └────────────────────┴──────────────────────────────┘
188
136
  ```
189
137
 
190
- #### Bulk Operations
138
+ **Storage backends** are pluggable via the `@workglow/storage` interfaces. The `createKnowledgeBase` factory defaults to in-memory storage; production deployments can use SQLite, PostgreSQL, or any other `ITabularStorage` / `IVectorStorage` implementation.
191
139
 
192
- ```typescript
193
- const store = new InMemoryKvStorage<string, { name: string; score: number }>();
140
+ ## Documents
194
141
 
195
- // Bulk insert
196
- await store.putBulk([
197
- { key: "player1", value: { name: "Alice", score: 100 } },
198
- { key: "player2", value: { name: "Bob", score: 85 } },
199
- ]);
142
+ ### Document Tree Structure
200
143
 
201
- // Get all data
202
- const allPlayers = await store.getAll();
203
- // Result: [{ key: "player1", value: { name: "Alice", score: 100 } }, ...]
144
+ Documents are represented as a hierarchical tree using a **discriminated union** of node types:
204
145
 
205
- // Get size
206
- const count = await store.size(); // 2
207
146
  ```
208
-
209
- #### Event Handling
210
-
211
- ```typescript
212
- const store = new InMemoryKvStorage<string, any>();
213
-
214
- // Listen to storage events
215
- store.on("put", (key, value) => {
216
- console.log(`Stored: ${key} = ${JSON.stringify(value)}`);
217
- });
218
-
219
- store.on("get", (key, value) => {
220
- console.log(`Retrieved: ${key} = ${value ? "found" : "not found"}`);
221
- });
222
-
223
- await store.put("test", { data: "example" }); // Triggers 'put' event
224
- await store.get("test"); // Triggers 'get' event
147
+ DocumentRootNode (kind: "document")
148
+ ├── SectionNode (kind: "section", level: 1)
149
+ │ ├── ParagraphNode (kind: "paragraph")
150
+ │ ├── ParagraphNode (kind: "paragraph")
151
+ │ └── SectionNode (kind: "section", level: 2)
152
+ │ └── ParagraphNode (kind: "paragraph")
153
+ ├── SectionNode (kind: "section", level: 1)
154
+ │ └── ParagraphNode (kind: "paragraph")
155
+ └── ParagraphNode (kind: "paragraph")
225
156
  ```
226
157
 
227
- ### Tabular Storage
158
+ **Node types:**
228
159
 
229
- Structured storage with schemas, primary keys, and indexing for complex data relationships.
160
+ | Type | Kind | Has Children | Description |
161
+ | ------------------ | ------------- | ------------ | ------------------------------------------------- |
162
+ | `DocumentRootNode` | `"document"` | yes | Root of the tree, has `title` |
163
+ | `SectionNode` | `"section"` | yes | From headers (level 1-6), has `title` and `level` |
164
+ | `ParagraphNode` | `"paragraph"` | no | Prose content |
165
+ | `SentenceNode` | `"sentence"` | no | Fine-grained segmentation |
166
+ | `TopicNode` | `"topic"` | yes | From topic segmentation algorithms |
230
167
 
231
- #### Schema Definition
168
+ All nodes share a base set of fields:
232
169
 
233
170
  ```typescript
234
- import { JsonSchema } from "@workglow/util";
235
- import { InMemoryTabularStorage } from "@workglow/storage";
236
-
237
- // Define your entity schema
238
- const UserSchema = {
239
- type: "object",
240
- properties: {
241
- id: { type: "string" },
242
- email: { type: "string" },
243
- name: { type: "string" },
244
- age: { type: "number" },
245
- department: { type: "string" },
246
- createdAt: { type: "string" },
247
- },
248
- required: ["id", "email", "name", "age", "department", "createdAt"],
249
- additionalProperties: false,
250
- } as const satisfies JsonSchema;
251
-
252
- // Create repository with primary key and indexes
253
- const userRepo = new InMemoryTabularStorage<typeof UserSchema, ["id"]>(
254
- UserSchema,
255
- ["id"], // Primary key (can be compound: ["dept", "id"])
256
- ["email", "department", ["department", "age"]] // Indexes for fast lookups
257
- );
171
+ interface DocumentNodeBase {
172
+ readonly nodeId: string; // Unique identifier (UUID)
173
+ readonly kind: NodeKind; // Discriminator
174
+ readonly range: NodeRange; // { startOffset, endOffset } in source text
175
+ readonly text: string; // Text content
176
+ readonly enrichment?: NodeEnrichment; // Optional summary, entities, keywords
177
+ }
258
178
  ```
259
179
 
260
- #### CRUD Operations
180
+ Use the `NodeKind` constants for comparisons:
261
181
 
262
182
  ```typescript
263
- // Create
264
- await userRepo.put({
265
- id: "user_123",
266
- email: "alice@company.com",
267
- name: "Alice Johnson",
268
- age: 28,
269
- department: "Engineering",
270
- createdAt: new Date().toISOString(),
271
- });
272
-
273
- // Read by primary key
274
- const user = await userRepo.get({ id: "user_123" });
275
-
276
- // Update (put with same primary key)
277
- await userRepo.put({
278
- ...user!,
279
- age: 29, // Birthday!
280
- });
183
+ import { NodeKind } from "@workglow/dataset";
281
184
 
282
- // Delete
283
- await userRepo.delete({ id: "user_123" });
185
+ if (node.kind === NodeKind.SECTION) {
186
+ console.log(node.title, node.level, node.children.length);
187
+ }
284
188
  ```
285
189
 
286
- #### Bulk Operations
190
+ ### Parsing
191
+
192
+ `StructuralParser` converts raw text into a `DocumentRootNode`:
287
193
 
288
194
  ```typescript
289
- // Bulk insert
290
- await userRepo.putBulk([
291
- {
292
- id: "1",
293
- email: "alice@co.com",
294
- name: "Alice",
295
- age: 28,
296
- department: "Engineering",
297
- createdAt: "2024-01-01",
298
- },
299
- {
300
- id: "2",
301
- email: "bob@co.com",
302
- name: "Bob",
303
- age: 32,
304
- department: "Sales",
305
- createdAt: "2024-01-02",
306
- },
307
- {
308
- id: "3",
309
- email: "carol@co.com",
310
- name: "Carol",
311
- age: 26,
312
- department: "Engineering",
313
- createdAt: "2024-01-03",
314
- },
315
- ]);
195
+ import { StructuralParser } from "@workglow/dataset";
316
196
 
317
- // Get all records
318
- const allUsers = await userRepo.getAll();
197
+ // Markdown detects headers, creates nested sections
198
+ const root = await StructuralParser.parseMarkdown(docId, markdownText, "Title");
319
199
 
320
- // Get repository size
321
- const userCount = await userRepo.size();
322
- ```
200
+ // Plain text — splits on blank lines into paragraphs
201
+ const root = await StructuralParser.parsePlainText(docId, plainText, "Title");
323
202
 
324
- #### Searching and Filtering
325
-
326
- ```typescript
327
- // Search by partial match (uses indexes when available)
328
- const engineeringUsers = await userRepo.search({ department: "Engineering" });
329
- const adultUsers = await userRepo.search({ age: 25 }); // Exact match
330
-
331
- // Delete by search criteria (supports multiple columns)
332
- await userRepo.deleteSearch({ department: "Sales" }); // Equality
333
- await userRepo.deleteSearch({ age: { value: 65, operator: ">=" } }); // Delete users 65 and older
334
-
335
- // Multiple criteria (AND logic)
336
- await userRepo.deleteSearch({
337
- department: "Sales",
338
- age: { value: 30, operator: "<" },
339
- }); // Delete young Sales employees
203
+ // Auto-detect format
204
+ const root = await StructuralParser.parse(docId, text, "Title");
340
205
  ```
341
206
 
342
- #### Environment-Specific Tabular Storage
207
+ The parser:
343
208
 
344
- ```typescript
345
- // SQLite (Node.js/Bun)
346
- import { SqliteTabularStorage } from "@workglow/storage";
347
-
348
- const sqliteUsers = new SqliteTabularStorage<typeof UserSchema, ["id"]>(
349
- "./users.db",
350
- "users",
351
- UserSchema,
352
- ["id"],
353
- ["email"]
354
- );
209
+ - Converts markdown headers (`#` through `######`) into nested `SectionNode`s
210
+ - Groups text between headers as `ParagraphNode` children
211
+ - Tracks character offsets (`startOffset`, `endOffset`) for every node
212
+ - Assigns a unique `nodeId` to each node
355
213
 
356
- // PostgreSQL (Node.js/Bun)
357
- import { PostgresTabularStorage } from "@workglow/storage";
358
- import { Pool } from "pg";
359
-
360
- const pool = new Pool({ connectionString: "postgresql://..." });
361
- const pgUsers = new PostgresTabularStorage<typeof UserSchema, ["id"]>(
362
- pool,
363
- "users",
364
- UserSchema,
365
- ["id"],
366
- ["email"]
367
- );
214
+ ### Node Enrichment
368
215
 
369
- // Supabase (Node.js/Bun)
370
- import { SupabaseTabularStorage } from "@workglow/storage";
371
- import { createClient } from "@supabase/supabase-js";
372
-
373
- const supabase = createClient("https://your-project.supabase.co", "your-anon-key");
374
- const supabaseUsers = new SupabaseTabularStorage<typeof UserSchema, ["id"]>(
375
- supabase,
376
- "users",
377
- UserSchema,
378
- ["id"],
379
- ["email"]
380
- );
381
-
382
- // IndexedDB (Browser)
383
- import { IndexedDbTabularStorage } from "@workglow/storage";
384
- const browserUsers = new IndexedDbTabularStorage<typeof UserSchema, ["id"]>(
385
- "users",
386
- UserSchema,
387
- ["id"],
388
- ["email"]
389
- );
216
+ Nodes can carry optional enrichment data populated by AI tasks:
390
217
 
391
- // File-based (Node.js/Bun)
392
- import { FsFolderTabularStorage } from "@workglow/storage";
393
- const fileUsers = new FsFolderTabularStorage<typeof UserSchema, ["id"]>(
394
- "./data/users",
395
- UserSchema,
396
- ["id"],
397
- ["email"]
398
- );
218
+ ```typescript
219
+ interface NodeEnrichment {
220
+ summary?: string; // AI-generated summary
221
+ entities?: Entity[]; // Named entities (text, type, confidence score)
222
+ keywords?: string[]; // Extracted keywords
223
+ }
399
224
  ```
400
225
 
401
- ### Queue Storage
226
+ Enrichment is set on nodes during the ingestion pipeline (e.g., by `DocumentEnricherTask`) and propagated to chunks during hierarchy join.
402
227
 
403
- Persistent job queue storage for background processing and task management.
228
+ ## Chunks
404
229
 
405
- > **Note**: Queue storage is primarily used internally by the job queue system. Direct usage is for advanced scenarios.
230
+ ### ChunkRecord
406
231
 
407
- #### Basic Job Queue Operations
232
+ A `ChunkRecord` is a flat, self-contained unit of text with full context about its position in the document tree:
408
233
 
409
234
  ```typescript
410
- import { InMemoryQueueStorage, JobStatus } from "@workglow/storage";
235
+ interface ChunkRecord {
236
+ // Identity
237
+ chunkId: string; // Unique chunk identifier
238
+ doc_id: string; // Parent document ID
411
239
 
412
- // Define job input/output types
413
- type ProcessingInput = { text: string; options: any };
414
- type ProcessingOutput = { result: string; metadata: any };
240
+ // Content
241
+ text: string; // The text to embed and search
415
242
 
416
- const jobQueue = new InMemoryQueueStorage<ProcessingInput, ProcessingOutput>();
243
+ // Tree linkage
244
+ nodePath: string[]; // Node IDs from root to leaf
245
+ depth: number; // Depth in the document tree
417
246
 
418
- // Add job to queue
419
- const jobId = await jobQueue.add({
420
- input: { text: "Hello world", options: { uppercase: true } },
421
- run_after: null, // Run immediately
422
- max_retries: 3,
423
- });
424
-
425
- // Get next job for processing
426
- const job = await jobQueue.next();
427
- if (job) {
428
- // Process the job...
429
- const result = { result: "HELLO WORLD", metadata: { processed: true } };
430
-
431
- // Mark as complete
432
- await jobQueue.complete({
433
- ...job,
434
- output: result,
435
- status: JobStatus.COMPLETED,
436
- });
247
+ // Optional fields
248
+ leafNodeId?: string; // Leaf node this chunk belongs to
249
+ summary?: string; // Chunk-level summary
250
+ entities?: Entity[]; // Named entities
251
+ parentSummaries?: string[]; // Summaries from ancestor nodes
252
+ sectionTitles?: string[]; // Titles of ancestor sections
253
+ doc_title?: string; // Document title
437
254
  }
438
255
  ```
439
256
 
440
- #### Job Management
441
-
442
- ```typescript
443
- // Check queue status
444
- const pendingCount = await jobQueue.size(JobStatus.PENDING);
445
- const processingCount = await jobQueue.size(JobStatus.PROCESSING);
446
-
447
- // Peek at jobs without removing them
448
- const nextJobs = await jobQueue.peek(JobStatus.PENDING, 5);
257
+ The `nodePath` and `depth` fields enable **hierarchy-aware retrieval**: given a search result, you can walk back up the document tree to get section titles, parent summaries, or sibling chunks for additional context.
449
258
 
450
- // Progress tracking
451
- await jobQueue.saveProgress(jobId, 50, "Processing...", { step: 1 });
259
+ ### Chunk Vector Storage
452
260
 
453
- // Handle job failures
454
- await jobQueue.abort(jobId);
455
-
456
- // Cleanup old completed jobs
457
- await jobQueue.deleteJobsByStatusAndAge(JobStatus.COMPLETED, 24 * 60 * 60 * 1000); // 24 hours
458
- ```
459
-
460
- ## Environment-Specific Usage
461
-
462
- ### Browser Environment
261
+ Chunks are stored in vector storage as `ChunkVectorEntity`:
463
262
 
464
263
  ```typescript
465
- import {
466
- IndexedDbKvRepository,
467
- IndexedDbTabularStorage,
468
- IndexedDbQueueStorage,
469
- SupabaseKvRepository,
470
- SupabaseTabularStorage,
471
- SupabaseQueueStorage,
472
- } from "@workglow/storage";
473
- import { createClient } from "@supabase/supabase-js";
474
-
475
- // Local browser storage with IndexedDB
476
- const settings = new IndexedDbKvRepository("app-settings");
477
- const userData = new IndexedDbTabularStorage("users", UserSchema, ["id"]);
478
- const jobQueue = new IndexedDbQueueStorage<any, any>("background-jobs");
479
-
480
- // Or use Supabase for cloud storage from the browser
481
- const supabase = createClient("https://your-project.supabase.co", "your-anon-key");
482
- const cloudSettings = new SupabaseKvRepository(supabase, "app-settings");
483
- const cloudUserData = new SupabaseTabularStorage(supabase, "users", UserSchema, ["id"]);
484
- const cloudJobQueue = new SupabaseQueueStorage(supabase, "background-jobs");
264
+ interface ChunkVectorEntity {
265
+ chunk_id: string; // Primary key (auto-generated UUID)
266
+ doc_id: string; // For filtering by document
267
+ vector: TypedArray; // Embedding (Float32Array, etc.)
268
+ metadata: ChunkRecord; // Full chunk record
269
+ }
485
270
  ```
486
271
 
487
- ### Node.js Environment
272
+ The `metadata` field holds the complete `ChunkRecord`, so search results carry all the context needed for hierarchy-aware retrieval without additional lookups.
488
273
 
489
- ```typescript
490
- import {
491
- SqliteKvRepository,
492
- PostgresTabularStorage,
493
- FsFolderJsonKvRepository,
494
- } from "@workglow/storage";
495
-
496
- // Mix and match storage backends
497
- const cache = new FsFolderJsonKvRepository("./cache");
498
- const users = new PostgresTabularStorage(pool, "users", UserSchema, ["id"]);
499
- ```
274
+ ## KnowledgeBase
500
275
 
501
- ### Bun Environment
276
+ `KnowledgeBase` is the central class that ties document storage and vector storage together.
502
277
 
503
- ```typescript
504
- // Bun has access to all implementations
505
- import {
506
- SqliteTabularStorage,
507
- FsFolderJsonKvRepository,
508
- PostgresQueueStorage,
509
- SupabaseTabularStorage,
510
- } from "@workglow/storage";
278
+ ### Creating a KnowledgeBase
511
279
 
512
- import { Database } from "bun:sqlite";
513
- import { createClient } from "@supabase/supabase-js";
280
+ **Factory function (recommended):**
514
281
 
515
- const db = new Database("./app.db");
516
- const data = new SqliteTabularStorage(db, "items", ItemSchema, ["id"]);
282
+ ```typescript
283
+ import { createKnowledgeBase } from "@workglow/dataset";
517
284
 
518
- // Or use Supabase for cloud storage
519
- const supabase = createClient("https://your-project.supabase.co", "your-anon-key");
520
- const cloudData = new SupabaseTabularStorage(supabase, "items", ItemSchema, ["id"]);
285
+ const kb = await createKnowledgeBase({
286
+ name: "my-kb", // Identifier
287
+ vectorDimensions: 384, // Must match your embedding model
288
+ backend: "in-memory", // Currently only "in-memory"
289
+ vectorType: Float32Array, // Default: Float32Array
290
+ register: true, // Register globally (default: true)
291
+ });
521
292
  ```
522
293
 
523
- ## Advanced Features
294
+ **Direct construction (custom storage backends):**
524
295
 
525
- ### Repository Registry
296
+ ```typescript
297
+ import { KnowledgeBase } from "@workglow/dataset";
526
298
 
527
- Repositories can be registered globally by ID, allowing tasks to reference them by name rather than passing direct instances. This is useful for configuring repositories once at application startup and referencing them throughout your task graphs.
299
+ const kb = new KnowledgeBase(
300
+ "my-kb",
301
+ myDocumentTabularStorage, // ITabularStorage implementation
302
+ myChunkVectorStorage // IVectorStorage implementation
303
+ );
304
+ ```
528
305
 
529
- #### Registering Repositories
306
+ ### Document CRUD
530
307
 
531
308
  ```typescript
532
- import {
533
- registerTabularStorage,
534
- getTabularStorage,
535
- InMemoryTabularStorage,
536
- } from "@workglow/storage";
309
+ // Upsert — auto-generates doc_id if not set
310
+ const doc = new Document(root, { title: "My Document" });
311
+ const inserted = await kb.upsertDocument(doc);
312
+ console.log(inserted.doc_id); // auto-generated UUID
537
313
 
538
- // Define your schema
539
- const userSchema = {
540
- type: "object",
541
- properties: {
542
- id: { type: "string" },
543
- name: { type: "string" },
544
- email: { type: "string" },
545
- },
546
- required: ["id", "name", "email"],
547
- additionalProperties: false,
548
- } as const;
314
+ // Get by ID
315
+ const retrieved = await kb.getDocument(inserted.doc_id!);
549
316
 
550
- // Create and register a repository
551
- const userRepo = new InMemoryTabularStorage(userSchema, ["id"] as const);
552
- registerTabularStorage("users", userRepo);
317
+ // List all document IDs
318
+ const docIds = await kb.listDocuments();
553
319
 
554
- // Later, retrieve the repository by ID
555
- const repo = getTabularStorage("users");
320
+ // Delete cascades to all chunks in vector storage
321
+ await kb.deleteDocument(inserted.doc_id!);
556
322
  ```
557
323
 
558
- #### Using Repositories in Tasks
559
-
560
- When using repositories with tasks, you can pass either the repository ID or a direct instance. The TaskRunner automatically resolves string IDs using the registry.
324
+ ### Chunk Operations
561
325
 
562
326
  ```typescript
563
- import { TypeTabularStorage } from "@workglow/storage";
564
-
565
- // In your task's input schema, use TypeTabularStorage
566
- static inputSchema() {
567
- return {
568
- type: "object",
569
- properties: {
570
- dataSource: TypeTabularStorage({
571
- title: "User Repository",
572
- description: "Repository containing user records",
573
- }),
574
- },
575
- required: ["dataSource"],
576
- };
577
- }
578
-
579
- // Both approaches work:
580
- await task.run({ dataSource: "users" }); // Resolved from registry
581
- await task.run({ dataSource: userRepoInstance }); // Direct instance
582
- ```
583
-
584
- #### Schema Helper Functions
327
+ // Upsert a single chunk
328
+ const entity = await kb.upsertChunk({
329
+ doc_id: "doc1",
330
+ vector: new Float32Array([0.1, 0.2, 0.3]),
331
+ metadata: {
332
+ chunkId: "chunk_1",
333
+ doc_id: "doc1",
334
+ text: "Some text...",
335
+ nodePath: ["root", "section1"],
336
+ depth: 2,
337
+ },
338
+ });
585
339
 
586
- The package provides schema helper functions for defining repository inputs with proper format annotations:
340
+ // Upsert in bulk
341
+ const entities = await kb.upsertChunksBulk(chunkArray);
587
342
 
588
- ```typescript
589
- import {
590
- TypeTabularStorage,
591
- TypeVectorRepository,
592
- TypeDocumentRepository,
593
- } from "@workglow/storage";
594
-
595
- // Tabular repository (format: "storage:tabular")
596
- const tabularSchema = TypeTabularStorage({
597
- title: "Data Source",
598
- description: "Tabular data storage",
599
- });
343
+ // Get a specific chunk
344
+ const chunk = await kb.getChunk("chunk_id_here");
600
345
 
601
- // Vector repository (format: "dataset:document-chunk")
602
- const vectorSchema = TypeVectorRepository({
603
- title: "Embeddings Store",
604
- description: "Vector embeddings dataset",
605
- });
346
+ // Get all chunks for a document
347
+ const docChunks = await kb.getChunksForDocument("doc1");
606
348
 
607
- // Document repository (format: "dataset:document")
608
- const docSchema = TypeDocumentRepository({
609
- title: "Document Store",
610
- description: "Document storage dataset",
611
- });
349
+ // Delete all chunks for a document (without deleting the document)
350
+ await kb.deleteChunksForDocument("doc1");
612
351
  ```
613
352
 
614
- ### Event-Driven Architecture
353
+ ### Search
615
354
 
616
- All storage implementations support event emission for monitoring and reactive programming:
355
+ **Similarity search** (vector-only):
617
356
 
618
357
  ```typescript
619
- const store = new InMemoryTabularStorage(UserSchema, ["id"]);
620
-
621
- // Monitor all operations
622
- store.on("put", (entity) => console.log("User created/updated:", entity));
623
- store.on("delete", (key) => console.log("User deleted:", key));
624
- store.on("get", (key, entity) => console.log("User accessed:", entity ? "found" : "not found"));
358
+ const results = await kb.similaritySearch(queryVector, {
359
+ topK: 10, // Max results (default varies by backend)
360
+ scoreThreshold: 0.7, // Minimum similarity score
361
+ filter: { doc_id: "doc1" }, // Metadata filter
362
+ });
625
363
 
626
- // Wait for specific events
627
- const [entity] = await store.waitOn("put"); // Waits for next put operation
364
+ // Each result: ChunkVectorEntity & { score: number }
365
+ for (const result of results) {
366
+ console.log(result.chunk_id, result.score, result.metadata.text);
367
+ }
628
368
  ```
629
369
 
630
- ### Compound Primary Keys
370
+ **Hybrid search** combines vector similarity with full-text search. Requires a storage backend that supports it (e.g., PostgreSQL with pgvector). Returns an empty array if unsupported.
631
371
 
632
372
  ```typescript
633
- import { JsonSchema } from "@workglow/util";
634
-
635
- const OrderLineSchema = {
636
- type: "object",
637
- properties: {
638
- orderId: { type: "string" },
639
- lineNumber: { type: "number" },
640
- productId: { type: "string" },
641
- quantity: { type: "number" },
642
- price: { type: "number" },
643
- },
644
- required: ["orderId", "lineNumber", "productId", "quantity", "price"],
645
- additionalProperties: false,
646
- } as const satisfies JsonSchema;
647
-
648
- const orderLines = new InMemoryTabularStorage<typeof OrderLineSchema, ["orderId", "lineNumber"]>(
649
- OrderLineSchema,
650
- ["orderId", "lineNumber"], // Compound primary key
651
- ["productId"] // Additional index
652
- );
653
-
654
- // Use compound keys
655
- await orderLines.put({
656
- orderId: "ORD-123",
657
- lineNumber: 1,
658
- productId: "PROD-A",
659
- quantity: 2,
660
- price: 19.99,
373
+ const results = await kb.hybridSearch(queryVector, {
374
+ textQuery: "machine learning",
375
+ topK: 10,
376
+ vectorWeight: 0.7, // 0-1, balance between vector and text
377
+ scoreThreshold: 0.5,
378
+ filter: { doc_id: "doc1" },
661
379
  });
662
- const line = await orderLines.get({ orderId: "ORD-123", lineNumber: 1 });
663
380
  ```
664
381
 
665
- ### Custom File Layout (KV on filesystem)
382
+ ### Tree Traversal
383
+
384
+ Navigate the document tree stored in the knowledge base:
666
385
 
667
386
  ```typescript
668
- import { FsFolderKvRepository } from "@workglow/storage";
669
- import { JsonSchema } from "@workglow/util";
670
-
671
- // Control how keys map to file paths and value encoding via schemas
672
- const keySchema = { type: "string" } as const satisfies JsonSchema;
673
- const valueSchema = { type: "string" } as const satisfies JsonSchema;
674
-
675
- const files = new FsFolderKvRepository<string, string>(
676
- "./data/files",
677
- (key) => `${key}.txt`,
678
- keySchema,
679
- valueSchema
680
- );
387
+ // Get a specific node by ID
388
+ const node = await kb.getNode("doc1", nodeId);
681
389
 
682
- await files.put("note-1", "Hello world");
683
- ```
390
+ // Get ancestors from root to target (useful for building context)
391
+ const ancestors = await kb.getAncestors("doc1", leafNodeId);
392
+ // Returns: [rootNode, sectionNode, subsectionNode, targetNode]
684
393
 
685
- ## API Reference
394
+ // Get chunks stored in the document JSON (not vector storage)
395
+ const chunks = await kb.getDocumentChunks("doc1");
686
396
 
687
- ### IKvStorage<Key, Value>
397
+ // Find chunks whose nodePath contains a given node ID
398
+ const related = await kb.findChunksByNodeId("doc1", sectionNodeId);
399
+ ```
688
400
 
689
- Core interface for key-value storage:
401
+ ### Lifecycle Management
690
402
 
691
403
  ```typescript
692
- interface IKvStorage<Key, Value> {
693
- // Core operations
694
- put(key: Key, value: Value): Promise<void>;
695
- putBulk(items: Array<{ key: Key; value: Value }>): Promise<void>;
696
- get(key: Key): Promise<Value | undefined>;
697
- delete(key: Key): Promise<void>;
698
- getAll(): Promise<Array<{ key: Key; value: Value }> | undefined>;
699
- deleteAll(): Promise<void>;
700
- size(): Promise<number>;
701
-
702
- // Event handling
703
- on(event: "put" | "get" | "getAll" | "delete" | "deleteall", callback: Function): void;
704
- off(event: string, callback: Function): void;
705
- once(event: string, callback: Function): void;
706
- waitOn(event: string): Promise<any[]>;
707
- emit(event: string, ...args: any[]): void;
708
- }
709
- ```
404
+ // Prepare for re-indexing: delete chunks but keep the document
405
+ const doc = await kb.prepareReindex("doc1");
406
+ // doc is returned so you can re-chunk and re-embed
710
407
 
711
- ### ITabularStorage<Schema, PrimaryKeyNames>
408
+ // Initialize storage backends
409
+ await kb.setupDatabase();
712
410
 
713
- Core interface for tabular storage:
714
-
715
- ```typescript
716
- interface ITabularStorage<Schema, PrimaryKeyNames, Entity, PrimaryKey, Value> {
717
- // Core operations
718
- put(entity: Entity): Promise<void>;
719
- putBulk(entities: Entity[]): Promise<void>;
720
- get(key: PrimaryKey): Promise<Entity | undefined>;
721
- delete(key: PrimaryKey | Entity): Promise<void>;
722
- getAll(): Promise<Entity[] | undefined>;
723
- deleteAll(): Promise<void>;
724
- size(): Promise<number>;
725
-
726
- // Search operations
727
- search(criteria: Partial<Entity>): Promise<Entity[] | undefined>;
728
- deleteSearch(criteria: DeleteSearchCriteria<Entity>): Promise<void>;
729
-
730
- // Event handling
731
- on(event: "put" | "get" | "search" | "delete" | "clearall", callback: Function): void;
732
- off(event: string, callback: Function): void;
733
- once(event: string, callback: Function): void;
734
- waitOn(event: string): Promise<any[]>;
735
- emit(event: string, ...args: any[]): void;
736
- }
411
+ // Tear down
412
+ kb.destroy();
737
413
  ```
738
414
 
739
- #### DeleteSearchCriteria<Entity>
415
+ ### Registry
740
416
 
741
- The `deleteSearch` method accepts a criteria object that supports multiple columns with optional comparison operators:
417
+ Knowledge bases can be registered globally by name, allowing tasks to reference them by string ID:
742
418
 
743
419
  ```typescript
744
- // Type definitions
745
- type SearchOperator = "=" | "<" | "<=" | ">" | ">=";
746
-
747
- interface SearchCondition<T> {
748
- readonly value: T;
749
- readonly operator: SearchOperator;
750
- }
420
+ import { registerKnowledgeBase, getKnowledgeBase, TypeKnowledgeBase } from "@workglow/dataset";
751
421
 
752
- type DeleteSearchCriteria<Entity> = {
753
- readonly [K in keyof Entity]?: Entity[K] | SearchCondition<Entity[K]>;
754
- };
422
+ // Register
423
+ registerKnowledgeBase("my-kb", kb);
755
424
 
756
- // Usage examples
757
- // Equality match (direct value)
758
- await repo.deleteSearch({ category: "electronics" });
425
+ // Retrieve
426
+ const retrieved = getKnowledgeBase("my-kb");
759
427
 
760
- // With comparison operator
761
- await repo.deleteSearch({ createdAt: { value: date, operator: "<" } });
428
+ // In task schemas — accepts either a string ID or a KnowledgeBase instance
429
+ const inputSchema = {
430
+ type: "object",
431
+ properties: {
432
+ knowledgeBase: TypeKnowledgeBase({
433
+ title: "Knowledge Base",
434
+ description: "The KB to search",
435
+ }),
436
+ },
437
+ required: ["knowledgeBase"],
438
+ } as const;
762
439
 
763
- // Multiple criteria (AND logic)
764
- await repo.deleteSearch({
765
- category: "electronics",
766
- value: { value: 100, operator: ">=" },
767
- });
440
+ // Both work:
441
+ await task.run({ knowledgeBase: kb }); // Direct instance
442
+ await task.run({ knowledgeBase: "my-kb" }); // Resolved from registry
443
+ ```
444
+
445
+ ## Data Flow
446
+
447
+ ### Ingestion Pipeline
448
+
449
+ All ingestion steps are composable Tasks that auto-connect in a Workflow:
450
+
451
+ ```
452
+ Raw Text / Markdown
453
+
454
+ ▼ StructuralParserTask
455
+ DocumentRootNode (tree with character offsets)
456
+
457
+ ▼ DocumentEnricherTask (optional AI enrichment)
458
+ DocumentRootNode (+ summaries, entities on nodes)
459
+
460
+ ▼ HierarchicalChunkerTask
461
+ ChunkRecord[] (flat chunks with nodePath linkage)
462
+
463
+ ▼ TextEmbeddingTask (AI model)
464
+ ChunkRecord[] + Float32Array[] (text → vectors)
465
+
466
+ ▼ ChunkToVectorTask
467
+ Vectors + metadata in vector store format
468
+
469
+ ▼ ChunkVectorUpsertTask → vector + tabular storage
470
+ ```
471
+
472
+ Example using the Workflow API:
473
+
474
+ ```typescript
475
+ import { Workflow } from "@workglow/task-graph";
476
+
477
+ const result = await new Workflow()
478
+ .fileLoader({ url: `file://${filePath}`, format: "markdown" })
479
+ .structuralParser({ title: "My Document" })
480
+ .documentEnricher({ generateSummaries: true, extractEntities: true })
481
+ .hierarchicalChunker({ maxTokens: 512, overlap: 50 })
482
+ .textEmbedding({ model: "your-embedding-model" })
483
+ .chunkToVector()
484
+ .chunkVectorUpsert({ knowledgeBase: "my-kb" })
485
+ .run();
486
+
487
+ console.log(result.count); // Number of vectors stored
488
+ ```
489
+
490
+ ### Retrieval Pipeline
491
+
492
+ All retrieval steps are composable Tasks that auto-connect in a Workflow:
493
+
494
+ ```
495
+ User Query
496
+
497
+ ▼ QueryExpanderTask (optional — generates query variations)
498
+ Expanded queries
499
+
500
+ ▼ ChunkRetrievalTask (embeds query + vector search)
501
+ or ChunkVectorHybridSearchTask (vector + full-text search)
502
+ ChunkSearchResult[] (chunks, chunk_ids, scores, query)
503
+
504
+ ▼ HierarchyJoinTask (optional — enriches with ancestor context)
505
+ Enriched chunks (+ parentSummaries, sectionTitles, entities)
506
+
507
+ ▼ RerankerTask (optional — cross-encoder reranking)
508
+ Re-scored chunks
509
+
510
+ ▼ ContextBuilderTask
511
+ Formatted context string for LLM prompt
512
+
513
+ ▼ LLM
514
+ Answer
515
+ ```
516
+
517
+ Example using the Workflow API:
518
+
519
+ ```typescript
520
+ import { Workflow } from "@workglow/task-graph";
521
+
522
+ const result = await new Workflow()
523
+ .chunkRetrieval({
524
+ knowledgeBase: "my-kb",
525
+ query: "What caused the Civil War?",
526
+ model: "your-embedding-model",
527
+ topK: 10,
528
+ })
529
+ .hierarchyJoin({
530
+ knowledgeBase: "my-kb",
531
+ includeParentSummaries: true,
532
+ })
533
+ .reranker({
534
+ method: "cross-encoder",
535
+ model: "your-reranker-model",
536
+ topK: 5,
537
+ })
538
+ .contextBuilder({
539
+ format: "numbered",
540
+ includeMetadata: false,
541
+ })
542
+ .run();
543
+
544
+ console.log(result.context); // Formatted context ready for LLM
768
545
  ```
769
546
 
770
- ### IQueueStorage<Input, Output>
547
+ ## API Reference
771
548
 
772
- Core interface for job queue storage:
549
+ ### Document
773
550
 
774
551
  ```typescript
775
- interface IQueueStorage<Input, Output> {
776
- add(job: JobStorageFormat<Input, Output>): Promise<unknown>;
777
- get(id: unknown): Promise<JobStorageFormat<Input, Output> | undefined>;
778
- next(): Promise<JobStorageFormat<Input, Output> | undefined>;
779
- complete(job: JobStorageFormat<Input, Output>): Promise<void>;
780
- peek(status?: JobStatus, num?: number): Promise<JobStorageFormat<Input, Output>[]>;
781
- size(status?: JobStatus): Promise<number>;
782
- abort(id: unknown): Promise<void>;
783
- saveProgress(id: unknown, progress: number, message: string, details: any): Promise<void>;
784
- deleteAll(): Promise<void>;
785
- getByRunId(runId: string): Promise<Array<JobStorageFormat<Input, Output>>>;
786
- outputForInput(input: Input): Promise<Output | null>;
787
- delete(id: unknown): Promise<void>;
788
- deleteJobsByStatusAndAge(status: JobStatus, olderThanMs: number): Promise<void>;
552
+ class Document {
553
+ readonly doc_id?: string;
554
+ readonly root: DocumentRootNode;
555
+ readonly metadata: DocumentMetadata;
556
+
557
+ constructor(
558
+ root: DocumentRootNode,
559
+ metadata: DocumentMetadata,
560
+ chunks?: ChunkRecord[],
561
+ doc_id?: string
562
+ );
563
+
564
+ setDocId(id: string): void;
565
+ setChunks(chunks: ChunkRecord[]): void;
566
+ getChunks(): ChunkRecord[];
567
+ findChunksByNodeId(nodeId: string): ChunkRecord[];
568
+ toJSON(): object;
569
+ static fromJSON(json: string, doc_id?: string): Document;
789
570
  }
790
571
  ```
791
572
 
792
- ## Examples
793
-
794
- ### User Management System
573
+ ### KnowledgeBase
795
574
 
796
575
  ```typescript
797
- import { JsonSchema, FromSchema } from "@workglow/util";
798
- import { InMemoryTabularStorage, InMemoryKvStorage } from "@workglow/storage";
799
-
800
- // User profile with tabular storage
801
- const UserSchema = {
802
- type: "object",
803
- properties: {
804
- id: { type: "string" },
805
- username: { type: "string" },
806
- email: { type: "string" },
807
- firstName: { type: "string" },
808
- lastName: { type: "string" },
809
- role: {
810
- type: "string",
811
- enum: ["admin", "user", "guest"],
812
- },
813
- createdAt: { type: "string" },
814
- lastLoginAt: { type: "string" },
815
- },
816
- required: ["id", "username", "email", "firstName", "lastName", "role", "createdAt"],
817
- additionalProperties: false,
818
- } as const satisfies JsonSchema;
819
-
820
- const userRepo = new InMemoryTabularStorage<typeof UserSchema, ["id"]>(
821
- UserSchema,
822
- ["id"],
823
- ["email", "username"]
824
- );
576
+ class KnowledgeBase {
577
+ readonly name: string;
825
578
 
826
- // User sessions with KV storage
827
- const sessionStore = new InMemoryKvStorage<string, { userId: string; expiresAt: string }>();
828
-
829
- // User management class
830
- class UserManager {
831
579
  constructor(
832
- private userRepo: typeof userRepo,
833
- private sessionStore: typeof sessionStore
834
- ) {}
835
-
836
- async createUser(userData: Omit<FromSchema<typeof UserSchema>, "id" | "createdAt">) {
837
- const user = {
838
- ...userData,
839
- id: crypto.randomUUID(),
840
- createdAt: new Date().toISOString(),
841
- };
842
- await this.userRepo.put(user);
843
- return user;
844
- }
845
-
846
- async loginUser(email: string): Promise<string> {
847
- const users = await this.userRepo.search({ email });
848
- if (!users?.length) throw new Error("User not found");
849
-
850
- const sessionId = crypto.randomUUID();
851
- await this.sessionStore.put(sessionId, {
852
- userId: users[0].id,
853
- expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).toISOString(),
854
- });
855
-
856
- // Update last login
857
- await this.userRepo.put({
858
- ...users[0],
859
- lastLoginAt: new Date().toISOString(),
860
- });
861
-
862
- return sessionId;
863
- }
864
-
865
- async getSessionUser(sessionId: string) {
866
- const session = await this.sessionStore.get(sessionId);
867
- if (!session || new Date(session.expiresAt) < new Date()) {
868
- return null;
869
- }
870
- return this.userRepo.get({ id: session.userId });
871
- }
580
+ name: string,
581
+ documentStorage: DocumentTabularStorage,
582
+ chunkStorage: ChunkVectorStorage
583
+ );
584
+
585
+ // Documents
586
+ upsertDocument(document: Document): Promise<Document>;
587
+ getDocument(doc_id: string): Promise<Document | undefined>;
588
+ deleteDocument(doc_id: string): Promise<void>;
589
+ listDocuments(): Promise<string[]>;
590
+
591
+ // Tree traversal
592
+ getNode(doc_id: string, nodeId: string): Promise<DocumentNode | undefined>;
593
+ getAncestors(doc_id: string, nodeId: string): Promise<DocumentNode[]>;
594
+
595
+ // Chunks
596
+ upsertChunk(chunk: InsertChunkVectorEntity): Promise<ChunkVectorEntity>;
597
+ upsertChunksBulk(chunks: InsertChunkVectorEntity[]): Promise<ChunkVectorEntity[]>;
598
+ getChunk(chunk_id: string): Promise<ChunkVectorEntity | undefined>;
599
+ getChunksForDocument(doc_id: string): Promise<ChunkVectorEntity[]>;
600
+ deleteChunksForDocument(doc_id: string): Promise<void>;
601
+
602
+ // Search
603
+ similaritySearch(
604
+ query: TypedArray,
605
+ options?: VectorSearchOptions<ChunkRecord>
606
+ ): Promise<ChunkSearchResult[]>;
607
+ hybridSearch(
608
+ query: TypedArray,
609
+ options: HybridSearchOptions<ChunkRecord>
610
+ ): Promise<ChunkSearchResult[]>;
611
+
612
+ // Lifecycle
613
+ prepareReindex(doc_id: string): Promise<Document | undefined>;
614
+ setupDatabase(): Promise<void>;
615
+ destroy(): void;
616
+
617
+ // Accessors
618
+ put(chunk: InsertChunkVectorEntity): Promise<ChunkVectorEntity>;
619
+ putBulk(chunks: InsertChunkVectorEntity[]): Promise<ChunkVectorEntity[]>;
620
+ getAllChunks(): Promise<ChunkVectorEntity[] | undefined>;
621
+ chunkCount(): Promise<number>;
622
+ clearChunks(): Promise<void>;
623
+ getVectorDimensions(): number;
624
+ getDocumentChunks(doc_id: string): Promise<ChunkRecord[]>;
625
+ findChunksByNodeId(doc_id: string, nodeId: string): Promise<ChunkRecord[]>;
872
626
  }
873
627
  ```
874
628
 
875
- ### Configuration Management
629
+ ### createKnowledgeBase
876
630
 
877
631
  ```typescript
878
- // Application settings with typed configuration
879
- type AppConfig = {
880
- database: {
881
- host: string;
882
- port: number;
883
- name: string;
884
- };
885
- features: {
886
- enableNewUI: boolean;
887
- maxUploadSize: number;
888
- };
889
- integrations: {
890
- stripe: { apiKey: string; webhook: string };
891
- sendgrid: { apiKey: string };
892
- };
893
- };
894
-
895
- const configStore = new FsFolderJsonKvRepository<string, AppConfig>("./config");
896
-
897
- class ConfigManager {
898
- private cache = new Map<string, AppConfig>();
899
-
900
- constructor(private store: typeof configStore) {
901
- // Listen for config changes
902
- store.on("put", (key, value) => {
903
- this.cache.set(key, value);
904
- console.log(`Configuration updated: ${key}`);
905
- });
906
- }
907
-
908
- async getConfig(environment: string): Promise<AppConfig> {
909
- if (this.cache.has(environment)) {
910
- return this.cache.get(environment)!;
911
- }
912
-
913
- const config = await this.store.get(environment);
914
- if (!config) throw new Error(`No configuration for environment: ${environment}`);
915
-
916
- this.cache.set(environment, config);
917
- return config;
918
- }
919
-
920
- async updateConfig(environment: string, updates: Partial<AppConfig>) {
921
- const current = await this.getConfig(environment);
922
- const updated = { ...current, ...updates };
923
- await this.store.put(environment, updated);
924
- }
632
+ function createKnowledgeBase(options: CreateKnowledgeBaseOptions): Promise<KnowledgeBase>;
633
+
634
+ interface CreateKnowledgeBaseOptions {
635
+ readonly name: string;
636
+ readonly vectorDimensions: number;
637
+ readonly backend?: "in-memory";
638
+ readonly vectorType?: { new (array: number[]): TypedArray };
639
+ readonly register?: boolean; // Default: true
925
640
  }
926
641
  ```
927
642
 
928
- ### Supabase Integration Example
643
+ ### StructuralParser
929
644
 
930
645
  ```typescript
931
- import { createClient } from "@supabase/supabase-js";
932
- import { JsonSchema } from "@workglow/util";
933
- import {
934
- SupabaseTabularStorage,
935
- SupabaseKvRepository,
936
- SupabaseQueueStorage,
937
- } from "@workglow/storage";
938
-
939
- // Initialize Supabase client
940
- const supabase = createClient(process.env.SUPABASE_URL!, process.env.SUPABASE_ANON_KEY!);
941
-
942
- // Define schemas
943
- const ProductSchema = {
944
- type: "object",
945
- properties: {
946
- id: { type: "string" },
947
- name: { type: "string" },
948
- price: { type: "number" },
949
- category: { type: "string" },
950
- stock: { type: "number", minimum: 0 },
951
- createdAt: { type: "string", format: "date-time" },
952
- },
953
- required: ["id", "name", "price", "category", "stock", "createdAt"],
954
- additionalProperties: false,
955
- } as const satisfies JsonSchema;
956
-
957
- const OrderSchema = {
958
- type: "object",
959
- properties: {
960
- id: { type: "string" },
961
- customerId: { type: "string" },
962
- productId: { type: "string" },
963
- quantity: { type: "number", minimum: 1 },
964
- status: {
965
- type: "string",
966
- enum: ["pending", "processing", "completed", "cancelled"],
967
- },
968
- createdAt: { type: "string", format: "date-time" },
969
- },
970
- required: ["id", "customerId", "productId", "quantity", "status", "createdAt"],
971
- additionalProperties: false,
972
- } as const satisfies JsonSchema;
973
-
974
- // Create repositories
975
- const products = new SupabaseTabularStorage<typeof ProductSchema, ["id"]>(
976
- supabase,
977
- "products",
978
- ProductSchema,
979
- ["id"],
980
- ["category", "name"] // Indexed columns for fast searching
981
- );
982
-
983
- const orders = new SupabaseTabularStorage<typeof OrderSchema, ["id"]>(
984
- supabase,
985
- "orders",
986
- OrderSchema,
987
- ["id"],
988
- ["customerId", "status", ["customerId", "status"]] // Compound index
989
- );
990
-
991
- // Use KV for caching
992
- const cache = new SupabaseKvRepository(supabase, "cache");
993
-
994
- // Use queue for background processing
995
- type EmailJob = { to: string; subject: string; body: string };
996
- const emailQueue = new SupabaseQueueStorage<EmailJob, void>(supabase, "emails");
997
-
998
- // Example usage
999
- async function createOrder(customerId: string, productId: string, quantity: number) {
1000
- // Check product availability
1001
- const product = await products.get({ id: productId });
1002
- if (!product || product.stock < quantity) {
1003
- throw new Error("Insufficient stock");
1004
- }
1005
-
1006
- // Create order
1007
- const order = {
1008
- id: crypto.randomUUID(),
1009
- customerId,
1010
- productId,
1011
- quantity,
1012
- status: "pending" as const,
1013
- createdAt: new Date().toISOString(),
1014
- };
1015
- await orders.put(order);
1016
-
1017
- // Update stock
1018
- await products.put({
1019
- ...product,
1020
- stock: product.stock - quantity,
1021
- });
1022
-
1023
- // Queue email notification
1024
- await emailQueue.add({
1025
- input: {
1026
- to: customerId,
1027
- subject: "Order Confirmation",
1028
- body: `Your order ${order.id} has been confirmed!`,
1029
- },
1030
- run_after: null,
1031
- max_retries: 3,
1032
- });
1033
-
1034
- return order;
646
+ class StructuralParser {
647
+ static parseMarkdown(doc_id: string, text: string, title: string): Promise<DocumentRootNode>;
648
+ static parsePlainText(doc_id: string, text: string, title: string): Promise<DocumentRootNode>;
649
+ static parse(
650
+ doc_id: string,
651
+ text: string,
652
+ title: string,
653
+ format?: string
654
+ ): Promise<DocumentRootNode>;
1035
655
  }
1036
-
1037
- // Get customer's orders
1038
- async function getCustomerOrders(customerId: string) {
1039
- return await orders.search({ customerId });
1040
- }
1041
-
1042
- // Get orders by status
1043
- async function getOrdersByStatus(status: string) {
1044
- return await orders.search({ status });
1045
- }
1046
- ```
1047
-
1048
- **Important Note**
1049
- The implementations assume you have an exec_sql RPC function in your Supabase database for table creation, or that you've created the tables through Supabase migrations. For production use, it's recommended to:
1050
-
1051
- - Create tables using Supabase migrations rather than runtime table creation
1052
- - Set up proper Row Level Security (RLS) policies in Supabase
1053
- - Use service role keys for server-side operations that need elevated permissions
1054
-
1055
- ## Testing
1056
-
1057
- The package includes comprehensive test suites for all storage implementations:
1058
-
1059
- ```bash
1060
- # Run all tests
1061
- bun test
1062
-
1063
- # Run specific test suites
1064
- bun test --grep "KvRepository"
1065
- bun test --grep "TabularStorage"
1066
- bun test --grep "QueueStorage"
1067
-
1068
- # Test specific environments
1069
- bun test --grep "InMemory" # Cross-platform tests
1070
- bun test --grep "IndexedDb" # Browser tests
1071
- bun test --grep "Sqlite" # Native tests
1072
656
  ```
1073
657
 
1074
- ### Writing Tests for Your Storage Usage
658
+ ### Type Helpers
1075
659
 
1076
660
  ```typescript
1077
- import { describe, test, expect, beforeEach } from "vitest";
1078
- import { InMemoryTabularStorage } from "@workglow/storage";
1079
-
1080
- describe("UserRepository", () => {
1081
- let userRepo: InMemoryTabularStorage<typeof UserSchema, ["id"]>;
1082
-
1083
- beforeEach(() => {
1084
- userRepo = new InMemoryTabularStorage<typeof UserSchema, ["id"]>(UserSchema, ["id"], ["email"]);
1085
- });
1086
-
1087
- test("should create and retrieve user", async () => {
1088
- const user = {
1089
- id: "test-123",
1090
- email: "test@example.com",
1091
- name: "Test User",
1092
- age: 25,
1093
- department: "Engineering",
1094
- createdAt: new Date().toISOString(),
1095
- };
1096
-
1097
- await userRepo.put(user);
1098
- const retrieved = await userRepo.get({ id: "test-123" });
1099
-
1100
- expect(retrieved).toEqual(user);
1101
- });
1102
-
1103
- test("should find users by department", async () => {
1104
- const users = [
1105
- {
1106
- id: "1",
1107
- email: "alice@co.com",
1108
- name: "Alice",
1109
- age: 28,
1110
- department: "Engineering",
1111
- createdAt: "2024-01-01",
1112
- },
1113
- {
1114
- id: "2",
1115
- email: "bob@co.com",
1116
- name: "Bob",
1117
- age: 32,
1118
- department: "Sales",
1119
- createdAt: "2024-01-02",
1120
- },
1121
- ];
1122
-
1123
- await userRepo.putBulk(users);
1124
- const engineers = await userRepo.search({ department: "Engineering" });
1125
-
1126
- expect(engineers).toHaveLength(1);
1127
- expect(engineers![0].name).toBe("Alice");
1128
- });
1129
- });
661
+ // Schema helper for task inputs that accept a KnowledgeBase ID or instance
662
+ function TypeKnowledgeBase<O>(options?: O): JsonSchema;
663
+
664
+ // Schema helper for tabular storage inputs
665
+ function TypeTabularStorage<O>(options?: O): JsonSchema;
1130
666
  ```
1131
667
 
1132
668
  ## License