@workglow/knowledge-base 0.0.115

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +670 -0
  3. package/dist/browser.js +1071 -0
  4. package/dist/browser.js.map +23 -0
  5. package/dist/bun.js +1072 -0
  6. package/dist/bun.js.map +23 -0
  7. package/dist/chunk/ChunkSchema.d.ts +206 -0
  8. package/dist/chunk/ChunkSchema.d.ts.map +1 -0
  9. package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
  10. package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
  11. package/dist/common-server.d.ts +7 -0
  12. package/dist/common-server.d.ts.map +1 -0
  13. package/dist/common.d.ts +20 -0
  14. package/dist/common.d.ts.map +1 -0
  15. package/dist/document/Document.d.ts +51 -0
  16. package/dist/document/Document.d.ts.map +1 -0
  17. package/dist/document/DocumentNode.d.ts +32 -0
  18. package/dist/document/DocumentNode.d.ts.map +1 -0
  19. package/dist/document/DocumentSchema.d.ts +1203 -0
  20. package/dist/document/DocumentSchema.d.ts.map +1 -0
  21. package/dist/document/DocumentStorageSchema.d.ts +43 -0
  22. package/dist/document/DocumentStorageSchema.d.ts.map +1 -0
  23. package/dist/document/StructuralParser.d.ts +30 -0
  24. package/dist/document/StructuralParser.d.ts.map +1 -0
  25. package/dist/knowledge-base/InMemoryKnowledgeBaseRepository.d.ts +13 -0
  26. package/dist/knowledge-base/InMemoryKnowledgeBaseRepository.d.ts.map +1 -0
  27. package/dist/knowledge-base/KnowledgeBase.d.ts +123 -0
  28. package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
  29. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +38 -0
  30. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
  31. package/dist/knowledge-base/KnowledgeBaseRepository.d.ts +74 -0
  32. package/dist/knowledge-base/KnowledgeBaseRepository.d.ts.map +1 -0
  33. package/dist/knowledge-base/KnowledgeBaseSchema.d.ts +50 -0
  34. package/dist/knowledge-base/KnowledgeBaseSchema.d.ts.map +1 -0
  35. package/dist/knowledge-base/createKnowledgeBase.d.ts +30 -0
  36. package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
  37. package/dist/node.js +1071 -0
  38. package/dist/node.js.map +23 -0
  39. package/dist/types.d.ts +7 -0
  40. package/dist/types.d.ts.map +1 -0
  41. package/dist/util/DatasetSchema.d.ts +40 -0
  42. package/dist/util/DatasetSchema.d.ts.map +1 -0
  43. package/package.json +55 -0
package/README.md ADDED
@@ -0,0 +1,670 @@
1
+ # @workglow/knowledge-base
2
+
3
+ Document management, hierarchical chunking, and knowledge base infrastructure for RAG pipelines.
4
+
5
+ - [Overview](#overview)
6
+ - [Installation](#installation)
7
+ - [Quick Start](#quick-start)
8
+ - [Architecture](#architecture)
9
+ - [Documents](#documents)
10
+ - [Document Tree Structure](#document-tree-structure)
11
+ - [Parsing](#parsing)
12
+ - [Node Enrichment](#node-enrichment)
13
+ - [Chunks](#chunks)
14
+ - [ChunkRecord](#chunkrecord)
15
+ - [Chunk Vector Storage](#chunk-vector-storage)
16
+ - [KnowledgeBase](#knowledgebase)
17
+ - [Creating a KnowledgeBase](#creating-a-knowledgebase)
18
+ - [Document CRUD](#document-crud)
19
+ - [Chunk Operations](#chunk-operations)
20
+ - [Search](#search)
21
+ - [Tree Traversal](#tree-traversal)
22
+ - [Lifecycle Management](#lifecycle-management)
23
+ - [Registry](#registry)
24
+ - [Data Flow](#data-flow)
25
+ - [Ingestion Pipeline](#ingestion-pipeline)
26
+ - [Retrieval Pipeline](#retrieval-pipeline)
27
+ - [API Reference](#api-reference)
28
+ - [Document](#document)
29
+ - [KnowledgeBase](#knowledgebase-1)
30
+ - [createKnowledgeBase](#createknowledgebase)
31
+ - [StructuralParser](#structuralparser)
32
+ - [Type Helpers](#type-helpers)
33
+ - [License](#license)
34
+
35
+ ## Overview
36
+
37
+ This package provides the data layer for RAG (Retrieval-Augmented Generation) workflows. It ties together three concerns:
38
+
39
+ 1. **Documents** — hierarchical tree representation of parsed text (sections, paragraphs, sentences)
40
+ 2. **Chunks** — flat records derived from the document tree, each tracking its position via node paths
41
+ 3. **KnowledgeBase** — unified interface that owns both document storage (tabular) and chunk storage (vector), with cascading lifecycle management
42
+
43
+ ```
44
+ Markdown / Plain Text
45
+         │
46
+         ▼
47
+ ┌──────────────┐
48
+ │ Document │ Hierarchical tree (sections, paragraphs)
49
+ │ (tabular) │ Stored as serialized JSON
50
+ └──────┬───────┘
51
+ │ chunking
52
+         ▼
53
+ ┌──────────────┐
54
+ │ Chunks │ Flat records with tree linkage (nodePath, depth)
55
+ │ (vector) │ Stored with embedding vectors
56
+ └──────┬───────┘
57
+ │ search
58
+         ▼
59
+ ┌──────────────┐
60
+ │ Results │ Ranked by similarity score
61
+ └──────────────┘
62
+ ```
63
+
64
+ ## Installation
65
+
66
+ ```bash
67
+ bun install @workglow/knowledge-base
68
+ ```
69
+
70
+ Peer dependencies: `@workglow/storage`, `@workglow/util`.
71
+
72
+ ## Quick Start
73
+
74
+ ```typescript
75
+ import {
76
+ createKnowledgeBase,
77
+ Document,
78
+ StructuralParser,
79
+ } from "@workglow/knowledge-base";
80
+
81
+ // 1. Create a knowledge base
82
+ const kb = await createKnowledgeBase({
83
+ name: "my-kb",
84
+ vectorDimensions: 384,
85
+ });
86
+
87
+ // 2. Parse a document
88
+ const root = await StructuralParser.parseMarkdown("doc1", markdown, "My Doc");
89
+ const doc = new Document(root, { title: "My Doc" });
90
+
91
+ // 3. Store the document
92
+ const inserted = await kb.upsertDocument(doc);
93
+
94
+ // 4. Store chunk embeddings
95
+ await kb.upsertChunk({
96
+ doc_id: inserted.doc_id!,
97
+ vector: new Float32Array([0.1, 0.2 /* ... */]),
98
+ metadata: {
99
+ chunkId: "chunk_1",
100
+ doc_id: inserted.doc_id!,
101
+ text: "The chunk text...",
102
+ nodePath: [root.nodeId, sectionNodeId],
103
+ depth: 2,
104
+ },
105
+ });
106
+
107
+ // 5. Search (via task pipeline)
108
+ import { Workflow } from "@workglow/task-graph";
109
+
110
+ const result = await new Workflow()
111
+ .chunkRetrieval({
112
+ knowledgeBase: "my-kb",
113
+ query: "your search query",
114
+ model: "your-embedding-model",
115
+ topK: 5,
116
+ })
117
+ .run();
118
+ ```
119
+
120
+ ## Architecture
121
+
122
+ The package is organized around three layers:
123
+
124
+ ```
125
+ ┌───────────────────────────────────────────────────┐
126
+ │ KnowledgeBase │
127
+ │ Unified API for documents + chunks + search │
128
+ ├────────────────────┬──────────────────────────────┤
129
+ │ DocumentTabular │ ChunkVector │
130
+ │ Storage │ Storage │
131
+ │ (ITabularStorage) │ (IVectorStorage) │
132
+ │ │ │
133
+ │ Stores serialized │ Stores embeddings + │
134
+ │ document trees │ ChunkRecord metadata │
135
+ └────────────────────┴──────────────────────────────┘
136
+ ```
137
+
138
+ **Storage backends** are pluggable via the `@workglow/storage` interfaces. The `createKnowledgeBase` factory defaults to in-memory storage; production deployments can use SQLite, PostgreSQL, or any other `ITabularStorage` / `IVectorStorage` implementation.
139
+
140
+ ## Documents
141
+
142
+ ### Document Tree Structure
143
+
144
+ Documents are represented as a hierarchical tree using a **discriminated union** of node types:
145
+
146
+ ```
147
+ DocumentRootNode (kind: "document")
148
+ ├── SectionNode (kind: "section", level: 1)
149
+ │ ├── ParagraphNode (kind: "paragraph")
150
+ │ ├── ParagraphNode (kind: "paragraph")
151
+ │ └── SectionNode (kind: "section", level: 2)
152
+ │ └── ParagraphNode (kind: "paragraph")
153
+ ├── SectionNode (kind: "section", level: 1)
154
+ │ └── ParagraphNode (kind: "paragraph")
155
+ └── ParagraphNode (kind: "paragraph")
156
+ ```
157
+
158
+ **Node types:**
159
+
160
+ | Type | Kind | Has Children | Description |
161
+ | ------------------ | ------------- | ------------ | ------------------------------------------------- |
162
+ | `DocumentRootNode` | `"document"` | yes | Root of the tree, has `title` |
163
+ | `SectionNode` | `"section"` | yes | From headers (level 1-6), has `title` and `level` |
164
+ | `ParagraphNode` | `"paragraph"` | no | Prose content |
165
+ | `SentenceNode` | `"sentence"` | no | Fine-grained segmentation |
166
+ | `TopicNode` | `"topic"` | yes | From topic segmentation algorithms |
167
+
168
+ All nodes share a base set of fields:
169
+
170
+ ```typescript
171
+ interface DocumentNodeBase {
172
+ readonly nodeId: string; // Unique identifier (UUID)
173
+ readonly kind: NodeKind; // Discriminator
174
+ readonly range: NodeRange; // { startOffset, endOffset } in source text
175
+ readonly text: string; // Text content
176
+ readonly enrichment?: NodeEnrichment; // Optional summary, entities, keywords
177
+ }
178
+ ```
179
+
180
+ Use the `NodeKind` constants for comparisons:
181
+
182
+ ```typescript
183
+ import { NodeKind } from "@workglow/knowledge-base";
184
+
185
+ if (node.kind === NodeKind.SECTION) {
186
+ console.log(node.title, node.level, node.children.length);
187
+ }
188
+ ```
189
+
190
+ ### Parsing
191
+
192
+ `StructuralParser` converts raw text into a `DocumentRootNode`:
193
+
194
+ ```typescript
195
+ import { StructuralParser } from "@workglow/knowledge-base";
196
+
197
+ // Markdown — detects headers, creates nested sections
198
+ const root = await StructuralParser.parseMarkdown(docId, markdownText, "Title");
199
+
200
+ // Plain text — splits on blank lines into paragraphs
201
+ const root = await StructuralParser.parsePlainText(docId, plainText, "Title");
202
+
203
+ // Auto-detect format
204
+ const root = await StructuralParser.parse(docId, text, "Title");
205
+ ```
206
+
207
+ The parser:
208
+
209
+ - Converts markdown headers (`#` through `######`) into nested `SectionNode`s
210
+ - Groups text between headers as `ParagraphNode` children
211
+ - Tracks character offsets (`startOffset`, `endOffset`) for every node
212
+ - Assigns a unique `nodeId` to each node
213
+
214
+ ### Node Enrichment
215
+
216
+ Nodes can carry optional enrichment data populated by AI tasks:
217
+
218
+ ```typescript
219
+ interface NodeEnrichment {
220
+ summary?: string; // AI-generated summary
221
+ entities?: Entity[]; // Named entities (text, type, confidence score)
222
+ keywords?: string[]; // Extracted keywords
223
+ }
224
+ ```
225
+
226
+ Enrichment is set on nodes during the ingestion pipeline (e.g., by `DocumentEnricherTask`) and propagated to chunks during hierarchy join.
227
+
228
+ ## Chunks
229
+
230
+ ### ChunkRecord
231
+
232
+ A `ChunkRecord` is a flat, self-contained unit of text with full context about its position in the document tree:
233
+
234
+ ```typescript
235
+ interface ChunkRecord {
236
+ // Identity
237
+ chunkId: string; // Unique chunk identifier
238
+ doc_id: string; // Parent document ID
239
+
240
+ // Content
241
+ text: string; // The text to embed and search
242
+
243
+ // Tree linkage
244
+ nodePath: string[]; // Node IDs from root to leaf
245
+ depth: number; // Depth in the document tree
246
+
247
+ // Optional fields
248
+ leafNodeId?: string; // Leaf node this chunk belongs to
249
+ summary?: string; // Chunk-level summary
250
+ entities?: Entity[]; // Named entities
251
+ parentSummaries?: string[]; // Summaries from ancestor nodes
252
+ sectionTitles?: string[]; // Titles of ancestor sections
253
+ doc_title?: string; // Document title
254
+ }
255
+ ```
256
+
257
+ The `nodePath` and `depth` fields enable **hierarchy-aware retrieval**: given a search result, you can walk back up the document tree to get section titles, parent summaries, or sibling chunks for additional context.
258
+
259
+ ### Chunk Vector Storage
260
+
261
+ Chunks are stored in vector storage as `ChunkVectorEntity`:
262
+
263
+ ```typescript
264
+ interface ChunkVectorEntity {
265
+ chunk_id: string; // Primary key (auto-generated UUID)
266
+ doc_id: string; // For filtering by document
267
+ vector: TypedArray; // Embedding (Float32Array, etc.)
268
+ metadata: ChunkRecord; // Full chunk record
269
+ }
270
+ ```
271
+
272
+ The `metadata` field holds the complete `ChunkRecord`, so search results carry all the context needed for hierarchy-aware retrieval without additional lookups.
273
+
274
+ ## KnowledgeBase
275
+
276
+ `KnowledgeBase` is the central class that ties document storage and vector storage together.
277
+
278
+ ### Creating a KnowledgeBase
279
+
280
+ **Factory function (recommended):**
281
+
282
+ ```typescript
283
+ import { createKnowledgeBase } from "@workglow/knowledge-base";
284
+
285
+ const kb = await createKnowledgeBase({
286
+ name: "my-kb", // Identifier
287
+ vectorDimensions: 384, // Must match your embedding model
288
+ backend: "in-memory", // Currently only "in-memory"
289
+ vectorType: Float32Array, // Default: Float32Array
290
+ register: true, // Register globally (default: true)
291
+ });
292
+ ```
293
+
294
+ **Direct construction (custom storage backends):**
295
+
296
+ ```typescript
297
+ import { KnowledgeBase } from "@workglow/knowledge-base";
298
+
299
+ const kb = new KnowledgeBase(
300
+ "my-kb",
301
+ myDocumentTabularStorage, // ITabularStorage implementation
302
+ myChunkVectorStorage // IVectorStorage implementation
303
+ );
304
+ ```
305
+
306
+ ### Document CRUD
307
+
308
+ ```typescript
309
+ // Upsert — auto-generates doc_id if not set
310
+ const doc = new Document(root, { title: "My Document" });
311
+ const inserted = await kb.upsertDocument(doc);
312
+ console.log(inserted.doc_id); // auto-generated UUID
313
+
314
+ // Get by ID
315
+ const retrieved = await kb.getDocument(inserted.doc_id!);
316
+
317
+ // List all document IDs
318
+ const docIds = await kb.listDocuments();
319
+
320
+ // Delete — cascades to all chunks in vector storage
321
+ await kb.deleteDocument(inserted.doc_id!);
322
+ ```
323
+
324
+ ### Chunk Operations
325
+
326
+ ```typescript
327
+ // Upsert a single chunk
328
+ const entity = await kb.upsertChunk({
329
+ doc_id: "doc1",
330
+ vector: new Float32Array([0.1, 0.2, 0.3]),
331
+ metadata: {
332
+ chunkId: "chunk_1",
333
+ doc_id: "doc1",
334
+ text: "Some text...",
335
+ nodePath: ["root", "section1"],
336
+ depth: 2,
337
+ },
338
+ });
339
+
340
+ // Upsert in bulk
341
+ const entities = await kb.upsertChunksBulk(chunkArray);
342
+
343
+ // Get a specific chunk
344
+ const chunk = await kb.getChunk("chunk_id_here");
345
+
346
+ // Get all chunks for a document
347
+ const docChunks = await kb.getChunksForDocument("doc1");
348
+
349
+ // Delete all chunks for a document (without deleting the document)
350
+ await kb.deleteChunksForDocument("doc1");
351
+ ```
352
+
353
+ ### Search
354
+
355
+ **Similarity search** — vector-only:
356
+
357
+ ```typescript
358
+ const results = await kb.similaritySearch(queryVector, {
359
+ topK: 10, // Max results (default varies by backend)
360
+ scoreThreshold: 0.7, // Minimum similarity score
361
+ filter: { doc_id: "doc1" }, // Metadata filter
362
+ });
363
+
364
+ // Each result: ChunkVectorEntity & { score: number }
365
+ for (const result of results) {
366
+ console.log(result.chunk_id, result.score, result.metadata.text);
367
+ }
368
+ ```
369
+
370
+ **Hybrid search** — combines vector similarity with full-text search. Requires a storage backend that supports it (e.g., PostgreSQL with pgvector). Returns an empty array if unsupported.
371
+
372
+ ```typescript
373
+ const results = await kb.hybridSearch(queryVector, {
374
+ textQuery: "machine learning",
375
+ topK: 10,
376
+ vectorWeight: 0.7, // 0-1, balance between vector and text
377
+ scoreThreshold: 0.5,
378
+ filter: { doc_id: "doc1" },
379
+ });
380
+ ```
381
+
382
+ ### Tree Traversal
383
+
384
+ Navigate the document tree stored in the knowledge base:
385
+
386
+ ```typescript
387
+ // Get a specific node by ID
388
+ const node = await kb.getNode("doc1", nodeId);
389
+
390
+ // Get ancestors from root to target (useful for building context)
391
+ const ancestors = await kb.getAncestors("doc1", leafNodeId);
392
+ // Returns: [rootNode, sectionNode, subsectionNode, targetNode]
393
+
394
+ // Get chunks stored in the document JSON (not vector storage)
395
+ const chunks = await kb.getDocumentChunks("doc1");
396
+
397
+ // Find chunks whose nodePath contains a given node ID
398
+ const related = await kb.findChunksByNodeId("doc1", sectionNodeId);
399
+ ```
400
+
401
+ ### Lifecycle Management
402
+
403
+ ```typescript
404
+ // Prepare for re-indexing: delete chunks but keep the document
405
+ const doc = await kb.prepareReindex("doc1");
406
+ // doc is returned so you can re-chunk and re-embed
407
+
408
+ // Initialize storage backends
409
+ await kb.setupDatabase();
410
+
411
+ // Tear down
412
+ kb.destroy();
413
+ ```
414
+
415
+ ### Registry
416
+
417
+ Knowledge bases can be registered globally by name, allowing tasks to reference them by string ID:
418
+
419
+ ```typescript
420
+ import { registerKnowledgeBase, getKnowledgeBase, TypeKnowledgeBase } from "@workglow/knowledge-base";
421
+
422
+ // Register
423
+ registerKnowledgeBase("my-kb", kb);
424
+
425
+ // Retrieve
426
+ const retrieved = getKnowledgeBase("my-kb");
427
+
428
+ // In task schemas — accepts either a string ID or a KnowledgeBase instance
429
+ const inputSchema = {
430
+ type: "object",
431
+ properties: {
432
+ knowledgeBase: TypeKnowledgeBase({
433
+ title: "Knowledge Base",
434
+ description: "The KB to search",
435
+ }),
436
+ },
437
+ required: ["knowledgeBase"],
438
+ } as const;
439
+
440
+ // Both work:
441
+ await task.run({ knowledgeBase: kb }); // Direct instance
442
+ await task.run({ knowledgeBase: "my-kb" }); // Resolved from registry
443
+ ```
444
+
445
+ ## Data Flow
446
+
447
+ ### Ingestion Pipeline
448
+
449
+ All ingestion steps are composable Tasks that auto-connect in a Workflow:
450
+
451
+ ```
452
+ Raw Text / Markdown
453
+
454
+ ▼ StructuralParserTask
455
+ DocumentRootNode (tree with character offsets)
456
+
457
+ ▼ DocumentEnricherTask (optional AI enrichment)
458
+ DocumentRootNode (+ summaries, entities on nodes)
459
+
460
+ ▼ HierarchicalChunkerTask
461
+ ChunkRecord[] (flat chunks with nodePath linkage)
462
+
463
+ ▼ TextEmbeddingTask (AI model)
464
+ ChunkRecord[] + Float32Array[] (text → vectors)
465
+
466
+ ▼ ChunkToVectorTask
467
+ Vectors + metadata in vector store format
468
+
469
+ ▼ ChunkVectorUpsertTask → vector + tabular storage
470
+ ```
471
+
472
+ Example using the Workflow API:
473
+
474
+ ```typescript
475
+ import { Workflow } from "@workglow/task-graph";
476
+
477
+ const result = await new Workflow()
478
+ .fileLoader({ url: `file://${filePath}`, format: "markdown" })
479
+ .structuralParser({ title: "My Document" })
480
+ .documentEnricher({ generateSummaries: true, extractEntities: true })
481
+ .hierarchicalChunker({ maxTokens: 512, overlap: 50 })
482
+ .textEmbedding({ model: "your-embedding-model" })
483
+ .chunkToVector()
484
+ .chunkVectorUpsert({ knowledgeBase: "my-kb" })
485
+ .run();
486
+
487
+ console.log(result.count); // Number of vectors stored
488
+ ```
489
+
490
+ ### Retrieval Pipeline
491
+
492
+ All retrieval steps are composable Tasks that auto-connect in a Workflow:
493
+
494
+ ```
495
+ User Query
496
+
497
+ ▼ QueryExpanderTask (optional — generates query variations)
498
+ Expanded queries
499
+
500
+ ▼ ChunkRetrievalTask (embeds query + vector search)
501
+ or ChunkVectorHybridSearchTask (vector + full-text search)
502
+ ChunkSearchResult[] (chunks, chunk_ids, scores, query)
503
+
504
+ ▼ HierarchyJoinTask (optional — enriches with ancestor context)
505
+ Enriched chunks (+ parentSummaries, sectionTitles, entities)
506
+
507
+ ▼ RerankerTask (optional — cross-encoder reranking)
508
+ Re-scored chunks
509
+
510
+ ▼ ContextBuilderTask
511
+ Formatted context string for LLM prompt
512
+
513
+ ▼ LLM
514
+ Answer
515
+ ```
516
+
517
+ Example using the Workflow API:
518
+
519
+ ```typescript
520
+ import { Workflow } from "@workglow/task-graph";
521
+
522
+ const result = await new Workflow()
523
+ .chunkRetrieval({
524
+ knowledgeBase: "my-kb",
525
+ query: "What caused the Civil War?",
526
+ model: "your-embedding-model",
527
+ topK: 10,
528
+ })
529
+ .hierarchyJoin({
530
+ knowledgeBase: "my-kb",
531
+ includeParentSummaries: true,
532
+ })
533
+ .reranker({
534
+ method: "cross-encoder",
535
+ model: "your-reranker-model",
536
+ topK: 5,
537
+ })
538
+ .contextBuilder({
539
+ format: "numbered",
540
+ includeMetadata: false,
541
+ })
542
+ .run();
543
+
544
+ console.log(result.context); // Formatted context ready for LLM
545
+ ```
546
+
547
+ ## API Reference
548
+
549
+ ### Document
550
+
551
+ ```typescript
552
+ class Document {
553
+ readonly doc_id?: string;
554
+ readonly root: DocumentRootNode;
555
+ readonly metadata: DocumentMetadata;
556
+
557
+ constructor(
558
+ root: DocumentRootNode,
559
+ metadata: DocumentMetadata,
560
+ chunks?: ChunkRecord[],
561
+ doc_id?: string
562
+ );
563
+
564
+ setDocId(id: string): void;
565
+ setChunks(chunks: ChunkRecord[]): void;
566
+ getChunks(): ChunkRecord[];
567
+ findChunksByNodeId(nodeId: string): ChunkRecord[];
568
+ toJSON(): object;
569
+ static fromJSON(json: string, doc_id?: string): Document;
570
+ }
571
+ ```
572
+
573
+ ### KnowledgeBase
574
+
575
+ ```typescript
576
+ class KnowledgeBase {
577
+ readonly name: string;
578
+
579
+ constructor(
580
+ name: string,
581
+ documentStorage: DocumentTabularStorage,
582
+ chunkStorage: ChunkVectorStorage
583
+ );
584
+
585
+ // Documents
586
+ upsertDocument(document: Document): Promise<Document>;
587
+ getDocument(doc_id: string): Promise<Document | undefined>;
588
+ deleteDocument(doc_id: string): Promise<void>;
589
+ listDocuments(): Promise<string[]>;
590
+
591
+ // Tree traversal
592
+ getNode(doc_id: string, nodeId: string): Promise<DocumentNode | undefined>;
593
+ getAncestors(doc_id: string, nodeId: string): Promise<DocumentNode[]>;
594
+
595
+ // Chunks
596
+ upsertChunk(chunk: InsertChunkVectorEntity): Promise<ChunkVectorEntity>;
597
+ upsertChunksBulk(chunks: InsertChunkVectorEntity[]): Promise<ChunkVectorEntity[]>;
598
+ getChunk(chunk_id: string): Promise<ChunkVectorEntity | undefined>;
599
+ getChunksForDocument(doc_id: string): Promise<ChunkVectorEntity[]>;
600
+ deleteChunksForDocument(doc_id: string): Promise<void>;
601
+
602
+ // Search
603
+ similaritySearch(
604
+ query: TypedArray,
605
+ options?: VectorSearchOptions<ChunkRecord>
606
+ ): Promise<ChunkSearchResult[]>;
607
+ hybridSearch(
608
+ query: TypedArray,
609
+ options: HybridSearchOptions<ChunkRecord>
610
+ ): Promise<ChunkSearchResult[]>;
611
+
612
+ // Lifecycle
613
+ prepareReindex(doc_id: string): Promise<Document | undefined>;
614
+ setupDatabase(): Promise<void>;
615
+ destroy(): void;
616
+
617
+ // Accessors
618
+ put(chunk: InsertChunkVectorEntity): Promise<ChunkVectorEntity>;
619
+ putBulk(chunks: InsertChunkVectorEntity[]): Promise<ChunkVectorEntity[]>;
620
+ getAllChunks(): Promise<ChunkVectorEntity[] | undefined>;
621
+ chunkCount(): Promise<number>;
622
+ clearChunks(): Promise<void>;
623
+ getVectorDimensions(): number;
624
+ getDocumentChunks(doc_id: string): Promise<ChunkRecord[]>;
625
+ findChunksByNodeId(doc_id: string, nodeId: string): Promise<ChunkRecord[]>;
626
+ }
627
+ ```
628
+
629
+ ### createKnowledgeBase
630
+
631
+ ```typescript
632
+ function createKnowledgeBase(options: CreateKnowledgeBaseOptions): Promise<KnowledgeBase>;
633
+
634
+ interface CreateKnowledgeBaseOptions {
635
+ readonly name: string;
636
+ readonly vectorDimensions: number;
637
+ readonly backend?: "in-memory";
638
+ readonly vectorType?: { new (array: number[]): TypedArray };
639
+ readonly register?: boolean; // Default: true
640
+ }
641
+ ```
642
+
643
+ ### StructuralParser
644
+
645
+ ```typescript
646
+ class StructuralParser {
647
+ static parseMarkdown(doc_id: string, text: string, title: string): Promise<DocumentRootNode>;
648
+ static parsePlainText(doc_id: string, text: string, title: string): Promise<DocumentRootNode>;
649
+ static parse(
650
+ doc_id: string,
651
+ text: string,
652
+ title: string,
653
+ format?: string
654
+ ): Promise<DocumentRootNode>;
655
+ }
656
+ ```
657
+
658
+ ### Type Helpers
659
+
660
+ ```typescript
661
+ // Schema helper for task inputs that accept a KnowledgeBase ID or instance
662
+ function TypeKnowledgeBase<O>(options?: O): JsonSchema;
663
+
664
+ // Schema helper for tabular storage inputs
665
+ function TypeTabularStorage<O>(options?: O): JsonSchema;
666
+ ```
667
+
668
+ ## License
669
+
670
+ Apache 2.0 — see [LICENSE](./LICENSE) for details.