@workglow/dataset 0.0.110 → 0.0.114

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +504 -968
  2. package/dist/browser.js +376 -490
  3. package/dist/browser.js.map +13 -13
  4. package/dist/bun.js +376 -490
  5. package/dist/bun.js.map +13 -13
  6. package/dist/chunk/ChunkSchema.d.ts +206 -0
  7. package/dist/chunk/ChunkSchema.d.ts.map +1 -0
  8. package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
  9. package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
  10. package/dist/common.d.ts +5 -5
  11. package/dist/common.d.ts.map +1 -1
  12. package/dist/document/Document.d.ts +7 -6
  13. package/dist/document/Document.d.ts.map +1 -1
  14. package/dist/document/DocumentSchema.d.ts +0 -465
  15. package/dist/document/DocumentSchema.d.ts.map +1 -1
  16. package/dist/knowledge-base/KnowledgeBase.d.ts +122 -0
  17. package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
  18. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +24 -0
  19. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
  20. package/dist/knowledge-base/createKnowledgeBase.d.ts +28 -0
  21. package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
  22. package/dist/node.js +376 -490
  23. package/dist/node.js.map +13 -13
  24. package/dist/util/DatasetSchema.d.ts +9 -49
  25. package/dist/util/DatasetSchema.d.ts.map +1 -1
  26. package/package.json +5 -5
  27. package/dist/document/DocumentDataset.d.ts +0 -79
  28. package/dist/document/DocumentDataset.d.ts.map +0 -1
  29. package/dist/document/DocumentDatasetRegistry.d.ts +0 -29
  30. package/dist/document/DocumentDatasetRegistry.d.ts.map +0 -1
  31. package/dist/document-chunk/DocumentChunkDataset.d.ts +0 -79
  32. package/dist/document-chunk/DocumentChunkDataset.d.ts.map +0 -1
  33. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +0 -29
  34. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +0 -1
  35. package/dist/document-chunk/DocumentChunkSchema.d.ts +0 -56
  36. package/dist/document-chunk/DocumentChunkSchema.d.ts.map +0 -1
  37. package/src/document-chunk/README.md +0 -362
package/dist/browser.js CHANGED
@@ -1,227 +1,3 @@
1
- // src/util/DatasetSchema.ts
2
- function TypeTabularStorage(options = {}) {
3
- return {
4
- title: "Tabular Storage",
5
- description: "Storage ID or instance for tabular data storage",
6
- ...options,
7
- format: "storage:tabular",
8
- oneOf: [
9
- { type: "string", title: "Storage ID" },
10
- { title: "Storage Instance", additionalProperties: true }
11
- ]
12
- };
13
- }
14
- function TypeDocumentChunkDataset(options = {}) {
15
- return {
16
- title: "Document Chunk Dataset",
17
- description: "Dataset ID or instance for document chunk data storage",
18
- ...options,
19
- format: "dataset:document-chunk",
20
- anyOf: [
21
- { type: "string", title: "Dataset ID" },
22
- { title: "Dataset Instance", additionalProperties: true }
23
- ]
24
- };
25
- }
26
- function TypeDocumentDataset(options = {}) {
27
- return {
28
- title: "Document Dataset",
29
- description: "Dataset ID or instance for document data storage",
30
- ...options,
31
- format: "dataset:document",
32
- anyOf: [
33
- { type: "string", title: "Dataset ID" },
34
- { title: "Dataset Instance", additionalProperties: true }
35
- ]
36
- };
37
- }
38
- // src/document/Document.ts
39
- class Document {
40
- doc_id;
41
- metadata;
42
- root;
43
- chunks;
44
- constructor(root, metadata, chunks = [], doc_id) {
45
- this.doc_id = doc_id;
46
- this.root = root;
47
- this.metadata = metadata;
48
- this.chunks = chunks || [];
49
- }
50
- setChunks(chunks) {
51
- this.chunks = chunks;
52
- }
53
- getChunks() {
54
- return this.chunks;
55
- }
56
- setDocId(doc_id) {
57
- this.doc_id = doc_id;
58
- }
59
- findChunksByNodeId(nodeId) {
60
- return this.chunks.filter((chunk) => chunk.nodePath.includes(nodeId));
61
- }
62
- toJSON() {
63
- return {
64
- metadata: this.metadata,
65
- root: this.root,
66
- chunks: this.chunks
67
- };
68
- }
69
- static fromJSON(json, doc_id) {
70
- const obj = JSON.parse(json);
71
- return new Document(obj.root, obj.metadata, obj.chunks, doc_id);
72
- }
73
- }
74
- // src/document/DocumentDataset.ts
75
- class DocumentDataset {
76
- tabularStorage;
77
- vectorStorage;
78
- constructor(tabularStorage, vectorStorage) {
79
- this.tabularStorage = tabularStorage;
80
- this.vectorStorage = vectorStorage;
81
- }
82
- async upsert(document) {
83
- const serialized = JSON.stringify(document.toJSON());
84
- const insertEntity = {
85
- doc_id: document.doc_id,
86
- data: serialized
87
- };
88
- const entity = await this.tabularStorage.put(insertEntity);
89
- if (document.doc_id !== entity.doc_id) {
90
- document.setDocId(entity.doc_id);
91
- }
92
- return document;
93
- }
94
- async get(doc_id) {
95
- const entity = await this.tabularStorage.get({ doc_id });
96
- if (!entity) {
97
- return;
98
- }
99
- return Document.fromJSON(entity.data, entity.doc_id);
100
- }
101
- async delete(doc_id) {
102
- await this.tabularStorage.delete({ doc_id });
103
- }
104
- async getNode(doc_id, nodeId) {
105
- const doc = await this.get(doc_id);
106
- if (!doc) {
107
- return;
108
- }
109
- const traverse = (node) => {
110
- if (node.nodeId === nodeId) {
111
- return node;
112
- }
113
- if (node.children && Array.isArray(node.children)) {
114
- for (const child of node.children) {
115
- const found = traverse(child);
116
- if (found)
117
- return found;
118
- }
119
- }
120
- return;
121
- };
122
- return traverse(doc.root);
123
- }
124
- async getAncestors(doc_id, nodeId) {
125
- const doc = await this.get(doc_id);
126
- if (!doc) {
127
- return [];
128
- }
129
- const path = [];
130
- const findPath = (node) => {
131
- path.push(node.nodeId);
132
- if (node.nodeId === nodeId) {
133
- return true;
134
- }
135
- if (node.children && Array.isArray(node.children)) {
136
- for (const child of node.children) {
137
- if (findPath(child)) {
138
- return true;
139
- }
140
- }
141
- }
142
- path.pop();
143
- return false;
144
- };
145
- if (!findPath(doc.root)) {
146
- return [];
147
- }
148
- const ancestors = [];
149
- let currentNode = doc.root;
150
- ancestors.push(currentNode);
151
- for (let i = 1;i < path.length; i++) {
152
- const targetId = path[i];
153
- if (currentNode.children && Array.isArray(currentNode.children)) {
154
- const found = currentNode.children.find((child) => child.nodeId === targetId);
155
- if (found) {
156
- currentNode = found;
157
- ancestors.push(currentNode);
158
- } else {
159
- break;
160
- }
161
- } else {
162
- break;
163
- }
164
- }
165
- return ancestors;
166
- }
167
- async getChunks(doc_id) {
168
- const doc = await this.get(doc_id);
169
- if (!doc) {
170
- return [];
171
- }
172
- return doc.getChunks();
173
- }
174
- async findChunksByNodeId(doc_id, nodeId) {
175
- const doc = await this.get(doc_id);
176
- if (!doc) {
177
- return [];
178
- }
179
- if (doc.findChunksByNodeId) {
180
- return doc.findChunksByNodeId(nodeId);
181
- }
182
- const chunks = doc.getChunks();
183
- return chunks.filter((chunk) => chunk.nodePath && chunk.nodePath.includes(nodeId));
184
- }
185
- async list() {
186
- const entities = await this.tabularStorage.getAll();
187
- if (!entities) {
188
- return [];
189
- }
190
- return entities.map((e) => e.doc_id);
191
- }
192
- async search(query, options) {
193
- return this.vectorStorage?.similaritySearch(query, options) || [];
194
- }
195
- }
196
- // src/document/DocumentDatasetRegistry.ts
197
- import {
198
- createServiceToken,
199
- globalServiceRegistry,
200
- registerInputResolver
201
- } from "@workglow/util";
202
- var DOCUMENT_DATASETS = createServiceToken("dataset.documents");
203
- if (!globalServiceRegistry.has(DOCUMENT_DATASETS)) {
204
- globalServiceRegistry.register(DOCUMENT_DATASETS, () => new Map, true);
205
- }
206
- function getGlobalDocumentDatasets() {
207
- return globalServiceRegistry.get(DOCUMENT_DATASETS);
208
- }
209
- function registerDocumentDataset(id, dataset) {
210
- const datasets = getGlobalDocumentDatasets();
211
- datasets.set(id, dataset);
212
- }
213
- function getDocumentDataset(id) {
214
- return getGlobalDocumentDatasets().get(id);
215
- }
216
- async function resolveDocumentDatasetFromRegistry(id, format, registry) {
217
- const datasets = registry.has(DOCUMENT_DATASETS) ? registry.get(DOCUMENT_DATASETS) : getGlobalDocumentDatasets();
218
- const dataset = datasets.get(id);
219
- if (!dataset) {
220
- throw new Error(`Document dataset "${id}" not found in registry`);
221
- }
222
- return dataset;
223
- }
224
- registerInputResolver("dataset:document", resolveDocumentDatasetFromRegistry);
225
1
  // src/document/DocumentSchema.ts
226
2
  var NodeKind = {
227
3
  DOCUMENT: "document",
@@ -469,24 +245,31 @@ var TokenBudgetSchema = {
469
245
  required: ["maxTokensPerChunk", "overlapTokens", "reservedTokens"],
470
246
  additionalProperties: false
471
247
  };
472
- var ChunkEnrichmentSchema = {
248
+ var DocumentMetadataSchema = {
473
249
  type: "object",
474
250
  properties: {
475
- summary: {
251
+ title: {
476
252
  type: "string",
477
- title: "Summary",
478
- description: "Summary of the chunk content"
253
+ title: "Title",
254
+ description: "Document title"
479
255
  },
480
- entities: {
481
- type: "array",
482
- items: EntitySchema,
483
- title: "Entities",
484
- description: "Named entities extracted from the chunk"
256
+ sourceUri: {
257
+ type: "string",
258
+ title: "Source URI",
259
+ description: "Original source URI of the document"
260
+ },
261
+ createdAt: {
262
+ type: "string",
263
+ title: "Created At",
264
+ description: "ISO timestamp of creation"
485
265
  }
486
266
  },
487
- additionalProperties: false
267
+ required: ["title"],
268
+ additionalProperties: true
488
269
  };
489
- var ChunkNodeSchema = () => ({
270
+
271
+ // src/chunk/ChunkSchema.ts
272
+ var ChunkRecordSchema = () => ({
490
273
  type: "object",
491
274
  properties: {
492
275
  chunkId: {
@@ -515,45 +298,11 @@ var ChunkNodeSchema = () => ({
515
298
  title: "Depth",
516
299
  description: "Depth in the document tree"
517
300
  },
518
- enrichment: ChunkEnrichmentSchema
519
- },
520
- required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
521
- additionalProperties: false
522
- });
523
- var ChunkMetadataSchema = {
524
- type: "object",
525
- properties: {
526
- doc_id: {
527
- type: "string",
528
- title: "Document ID",
529
- description: "ID of the parent document"
530
- },
531
- chunkId: {
532
- type: "string",
533
- title: "Chunk ID",
534
- description: "Unique identifier for this chunk"
535
- },
536
301
  leafNodeId: {
537
302
  type: "string",
538
303
  title: "Leaf Node ID",
539
304
  description: "ID of the leaf node this chunk belongs to"
540
305
  },
541
- depth: {
542
- type: "integer",
543
- title: "Depth",
544
- description: "Depth in the document tree"
545
- },
546
- text: {
547
- type: "string",
548
- title: "Text",
549
- description: "Text content of the chunk"
550
- },
551
- nodePath: {
552
- type: "array",
553
- items: { type: "string" },
554
- title: "Node Path",
555
- description: "Node IDs from root to leaf"
556
- },
557
306
  summary: {
558
307
  type: "string",
559
308
  title: "Summary",
@@ -564,107 +313,372 @@ var ChunkMetadataSchema = {
564
313
  items: EntitySchema,
565
314
  title: "Entities",
566
315
  description: "Named entities extracted from the chunk"
316
+ },
317
+ parentSummaries: {
318
+ type: "array",
319
+ items: { type: "string" },
320
+ title: "Parent Summaries",
321
+ description: "Summaries from ancestor nodes"
322
+ },
323
+ sectionTitles: {
324
+ type: "array",
325
+ items: { type: "string" },
326
+ title: "Section Titles",
327
+ description: "Titles of ancestor section nodes"
328
+ },
329
+ doc_title: {
330
+ type: "string",
331
+ title: "Document Title",
332
+ description: "Title of the parent document"
567
333
  }
568
334
  },
569
- required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
335
+ required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
570
336
  additionalProperties: true
571
- };
572
- var ChunkMetadataArraySchema = {
337
+ });
338
+ var ChunkRecordArraySchema = {
573
339
  type: "array",
574
- items: ChunkMetadataSchema,
575
- title: "Chunk Metadata",
576
- description: "Metadata for each chunk"
340
+ items: ChunkRecordSchema(),
341
+ title: "Chunk Records",
342
+ description: "Array of chunk records"
577
343
  };
578
- var EnrichedChunkMetadataSchema = {
344
+ // src/chunk/ChunkVectorStorageSchema.ts
345
+ import { TypedArraySchema } from "@workglow/util";
346
+ var ChunkVectorStorageSchema = {
579
347
  type: "object",
580
348
  properties: {
581
- doc_id: {
582
- type: "string",
583
- title: "Document ID",
584
- description: "ID of the parent document"
585
- },
586
- chunkId: {
587
- type: "string",
588
- title: "Chunk ID",
589
- description: "Unique identifier for this chunk"
590
- },
591
- leafNodeId: {
592
- type: "string",
593
- title: "Leaf Node ID",
594
- description: "ID of the leaf node this chunk belongs to"
595
- },
596
- depth: {
597
- type: "integer",
598
- title: "Depth",
599
- description: "Depth in the document tree"
600
- },
601
- text: {
602
- type: "string",
603
- title: "Text",
604
- description: "Text content of the chunk"
605
- },
606
- nodePath: {
607
- type: "array",
608
- items: { type: "string" },
609
- title: "Node Path",
610
- description: "Node IDs from root to leaf"
611
- },
612
- summary: {
613
- type: "string",
614
- title: "Summary",
615
- description: "Summary of the chunk content"
616
- },
617
- entities: {
618
- type: "array",
619
- items: EntitySchema,
620
- title: "Entities",
621
- description: "Named entities (rolled up from hierarchy)"
622
- },
623
- parentSummaries: {
624
- type: "array",
625
- items: { type: "string" },
626
- title: "Parent Summaries",
627
- description: "Summaries from ancestor nodes"
628
- },
629
- sectionTitles: {
630
- type: "array",
631
- items: { type: "string" },
632
- title: "Section Titles",
633
- description: "Titles of ancestor section nodes"
634
- }
349
+ chunk_id: { type: "string", "x-auto-generated": true },
350
+ doc_id: { type: "string" },
351
+ vector: TypedArraySchema(),
352
+ metadata: { type: "object", format: "metadata", additionalProperties: true }
635
353
  },
636
- required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
637
- additionalProperties: true
638
- };
639
- var EnrichedChunkMetadataArraySchema = {
640
- type: "array",
641
- items: EnrichedChunkMetadataSchema,
642
- title: "Enriched Metadata",
643
- description: "Metadata enriched with hierarchy information"
354
+ required: ["chunk_id", "doc_id", "vector", "metadata"],
355
+ additionalProperties: false
644
356
  };
645
- var DocumentMetadataSchema = {
357
+ var ChunkVectorPrimaryKey = ["chunk_id"];
358
+ // src/document/Document.ts
359
+ class Document {
360
+ doc_id;
361
+ metadata;
362
+ root;
363
+ chunks;
364
+ constructor(root, metadata, chunks = [], doc_id) {
365
+ this.doc_id = doc_id;
366
+ this.root = root;
367
+ this.metadata = metadata;
368
+ this.chunks = chunks || [];
369
+ }
370
+ setChunks(chunks) {
371
+ this.chunks = chunks;
372
+ }
373
+ getChunks() {
374
+ return this.chunks;
375
+ }
376
+ setDocId(doc_id) {
377
+ this.doc_id = doc_id;
378
+ }
379
+ findChunksByNodeId(nodeId) {
380
+ return this.chunks.filter((chunk) => chunk.nodePath.includes(nodeId));
381
+ }
382
+ toJSON() {
383
+ return {
384
+ metadata: this.metadata,
385
+ root: this.root,
386
+ chunks: this.chunks
387
+ };
388
+ }
389
+ static fromJSON(json, doc_id) {
390
+ const obj = JSON.parse(json);
391
+ return new Document(obj.root, obj.metadata, obj.chunks, doc_id);
392
+ }
393
+ }
394
+
395
+ // src/knowledge-base/KnowledgeBase.ts
396
+ class KnowledgeBase {
397
+ name;
398
+ tabularStorage;
399
+ chunkStorage;
400
+ constructor(name, documentStorage, chunkStorage) {
401
+ this.name = name;
402
+ this.tabularStorage = documentStorage;
403
+ this.chunkStorage = chunkStorage;
404
+ }
405
+ async upsertDocument(document) {
406
+ const serialized = JSON.stringify(document.toJSON());
407
+ const insertEntity = {
408
+ doc_id: document.doc_id,
409
+ data: serialized
410
+ };
411
+ const entity = await this.tabularStorage.put(insertEntity);
412
+ if (document.doc_id !== entity.doc_id) {
413
+ document.setDocId(entity.doc_id);
414
+ }
415
+ return document;
416
+ }
417
+ async getDocument(doc_id) {
418
+ const entity = await this.tabularStorage.get({ doc_id });
419
+ if (!entity) {
420
+ return;
421
+ }
422
+ return Document.fromJSON(entity.data, entity.doc_id);
423
+ }
424
+ async deleteDocument(doc_id) {
425
+ await this.deleteChunksForDocument(doc_id);
426
+ await this.tabularStorage.delete({ doc_id });
427
+ }
428
+ async listDocuments() {
429
+ const entities = await this.tabularStorage.getAll();
430
+ if (!entities) {
431
+ return [];
432
+ }
433
+ return entities.map((e) => e.doc_id);
434
+ }
435
+ async getNode(doc_id, nodeId) {
436
+ const doc = await this.getDocument(doc_id);
437
+ if (!doc) {
438
+ return;
439
+ }
440
+ const traverse = (node) => {
441
+ if (node.nodeId === nodeId) {
442
+ return node;
443
+ }
444
+ if ("children" in node && Array.isArray(node.children)) {
445
+ for (const child of node.children) {
446
+ const found = traverse(child);
447
+ if (found)
448
+ return found;
449
+ }
450
+ }
451
+ return;
452
+ };
453
+ return traverse(doc.root);
454
+ }
455
+ async getAncestors(doc_id, nodeId) {
456
+ const doc = await this.getDocument(doc_id);
457
+ if (!doc) {
458
+ return [];
459
+ }
460
+ const path = [];
461
+ const findPath = (node) => {
462
+ path.push(node.nodeId);
463
+ if (node.nodeId === nodeId) {
464
+ return true;
465
+ }
466
+ if ("children" in node && Array.isArray(node.children)) {
467
+ for (const child of node.children) {
468
+ if (findPath(child)) {
469
+ return true;
470
+ }
471
+ }
472
+ }
473
+ path.pop();
474
+ return false;
475
+ };
476
+ if (!findPath(doc.root)) {
477
+ return [];
478
+ }
479
+ const ancestors = [];
480
+ let currentNode = doc.root;
481
+ ancestors.push(currentNode);
482
+ for (let i = 1;i < path.length; i++) {
483
+ const targetId = path[i];
484
+ if ("children" in currentNode && Array.isArray(currentNode.children)) {
485
+ const found = currentNode.children.find((child) => child.nodeId === targetId);
486
+ if (found) {
487
+ currentNode = found;
488
+ ancestors.push(currentNode);
489
+ } else {
490
+ break;
491
+ }
492
+ } else {
493
+ break;
494
+ }
495
+ }
496
+ return ancestors;
497
+ }
498
+ async upsertChunk(chunk) {
499
+ if (chunk.vector.length !== this.getVectorDimensions()) {
500
+ throw new Error(`Vector dimension mismatch: expected ${this.getVectorDimensions()}, got ${chunk.vector.length}.`);
501
+ }
502
+ return this.chunkStorage.put(chunk);
503
+ }
504
+ async upsertChunksBulk(chunks) {
505
+ const expected = this.getVectorDimensions();
506
+ for (const chunk of chunks) {
507
+ if (chunk.vector.length !== expected) {
508
+ throw new Error(`Vector dimension mismatch: expected ${expected}, got ${chunk.vector.length}.`);
509
+ }
510
+ }
511
+ return this.chunkStorage.putBulk(chunks);
512
+ }
513
+ async deleteChunksForDocument(doc_id) {
514
+ await this.chunkStorage.deleteSearch({ doc_id });
515
+ }
516
+ async getChunksForDocument(doc_id) {
517
+ const results = await this.chunkStorage.query({ doc_id });
518
+ return results ?? [];
519
+ }
520
+ async similaritySearch(query, options) {
521
+ return this.chunkStorage.similaritySearch(query, options);
522
+ }
523
+ async hybridSearch(query, options) {
524
+ if (typeof this.chunkStorage.hybridSearch !== "function") {
525
+ throw new Error("Hybrid search is not supported by the configured chunk storage backend. " + "Please use a vector storage implementation that provides `hybridSearch`.");
526
+ }
527
+ return this.chunkStorage.hybridSearch(query, options);
528
+ }
529
+ async prepareReindex(doc_id) {
530
+ const doc = await this.getDocument(doc_id);
531
+ if (!doc) {
532
+ return;
533
+ }
534
+ await this.deleteChunksForDocument(doc_id);
535
+ return doc;
536
+ }
537
+ async setupDatabase() {
538
+ await this.tabularStorage.setupDatabase();
539
+ await this.chunkStorage.setupDatabase();
540
+ }
541
+ destroy() {
542
+ this.tabularStorage.destroy();
543
+ this.chunkStorage.destroy();
544
+ }
545
+ async getChunk(chunk_id) {
546
+ return this.chunkStorage.get({ chunk_id });
547
+ }
548
+ async put(chunk) {
549
+ return this.chunkStorage.put(chunk);
550
+ }
551
+ async putBulk(chunks) {
552
+ return this.chunkStorage.putBulk(chunks);
553
+ }
554
+ async getAllChunks() {
555
+ return this.chunkStorage.getAll();
556
+ }
557
+ async chunkCount() {
558
+ return this.chunkStorage.size();
559
+ }
560
+ async clearChunks() {
561
+ return this.chunkStorage.deleteAll();
562
+ }
563
+ getVectorDimensions() {
564
+ return this.chunkStorage.getVectorDimensions();
565
+ }
566
+ async getDocumentChunks(doc_id) {
567
+ const doc = await this.getDocument(doc_id);
568
+ if (!doc) {
569
+ return [];
570
+ }
571
+ return doc.getChunks();
572
+ }
573
+ async findChunksByNodeId(doc_id, nodeId) {
574
+ const doc = await this.getDocument(doc_id);
575
+ if (!doc) {
576
+ return [];
577
+ }
578
+ return doc.findChunksByNodeId(nodeId);
579
+ }
580
+ }
581
+ // src/knowledge-base/KnowledgeBaseRegistry.ts
582
+ import {
583
+ createServiceToken,
584
+ globalServiceRegistry,
585
+ registerInputResolver
586
+ } from "@workglow/util";
587
+ var KNOWLEDGE_BASES = createServiceToken("dataset.knowledge-bases");
588
+ if (!globalServiceRegistry.has(KNOWLEDGE_BASES)) {
589
+ globalServiceRegistry.register(KNOWLEDGE_BASES, () => new Map, true);
590
+ }
591
+ function getGlobalKnowledgeBases() {
592
+ return globalServiceRegistry.get(KNOWLEDGE_BASES);
593
+ }
594
+ function registerKnowledgeBase(id, kb) {
595
+ const kbs = getGlobalKnowledgeBases();
596
+ kbs.set(id, kb);
597
+ }
598
+ function getKnowledgeBase(id) {
599
+ return getGlobalKnowledgeBases().get(id);
600
+ }
601
+ async function resolveKnowledgeBaseFromRegistry(id, format, registry) {
602
+ const kbs = registry.has(KNOWLEDGE_BASES) ? registry.get(KNOWLEDGE_BASES) : getGlobalKnowledgeBases();
603
+ const kb = kbs.get(id);
604
+ if (!kb) {
605
+ throw new Error(`Knowledge base "${id}" not found in registry`);
606
+ }
607
+ return kb;
608
+ }
609
+ registerInputResolver("dataset:knowledge-base", resolveKnowledgeBaseFromRegistry);
610
+ // src/knowledge-base/createKnowledgeBase.ts
611
+ import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
612
+
613
+ // src/document/DocumentStorageSchema.ts
614
+ var DocumentStorageSchema = {
646
615
  type: "object",
647
616
  properties: {
648
- title: {
617
+ doc_id: {
649
618
  type: "string",
650
- title: "Title",
651
- description: "Document title"
619
+ "x-auto-generated": true,
620
+ title: "Document ID",
621
+ description: "Unique identifier for the document"
652
622
  },
653
- sourceUri: {
623
+ data: {
654
624
  type: "string",
655
- title: "Source URI",
656
- description: "Original source URI of the document"
625
+ title: "Document Data",
626
+ description: "JSON-serialized document"
657
627
  },
658
- createdAt: {
659
- type: "string",
660
- title: "Created At",
661
- description: "ISO timestamp of creation"
628
+ metadata: {
629
+ type: "object",
630
+ title: "Metadata",
631
+ description: "Metadata of the document"
662
632
  }
663
633
  },
664
- required: ["title"],
634
+ required: ["doc_id", "data"],
665
635
  additionalProperties: true
666
636
  };
637
+ var DocumentStorageKey = ["doc_id"];
667
638
 
639
+ // src/knowledge-base/createKnowledgeBase.ts
640
+ async function createKnowledgeBase(options) {
641
+ const {
642
+ name,
643
+ vectorDimensions,
644
+ vectorType = Float32Array,
645
+ register: shouldRegister = true
646
+ } = options;
647
+ const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, DocumentStorageKey);
648
+ await tabularStorage.setupDatabase();
649
+ const vectorStorage = new InMemoryVectorStorage(ChunkVectorStorageSchema, ChunkVectorPrimaryKey, [], vectorDimensions, vectorType);
650
+ await vectorStorage.setupDatabase();
651
+ const kb = new KnowledgeBase(name, tabularStorage, vectorStorage);
652
+ if (shouldRegister) {
653
+ registerKnowledgeBase(name, kb);
654
+ }
655
+ return kb;
656
+ }
657
+ // src/util/DatasetSchema.ts
658
+ function TypeTabularStorage(options = {}) {
659
+ return {
660
+ title: "Tabular Storage",
661
+ description: "Storage ID or instance for tabular data storage",
662
+ ...options,
663
+ format: "storage:tabular",
664
+ oneOf: [
665
+ { type: "string", title: "Storage ID" },
666
+ { title: "Storage Instance", additionalProperties: true }
667
+ ]
668
+ };
669
+ }
670
+ function TypeKnowledgeBase(options = {}) {
671
+ return {
672
+ title: "Knowledge Base",
673
+ description: "Knowledge base ID or instance",
674
+ ...options,
675
+ format: "dataset:knowledge-base",
676
+ anyOf: [
677
+ { type: "string", title: "Knowledge Base ID" },
678
+ { title: "Knowledge Base Instance", additionalProperties: true }
679
+ ]
680
+ };
681
+ }
668
682
  // src/document/DocumentNode.ts
669
683
  function estimateTokens(text) {
670
684
  return Math.ceil(text.length / 4);
@@ -724,31 +738,6 @@ function getDocumentRange(root, nodePath) {
724
738
  }
725
739
  return currentNode.range;
726
740
  }
727
- // src/document/DocumentStorageSchema.ts
728
- var DocumentStorageSchema = {
729
- type: "object",
730
- properties: {
731
- doc_id: {
732
- type: "string",
733
- "x-auto-generated": true,
734
- title: "Document ID",
735
- description: "Unique identifier for the document"
736
- },
737
- data: {
738
- type: "string",
739
- title: "Document Data",
740
- description: "JSON-serialized document"
741
- },
742
- metadata: {
743
- type: "object",
744
- title: "Metadata",
745
- description: "Metadata of the document"
746
- }
747
- },
748
- required: ["doc_id", "data"],
749
- additionalProperties: true
750
- };
751
- var DocumentStorageKey = ["doc_id"];
752
741
  // src/document/StructuralParser.ts
753
742
  import { uuid4 } from "@workglow/util";
754
743
  class StructuralParser {
@@ -910,116 +899,19 @@ class StructuralParser {
910
899
  return /^#{1,6}\s/m.test(text);
911
900
  }
912
901
  }
913
- // src/document-chunk/DocumentChunkDataset.ts
914
- class DocumentChunkDataset {
915
- storage;
916
- constructor(storage) {
917
- this.storage = storage;
918
- }
919
- getStorage() {
920
- return this.storage;
921
- }
922
- async put(chunk) {
923
- return this.storage.put(chunk);
924
- }
925
- async putBulk(chunks) {
926
- return this.storage.putBulk(chunks);
927
- }
928
- async get(chunk_id) {
929
- const key = { chunk_id };
930
- return this.storage.get(key);
931
- }
932
- async delete(chunk_id) {
933
- const key = { chunk_id };
934
- return this.storage.delete(key);
935
- }
936
- async similaritySearch(query, options) {
937
- return this.storage.similaritySearch(query, options);
938
- }
939
- async hybridSearch(query, options) {
940
- if (this.storage.hybridSearch) {
941
- return this.storage.hybridSearch(query, options);
942
- }
943
- throw new Error("Hybrid search not supported by this storage backend");
944
- }
945
- async getAll() {
946
- return this.storage.getAll();
947
- }
948
- async size() {
949
- return this.storage.size();
950
- }
951
- async clear() {
952
- return this.storage.deleteAll();
953
- }
954
- destroy() {
955
- return this.storage.destroy();
956
- }
957
- async setupDatabase() {
958
- return this.storage.setupDatabase();
959
- }
960
- getVectorDimensions() {
961
- return this.storage.getVectorDimensions();
962
- }
963
- }
964
- // src/document-chunk/DocumentChunkDatasetRegistry.ts
965
- import {
966
- createServiceToken as createServiceToken2,
967
- globalServiceRegistry as globalServiceRegistry2,
968
- registerInputResolver as registerInputResolver2
969
- } from "@workglow/util";
970
- var DOCUMENT_CHUNK_DATASET = createServiceToken2("dataset.document-chunk");
971
- if (!globalServiceRegistry2.has(DOCUMENT_CHUNK_DATASET)) {
972
- globalServiceRegistry2.register(DOCUMENT_CHUNK_DATASET, () => new Map, true);
973
- }
974
- function getGlobalDocumentChunkDataset() {
975
- return globalServiceRegistry2.get(DOCUMENT_CHUNK_DATASET);
976
- }
977
- function registerDocumentChunkDataset(id, dataset) {
978
- const datasets = getGlobalDocumentChunkDataset();
979
- datasets.set(id, dataset);
980
- }
981
- function getDocumentChunkDataset(id) {
982
- return getGlobalDocumentChunkDataset().get(id);
983
- }
984
- async function resolveDocumentChunkDatasetFromRegistry(id, format, registry) {
985
- const datasets = registry.has(DOCUMENT_CHUNK_DATASET) ? registry.get(DOCUMENT_CHUNK_DATASET) : getGlobalDocumentChunkDataset();
986
- const dataset = datasets.get(id);
987
- if (!dataset) {
988
- throw new Error(`Document chunk dataset "${id}" not found in registry`);
989
- }
990
- return dataset;
991
- }
992
- registerInputResolver2("dataset:document-chunk", resolveDocumentChunkDatasetFromRegistry);
993
- // src/document-chunk/DocumentChunkSchema.ts
994
- import { TypedArraySchema } from "@workglow/util";
995
- var DocumentChunkSchema = {
996
- type: "object",
997
- properties: {
998
- chunk_id: { type: "string", "x-auto-generated": true },
999
- doc_id: { type: "string" },
1000
- vector: TypedArraySchema(),
1001
- metadata: { type: "object", format: "metadata", additionalProperties: true }
1002
- },
1003
- required: ["chunk_id", "doc_id", "vector", "metadata"],
1004
- additionalProperties: false
1005
- };
1006
- var DocumentChunkPrimaryKey = ["chunk_id"];
1007
902
  export {
1008
903
  traverseDepthFirst,
1009
- registerDocumentDataset,
1010
- registerDocumentChunkDataset,
904
+ registerKnowledgeBase,
1011
905
  hasChildren,
1012
906
  getNodePath,
1013
- getGlobalDocumentDatasets,
1014
- getGlobalDocumentChunkDataset,
907
+ getKnowledgeBase,
908
+ getGlobalKnowledgeBases,
1015
909
  getDocumentRange,
1016
- getDocumentDataset,
1017
- getDocumentChunkDataset,
1018
910
  getChildren,
1019
911
  estimateTokens,
912
+ createKnowledgeBase,
1020
913
  TypeTabularStorage,
1021
- TypeDocumentDataset,
1022
- TypeDocumentChunkDataset,
914
+ TypeKnowledgeBase,
1023
915
  TopicNodeSchema,
1024
916
  TokenBudgetSchema,
1025
917
  StructuralParser,
@@ -1029,26 +921,20 @@ export {
1029
921
  NodeRangeSchema,
1030
922
  NodeKind,
1031
923
  NodeEnrichmentSchema,
924
+ KnowledgeBase,
925
+ KNOWLEDGE_BASES,
1032
926
  EntitySchema,
1033
- EnrichedChunkMetadataSchema,
1034
- EnrichedChunkMetadataArraySchema,
1035
927
  DocumentStorageSchema,
1036
928
  DocumentStorageKey,
1037
929
  DocumentRootNodeSchema,
1038
930
  DocumentNodeSchema,
1039
931
  DocumentNodeBaseSchema,
1040
932
  DocumentMetadataSchema,
1041
- DocumentDataset,
1042
- DocumentChunkSchema,
1043
- DocumentChunkPrimaryKey,
1044
- DocumentChunkDataset,
1045
933
  Document,
1046
- DOCUMENT_DATASETS,
1047
- DOCUMENT_CHUNK_DATASET,
1048
- ChunkNodeSchema,
1049
- ChunkMetadataSchema,
1050
- ChunkMetadataArraySchema,
1051
- ChunkEnrichmentSchema
934
+ ChunkVectorStorageSchema,
935
+ ChunkVectorPrimaryKey,
936
+ ChunkRecordSchema,
937
+ ChunkRecordArraySchema
1052
938
  };
1053
939
 
1054
- //# debugId=A87A6D34F0F0F74964756E2164756E21
940
+ //# debugId=37F95C4A725151A064756E2164756E21