@workglow/dataset 0.0.109 → 0.0.113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +504 -968
  2. package/dist/browser.js +376 -490
  3. package/dist/browser.js.map +13 -13
  4. package/dist/bun.js +376 -490
  5. package/dist/bun.js.map +13 -13
  6. package/dist/chunk/ChunkSchema.d.ts +206 -0
  7. package/dist/chunk/ChunkSchema.d.ts.map +1 -0
  8. package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
  9. package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
  10. package/dist/common.d.ts +5 -5
  11. package/dist/common.d.ts.map +1 -1
  12. package/dist/document/Document.d.ts +7 -6
  13. package/dist/document/Document.d.ts.map +1 -1
  14. package/dist/document/DocumentSchema.d.ts +0 -465
  15. package/dist/document/DocumentSchema.d.ts.map +1 -1
  16. package/dist/knowledge-base/KnowledgeBase.d.ts +122 -0
  17. package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
  18. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +24 -0
  19. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
  20. package/dist/knowledge-base/createKnowledgeBase.d.ts +28 -0
  21. package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
  22. package/dist/node.js +376 -490
  23. package/dist/node.js.map +13 -13
  24. package/dist/util/DatasetSchema.d.ts +9 -49
  25. package/dist/util/DatasetSchema.d.ts.map +1 -1
  26. package/package.json +7 -5
  27. package/dist/document/DocumentDataset.d.ts +0 -79
  28. package/dist/document/DocumentDataset.d.ts.map +0 -1
  29. package/dist/document/DocumentDatasetRegistry.d.ts +0 -29
  30. package/dist/document/DocumentDatasetRegistry.d.ts.map +0 -1
  31. package/dist/document-chunk/DocumentChunkDataset.d.ts +0 -79
  32. package/dist/document-chunk/DocumentChunkDataset.d.ts.map +0 -1
  33. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +0 -29
  34. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +0 -1
  35. package/dist/document-chunk/DocumentChunkSchema.d.ts +0 -56
  36. package/dist/document-chunk/DocumentChunkSchema.d.ts.map +0 -1
  37. package/src/document-chunk/README.md +0 -362
package/dist/bun.js CHANGED
@@ -1,228 +1,4 @@
1
1
  // @bun
2
- // src/util/DatasetSchema.ts
3
- function TypeTabularStorage(options = {}) {
4
- return {
5
- title: "Tabular Storage",
6
- description: "Storage ID or instance for tabular data storage",
7
- ...options,
8
- format: "storage:tabular",
9
- oneOf: [
10
- { type: "string", title: "Storage ID" },
11
- { title: "Storage Instance", additionalProperties: true }
12
- ]
13
- };
14
- }
15
- function TypeDocumentChunkDataset(options = {}) {
16
- return {
17
- title: "Document Chunk Dataset",
18
- description: "Dataset ID or instance for document chunk data storage",
19
- ...options,
20
- format: "dataset:document-chunk",
21
- anyOf: [
22
- { type: "string", title: "Dataset ID" },
23
- { title: "Dataset Instance", additionalProperties: true }
24
- ]
25
- };
26
- }
27
- function TypeDocumentDataset(options = {}) {
28
- return {
29
- title: "Document Dataset",
30
- description: "Dataset ID or instance for document data storage",
31
- ...options,
32
- format: "dataset:document",
33
- anyOf: [
34
- { type: "string", title: "Dataset ID" },
35
- { title: "Dataset Instance", additionalProperties: true }
36
- ]
37
- };
38
- }
39
- // src/document/Document.ts
40
- class Document {
41
- doc_id;
42
- metadata;
43
- root;
44
- chunks;
45
- constructor(root, metadata, chunks = [], doc_id) {
46
- this.doc_id = doc_id;
47
- this.root = root;
48
- this.metadata = metadata;
49
- this.chunks = chunks || [];
50
- }
51
- setChunks(chunks) {
52
- this.chunks = chunks;
53
- }
54
- getChunks() {
55
- return this.chunks;
56
- }
57
- setDocId(doc_id) {
58
- this.doc_id = doc_id;
59
- }
60
- findChunksByNodeId(nodeId) {
61
- return this.chunks.filter((chunk) => chunk.nodePath.includes(nodeId));
62
- }
63
- toJSON() {
64
- return {
65
- metadata: this.metadata,
66
- root: this.root,
67
- chunks: this.chunks
68
- };
69
- }
70
- static fromJSON(json, doc_id) {
71
- const obj = JSON.parse(json);
72
- return new Document(obj.root, obj.metadata, obj.chunks, doc_id);
73
- }
74
- }
75
- // src/document/DocumentDataset.ts
76
- class DocumentDataset {
77
- tabularStorage;
78
- vectorStorage;
79
- constructor(tabularStorage, vectorStorage) {
80
- this.tabularStorage = tabularStorage;
81
- this.vectorStorage = vectorStorage;
82
- }
83
- async upsert(document) {
84
- const serialized = JSON.stringify(document.toJSON());
85
- const insertEntity = {
86
- doc_id: document.doc_id,
87
- data: serialized
88
- };
89
- const entity = await this.tabularStorage.put(insertEntity);
90
- if (document.doc_id !== entity.doc_id) {
91
- document.setDocId(entity.doc_id);
92
- }
93
- return document;
94
- }
95
- async get(doc_id) {
96
- const entity = await this.tabularStorage.get({ doc_id });
97
- if (!entity) {
98
- return;
99
- }
100
- return Document.fromJSON(entity.data, entity.doc_id);
101
- }
102
- async delete(doc_id) {
103
- await this.tabularStorage.delete({ doc_id });
104
- }
105
- async getNode(doc_id, nodeId) {
106
- const doc = await this.get(doc_id);
107
- if (!doc) {
108
- return;
109
- }
110
- const traverse = (node) => {
111
- if (node.nodeId === nodeId) {
112
- return node;
113
- }
114
- if (node.children && Array.isArray(node.children)) {
115
- for (const child of node.children) {
116
- const found = traverse(child);
117
- if (found)
118
- return found;
119
- }
120
- }
121
- return;
122
- };
123
- return traverse(doc.root);
124
- }
125
- async getAncestors(doc_id, nodeId) {
126
- const doc = await this.get(doc_id);
127
- if (!doc) {
128
- return [];
129
- }
130
- const path = [];
131
- const findPath = (node) => {
132
- path.push(node.nodeId);
133
- if (node.nodeId === nodeId) {
134
- return true;
135
- }
136
- if (node.children && Array.isArray(node.children)) {
137
- for (const child of node.children) {
138
- if (findPath(child)) {
139
- return true;
140
- }
141
- }
142
- }
143
- path.pop();
144
- return false;
145
- };
146
- if (!findPath(doc.root)) {
147
- return [];
148
- }
149
- const ancestors = [];
150
- let currentNode = doc.root;
151
- ancestors.push(currentNode);
152
- for (let i = 1;i < path.length; i++) {
153
- const targetId = path[i];
154
- if (currentNode.children && Array.isArray(currentNode.children)) {
155
- const found = currentNode.children.find((child) => child.nodeId === targetId);
156
- if (found) {
157
- currentNode = found;
158
- ancestors.push(currentNode);
159
- } else {
160
- break;
161
- }
162
- } else {
163
- break;
164
- }
165
- }
166
- return ancestors;
167
- }
168
- async getChunks(doc_id) {
169
- const doc = await this.get(doc_id);
170
- if (!doc) {
171
- return [];
172
- }
173
- return doc.getChunks();
174
- }
175
- async findChunksByNodeId(doc_id, nodeId) {
176
- const doc = await this.get(doc_id);
177
- if (!doc) {
178
- return [];
179
- }
180
- if (doc.findChunksByNodeId) {
181
- return doc.findChunksByNodeId(nodeId);
182
- }
183
- const chunks = doc.getChunks();
184
- return chunks.filter((chunk) => chunk.nodePath && chunk.nodePath.includes(nodeId));
185
- }
186
- async list() {
187
- const entities = await this.tabularStorage.getAll();
188
- if (!entities) {
189
- return [];
190
- }
191
- return entities.map((e) => e.doc_id);
192
- }
193
- async search(query, options) {
194
- return this.vectorStorage?.similaritySearch(query, options) || [];
195
- }
196
- }
197
- // src/document/DocumentDatasetRegistry.ts
198
- import {
199
- createServiceToken,
200
- globalServiceRegistry,
201
- registerInputResolver
202
- } from "@workglow/util";
203
- var DOCUMENT_DATASETS = createServiceToken("dataset.documents");
204
- if (!globalServiceRegistry.has(DOCUMENT_DATASETS)) {
205
- globalServiceRegistry.register(DOCUMENT_DATASETS, () => new Map, true);
206
- }
207
- function getGlobalDocumentDatasets() {
208
- return globalServiceRegistry.get(DOCUMENT_DATASETS);
209
- }
210
- function registerDocumentDataset(id, dataset) {
211
- const datasets = getGlobalDocumentDatasets();
212
- datasets.set(id, dataset);
213
- }
214
- function getDocumentDataset(id) {
215
- return getGlobalDocumentDatasets().get(id);
216
- }
217
- async function resolveDocumentDatasetFromRegistry(id, format, registry) {
218
- const datasets = registry.has(DOCUMENT_DATASETS) ? registry.get(DOCUMENT_DATASETS) : getGlobalDocumentDatasets();
219
- const dataset = datasets.get(id);
220
- if (!dataset) {
221
- throw new Error(`Document dataset "${id}" not found in registry`);
222
- }
223
- return dataset;
224
- }
225
- registerInputResolver("dataset:document", resolveDocumentDatasetFromRegistry);
226
2
  // src/document/DocumentSchema.ts
227
3
  var NodeKind = {
228
4
  DOCUMENT: "document",
@@ -470,24 +246,31 @@ var TokenBudgetSchema = {
470
246
  required: ["maxTokensPerChunk", "overlapTokens", "reservedTokens"],
471
247
  additionalProperties: false
472
248
  };
473
- var ChunkEnrichmentSchema = {
249
+ var DocumentMetadataSchema = {
474
250
  type: "object",
475
251
  properties: {
476
- summary: {
252
+ title: {
477
253
  type: "string",
478
- title: "Summary",
479
- description: "Summary of the chunk content"
254
+ title: "Title",
255
+ description: "Document title"
480
256
  },
481
- entities: {
482
- type: "array",
483
- items: EntitySchema,
484
- title: "Entities",
485
- description: "Named entities extracted from the chunk"
257
+ sourceUri: {
258
+ type: "string",
259
+ title: "Source URI",
260
+ description: "Original source URI of the document"
261
+ },
262
+ createdAt: {
263
+ type: "string",
264
+ title: "Created At",
265
+ description: "ISO timestamp of creation"
486
266
  }
487
267
  },
488
- additionalProperties: false
268
+ required: ["title"],
269
+ additionalProperties: true
489
270
  };
490
- var ChunkNodeSchema = () => ({
271
+
272
+ // src/chunk/ChunkSchema.ts
273
+ var ChunkRecordSchema = () => ({
491
274
  type: "object",
492
275
  properties: {
493
276
  chunkId: {
@@ -516,45 +299,11 @@ var ChunkNodeSchema = () => ({
516
299
  title: "Depth",
517
300
  description: "Depth in the document tree"
518
301
  },
519
- enrichment: ChunkEnrichmentSchema
520
- },
521
- required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
522
- additionalProperties: false
523
- });
524
- var ChunkMetadataSchema = {
525
- type: "object",
526
- properties: {
527
- doc_id: {
528
- type: "string",
529
- title: "Document ID",
530
- description: "ID of the parent document"
531
- },
532
- chunkId: {
533
- type: "string",
534
- title: "Chunk ID",
535
- description: "Unique identifier for this chunk"
536
- },
537
302
  leafNodeId: {
538
303
  type: "string",
539
304
  title: "Leaf Node ID",
540
305
  description: "ID of the leaf node this chunk belongs to"
541
306
  },
542
- depth: {
543
- type: "integer",
544
- title: "Depth",
545
- description: "Depth in the document tree"
546
- },
547
- text: {
548
- type: "string",
549
- title: "Text",
550
- description: "Text content of the chunk"
551
- },
552
- nodePath: {
553
- type: "array",
554
- items: { type: "string" },
555
- title: "Node Path",
556
- description: "Node IDs from root to leaf"
557
- },
558
307
  summary: {
559
308
  type: "string",
560
309
  title: "Summary",
@@ -565,107 +314,372 @@ var ChunkMetadataSchema = {
565
314
  items: EntitySchema,
566
315
  title: "Entities",
567
316
  description: "Named entities extracted from the chunk"
317
+ },
318
+ parentSummaries: {
319
+ type: "array",
320
+ items: { type: "string" },
321
+ title: "Parent Summaries",
322
+ description: "Summaries from ancestor nodes"
323
+ },
324
+ sectionTitles: {
325
+ type: "array",
326
+ items: { type: "string" },
327
+ title: "Section Titles",
328
+ description: "Titles of ancestor section nodes"
329
+ },
330
+ doc_title: {
331
+ type: "string",
332
+ title: "Document Title",
333
+ description: "Title of the parent document"
568
334
  }
569
335
  },
570
- required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
336
+ required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
571
337
  additionalProperties: true
572
- };
573
- var ChunkMetadataArraySchema = {
338
+ });
339
+ var ChunkRecordArraySchema = {
574
340
  type: "array",
575
- items: ChunkMetadataSchema,
576
- title: "Chunk Metadata",
577
- description: "Metadata for each chunk"
341
+ items: ChunkRecordSchema(),
342
+ title: "Chunk Records",
343
+ description: "Array of chunk records"
578
344
  };
579
- var EnrichedChunkMetadataSchema = {
345
+ // src/chunk/ChunkVectorStorageSchema.ts
346
+ import { TypedArraySchema } from "@workglow/util";
347
+ var ChunkVectorStorageSchema = {
580
348
  type: "object",
581
349
  properties: {
582
- doc_id: {
583
- type: "string",
584
- title: "Document ID",
585
- description: "ID of the parent document"
586
- },
587
- chunkId: {
588
- type: "string",
589
- title: "Chunk ID",
590
- description: "Unique identifier for this chunk"
591
- },
592
- leafNodeId: {
593
- type: "string",
594
- title: "Leaf Node ID",
595
- description: "ID of the leaf node this chunk belongs to"
596
- },
597
- depth: {
598
- type: "integer",
599
- title: "Depth",
600
- description: "Depth in the document tree"
601
- },
602
- text: {
603
- type: "string",
604
- title: "Text",
605
- description: "Text content of the chunk"
606
- },
607
- nodePath: {
608
- type: "array",
609
- items: { type: "string" },
610
- title: "Node Path",
611
- description: "Node IDs from root to leaf"
612
- },
613
- summary: {
614
- type: "string",
615
- title: "Summary",
616
- description: "Summary of the chunk content"
617
- },
618
- entities: {
619
- type: "array",
620
- items: EntitySchema,
621
- title: "Entities",
622
- description: "Named entities (rolled up from hierarchy)"
623
- },
624
- parentSummaries: {
625
- type: "array",
626
- items: { type: "string" },
627
- title: "Parent Summaries",
628
- description: "Summaries from ancestor nodes"
629
- },
630
- sectionTitles: {
631
- type: "array",
632
- items: { type: "string" },
633
- title: "Section Titles",
634
- description: "Titles of ancestor section nodes"
635
- }
350
+ chunk_id: { type: "string", "x-auto-generated": true },
351
+ doc_id: { type: "string" },
352
+ vector: TypedArraySchema(),
353
+ metadata: { type: "object", format: "metadata", additionalProperties: true }
636
354
  },
637
- required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
638
- additionalProperties: true
639
- };
640
- var EnrichedChunkMetadataArraySchema = {
641
- type: "array",
642
- items: EnrichedChunkMetadataSchema,
643
- title: "Enriched Metadata",
644
- description: "Metadata enriched with hierarchy information"
355
+ required: ["chunk_id", "doc_id", "vector", "metadata"],
356
+ additionalProperties: false
645
357
  };
646
- var DocumentMetadataSchema = {
358
+ var ChunkVectorPrimaryKey = ["chunk_id"];
359
+ // src/document/Document.ts
360
+ class Document {
361
+ doc_id;
362
+ metadata;
363
+ root;
364
+ chunks;
365
+ constructor(root, metadata, chunks = [], doc_id) {
366
+ this.doc_id = doc_id;
367
+ this.root = root;
368
+ this.metadata = metadata;
369
+ this.chunks = chunks || [];
370
+ }
371
+ setChunks(chunks) {
372
+ this.chunks = chunks;
373
+ }
374
+ getChunks() {
375
+ return this.chunks;
376
+ }
377
+ setDocId(doc_id) {
378
+ this.doc_id = doc_id;
379
+ }
380
+ findChunksByNodeId(nodeId) {
381
+ return this.chunks.filter((chunk) => chunk.nodePath.includes(nodeId));
382
+ }
383
+ toJSON() {
384
+ return {
385
+ metadata: this.metadata,
386
+ root: this.root,
387
+ chunks: this.chunks
388
+ };
389
+ }
390
+ static fromJSON(json, doc_id) {
391
+ const obj = JSON.parse(json);
392
+ return new Document(obj.root, obj.metadata, obj.chunks, doc_id);
393
+ }
394
+ }
395
+
396
+ // src/knowledge-base/KnowledgeBase.ts
397
+ class KnowledgeBase {
398
+ name;
399
+ tabularStorage;
400
+ chunkStorage;
401
+ constructor(name, documentStorage, chunkStorage) {
402
+ this.name = name;
403
+ this.tabularStorage = documentStorage;
404
+ this.chunkStorage = chunkStorage;
405
+ }
406
+ async upsertDocument(document) {
407
+ const serialized = JSON.stringify(document.toJSON());
408
+ const insertEntity = {
409
+ doc_id: document.doc_id,
410
+ data: serialized
411
+ };
412
+ const entity = await this.tabularStorage.put(insertEntity);
413
+ if (document.doc_id !== entity.doc_id) {
414
+ document.setDocId(entity.doc_id);
415
+ }
416
+ return document;
417
+ }
418
+ async getDocument(doc_id) {
419
+ const entity = await this.tabularStorage.get({ doc_id });
420
+ if (!entity) {
421
+ return;
422
+ }
423
+ return Document.fromJSON(entity.data, entity.doc_id);
424
+ }
425
+ async deleteDocument(doc_id) {
426
+ await this.deleteChunksForDocument(doc_id);
427
+ await this.tabularStorage.delete({ doc_id });
428
+ }
429
+ async listDocuments() {
430
+ const entities = await this.tabularStorage.getAll();
431
+ if (!entities) {
432
+ return [];
433
+ }
434
+ return entities.map((e) => e.doc_id);
435
+ }
436
+ async getNode(doc_id, nodeId) {
437
+ const doc = await this.getDocument(doc_id);
438
+ if (!doc) {
439
+ return;
440
+ }
441
+ const traverse = (node) => {
442
+ if (node.nodeId === nodeId) {
443
+ return node;
444
+ }
445
+ if ("children" in node && Array.isArray(node.children)) {
446
+ for (const child of node.children) {
447
+ const found = traverse(child);
448
+ if (found)
449
+ return found;
450
+ }
451
+ }
452
+ return;
453
+ };
454
+ return traverse(doc.root);
455
+ }
456
+ async getAncestors(doc_id, nodeId) {
457
+ const doc = await this.getDocument(doc_id);
458
+ if (!doc) {
459
+ return [];
460
+ }
461
+ const path = [];
462
+ const findPath = (node) => {
463
+ path.push(node.nodeId);
464
+ if (node.nodeId === nodeId) {
465
+ return true;
466
+ }
467
+ if ("children" in node && Array.isArray(node.children)) {
468
+ for (const child of node.children) {
469
+ if (findPath(child)) {
470
+ return true;
471
+ }
472
+ }
473
+ }
474
+ path.pop();
475
+ return false;
476
+ };
477
+ if (!findPath(doc.root)) {
478
+ return [];
479
+ }
480
+ const ancestors = [];
481
+ let currentNode = doc.root;
482
+ ancestors.push(currentNode);
483
+ for (let i = 1;i < path.length; i++) {
484
+ const targetId = path[i];
485
+ if ("children" in currentNode && Array.isArray(currentNode.children)) {
486
+ const found = currentNode.children.find((child) => child.nodeId === targetId);
487
+ if (found) {
488
+ currentNode = found;
489
+ ancestors.push(currentNode);
490
+ } else {
491
+ break;
492
+ }
493
+ } else {
494
+ break;
495
+ }
496
+ }
497
+ return ancestors;
498
+ }
499
+ async upsertChunk(chunk) {
500
+ if (chunk.vector.length !== this.getVectorDimensions()) {
501
+ throw new Error(`Vector dimension mismatch: expected ${this.getVectorDimensions()}, got ${chunk.vector.length}.`);
502
+ }
503
+ return this.chunkStorage.put(chunk);
504
+ }
505
+ async upsertChunksBulk(chunks) {
506
+ const expected = this.getVectorDimensions();
507
+ for (const chunk of chunks) {
508
+ if (chunk.vector.length !== expected) {
509
+ throw new Error(`Vector dimension mismatch: expected ${expected}, got ${chunk.vector.length}.`);
510
+ }
511
+ }
512
+ return this.chunkStorage.putBulk(chunks);
513
+ }
514
+ async deleteChunksForDocument(doc_id) {
515
+ await this.chunkStorage.deleteSearch({ doc_id });
516
+ }
517
+ async getChunksForDocument(doc_id) {
518
+ const results = await this.chunkStorage.query({ doc_id });
519
+ return results ?? [];
520
+ }
521
+ async similaritySearch(query, options) {
522
+ return this.chunkStorage.similaritySearch(query, options);
523
+ }
524
+ async hybridSearch(query, options) {
525
+ if (typeof this.chunkStorage.hybridSearch !== "function") {
526
+ throw new Error("Hybrid search is not supported by the configured chunk storage backend. " + "Please use a vector storage implementation that provides `hybridSearch`.");
527
+ }
528
+ return this.chunkStorage.hybridSearch(query, options);
529
+ }
530
+ async prepareReindex(doc_id) {
531
+ const doc = await this.getDocument(doc_id);
532
+ if (!doc) {
533
+ return;
534
+ }
535
+ await this.deleteChunksForDocument(doc_id);
536
+ return doc;
537
+ }
538
+ async setupDatabase() {
539
+ await this.tabularStorage.setupDatabase();
540
+ await this.chunkStorage.setupDatabase();
541
+ }
542
+ destroy() {
543
+ this.tabularStorage.destroy();
544
+ this.chunkStorage.destroy();
545
+ }
546
+ async getChunk(chunk_id) {
547
+ return this.chunkStorage.get({ chunk_id });
548
+ }
549
+ async put(chunk) {
550
+ return this.chunkStorage.put(chunk);
551
+ }
552
+ async putBulk(chunks) {
553
+ return this.chunkStorage.putBulk(chunks);
554
+ }
555
+ async getAllChunks() {
556
+ return this.chunkStorage.getAll();
557
+ }
558
+ async chunkCount() {
559
+ return this.chunkStorage.size();
560
+ }
561
+ async clearChunks() {
562
+ return this.chunkStorage.deleteAll();
563
+ }
564
+ getVectorDimensions() {
565
+ return this.chunkStorage.getVectorDimensions();
566
+ }
567
+ async getDocumentChunks(doc_id) {
568
+ const doc = await this.getDocument(doc_id);
569
+ if (!doc) {
570
+ return [];
571
+ }
572
+ return doc.getChunks();
573
+ }
574
+ async findChunksByNodeId(doc_id, nodeId) {
575
+ const doc = await this.getDocument(doc_id);
576
+ if (!doc) {
577
+ return [];
578
+ }
579
+ return doc.findChunksByNodeId(nodeId);
580
+ }
581
+ }
582
+ // src/knowledge-base/KnowledgeBaseRegistry.ts
583
+ import {
584
+ createServiceToken,
585
+ globalServiceRegistry,
586
+ registerInputResolver
587
+ } from "@workglow/util";
588
+ var KNOWLEDGE_BASES = createServiceToken("dataset.knowledge-bases");
589
+ if (!globalServiceRegistry.has(KNOWLEDGE_BASES)) {
590
+ globalServiceRegistry.register(KNOWLEDGE_BASES, () => new Map, true);
591
+ }
592
+ function getGlobalKnowledgeBases() {
593
+ return globalServiceRegistry.get(KNOWLEDGE_BASES);
594
+ }
595
+ function registerKnowledgeBase(id, kb) {
596
+ const kbs = getGlobalKnowledgeBases();
597
+ kbs.set(id, kb);
598
+ }
599
+ function getKnowledgeBase(id) {
600
+ return getGlobalKnowledgeBases().get(id);
601
+ }
602
+ async function resolveKnowledgeBaseFromRegistry(id, format, registry) {
603
+ const kbs = registry.has(KNOWLEDGE_BASES) ? registry.get(KNOWLEDGE_BASES) : getGlobalKnowledgeBases();
604
+ const kb = kbs.get(id);
605
+ if (!kb) {
606
+ throw new Error(`Knowledge base "${id}" not found in registry`);
607
+ }
608
+ return kb;
609
+ }
610
+ registerInputResolver("dataset:knowledge-base", resolveKnowledgeBaseFromRegistry);
611
+ // src/knowledge-base/createKnowledgeBase.ts
612
+ import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
613
+
614
+ // src/document/DocumentStorageSchema.ts
615
+ var DocumentStorageSchema = {
647
616
  type: "object",
648
617
  properties: {
649
- title: {
618
+ doc_id: {
650
619
  type: "string",
651
- title: "Title",
652
- description: "Document title"
620
+ "x-auto-generated": true,
621
+ title: "Document ID",
622
+ description: "Unique identifier for the document"
653
623
  },
654
- sourceUri: {
624
+ data: {
655
625
  type: "string",
656
- title: "Source URI",
657
- description: "Original source URI of the document"
626
+ title: "Document Data",
627
+ description: "JSON-serialized document"
658
628
  },
659
- createdAt: {
660
- type: "string",
661
- title: "Created At",
662
- description: "ISO timestamp of creation"
629
+ metadata: {
630
+ type: "object",
631
+ title: "Metadata",
632
+ description: "Metadata of the document"
663
633
  }
664
634
  },
665
- required: ["title"],
635
+ required: ["doc_id", "data"],
666
636
  additionalProperties: true
667
637
  };
638
+ var DocumentStorageKey = ["doc_id"];
668
639
 
640
+ // src/knowledge-base/createKnowledgeBase.ts
641
+ async function createKnowledgeBase(options) {
642
+ const {
643
+ name,
644
+ vectorDimensions,
645
+ vectorType = Float32Array,
646
+ register: shouldRegister = true
647
+ } = options;
648
+ const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, DocumentStorageKey);
649
+ await tabularStorage.setupDatabase();
650
+ const vectorStorage = new InMemoryVectorStorage(ChunkVectorStorageSchema, ChunkVectorPrimaryKey, [], vectorDimensions, vectorType);
651
+ await vectorStorage.setupDatabase();
652
+ const kb = new KnowledgeBase(name, tabularStorage, vectorStorage);
653
+ if (shouldRegister) {
654
+ registerKnowledgeBase(name, kb);
655
+ }
656
+ return kb;
657
+ }
658
+ // src/util/DatasetSchema.ts
659
+ function TypeTabularStorage(options = {}) {
660
+ return {
661
+ title: "Tabular Storage",
662
+ description: "Storage ID or instance for tabular data storage",
663
+ ...options,
664
+ format: "storage:tabular",
665
+ oneOf: [
666
+ { type: "string", title: "Storage ID" },
667
+ { title: "Storage Instance", additionalProperties: true }
668
+ ]
669
+ };
670
+ }
671
+ function TypeKnowledgeBase(options = {}) {
672
+ return {
673
+ title: "Knowledge Base",
674
+ description: "Knowledge base ID or instance",
675
+ ...options,
676
+ format: "dataset:knowledge-base",
677
+ anyOf: [
678
+ { type: "string", title: "Knowledge Base ID" },
679
+ { title: "Knowledge Base Instance", additionalProperties: true }
680
+ ]
681
+ };
682
+ }
669
683
  // src/document/DocumentNode.ts
670
684
  function estimateTokens(text) {
671
685
  return Math.ceil(text.length / 4);
@@ -725,31 +739,6 @@ function getDocumentRange(root, nodePath) {
725
739
  }
726
740
  return currentNode.range;
727
741
  }
728
- // src/document/DocumentStorageSchema.ts
729
- var DocumentStorageSchema = {
730
- type: "object",
731
- properties: {
732
- doc_id: {
733
- type: "string",
734
- "x-auto-generated": true,
735
- title: "Document ID",
736
- description: "Unique identifier for the document"
737
- },
738
- data: {
739
- type: "string",
740
- title: "Document Data",
741
- description: "JSON-serialized document"
742
- },
743
- metadata: {
744
- type: "object",
745
- title: "Metadata",
746
- description: "Metadata of the document"
747
- }
748
- },
749
- required: ["doc_id", "data"],
750
- additionalProperties: true
751
- };
752
- var DocumentStorageKey = ["doc_id"];
753
742
  // src/document/StructuralParser.ts
754
743
  import { uuid4 } from "@workglow/util";
755
744
  class StructuralParser {
@@ -911,116 +900,19 @@ class StructuralParser {
911
900
  return /^#{1,6}\s/m.test(text);
912
901
  }
913
902
  }
914
- // src/document-chunk/DocumentChunkDataset.ts
915
- class DocumentChunkDataset {
916
- storage;
917
- constructor(storage) {
918
- this.storage = storage;
919
- }
920
- getStorage() {
921
- return this.storage;
922
- }
923
- async put(chunk) {
924
- return this.storage.put(chunk);
925
- }
926
- async putBulk(chunks) {
927
- return this.storage.putBulk(chunks);
928
- }
929
- async get(chunk_id) {
930
- const key = { chunk_id };
931
- return this.storage.get(key);
932
- }
933
- async delete(chunk_id) {
934
- const key = { chunk_id };
935
- return this.storage.delete(key);
936
- }
937
- async similaritySearch(query, options) {
938
- return this.storage.similaritySearch(query, options);
939
- }
940
- async hybridSearch(query, options) {
941
- if (this.storage.hybridSearch) {
942
- return this.storage.hybridSearch(query, options);
943
- }
944
- throw new Error("Hybrid search not supported by this storage backend");
945
- }
946
- async getAll() {
947
- return this.storage.getAll();
948
- }
949
- async size() {
950
- return this.storage.size();
951
- }
952
- async clear() {
953
- return this.storage.deleteAll();
954
- }
955
- destroy() {
956
- return this.storage.destroy();
957
- }
958
- async setupDatabase() {
959
- return this.storage.setupDatabase();
960
- }
961
- getVectorDimensions() {
962
- return this.storage.getVectorDimensions();
963
- }
964
- }
965
- // src/document-chunk/DocumentChunkDatasetRegistry.ts
966
- import {
967
- createServiceToken as createServiceToken2,
968
- globalServiceRegistry as globalServiceRegistry2,
969
- registerInputResolver as registerInputResolver2
970
- } from "@workglow/util";
971
- var DOCUMENT_CHUNK_DATASET = createServiceToken2("dataset.document-chunk");
972
- if (!globalServiceRegistry2.has(DOCUMENT_CHUNK_DATASET)) {
973
- globalServiceRegistry2.register(DOCUMENT_CHUNK_DATASET, () => new Map, true);
974
- }
975
- function getGlobalDocumentChunkDataset() {
976
- return globalServiceRegistry2.get(DOCUMENT_CHUNK_DATASET);
977
- }
978
- function registerDocumentChunkDataset(id, dataset) {
979
- const datasets = getGlobalDocumentChunkDataset();
980
- datasets.set(id, dataset);
981
- }
982
- function getDocumentChunkDataset(id) {
983
- return getGlobalDocumentChunkDataset().get(id);
984
- }
985
- async function resolveDocumentChunkDatasetFromRegistry(id, format, registry) {
986
- const datasets = registry.has(DOCUMENT_CHUNK_DATASET) ? registry.get(DOCUMENT_CHUNK_DATASET) : getGlobalDocumentChunkDataset();
987
- const dataset = datasets.get(id);
988
- if (!dataset) {
989
- throw new Error(`Document chunk dataset "${id}" not found in registry`);
990
- }
991
- return dataset;
992
- }
993
- registerInputResolver2("dataset:document-chunk", resolveDocumentChunkDatasetFromRegistry);
994
- // src/document-chunk/DocumentChunkSchema.ts
995
- import { TypedArraySchema } from "@workglow/util";
996
- var DocumentChunkSchema = {
997
- type: "object",
998
- properties: {
999
- chunk_id: { type: "string", "x-auto-generated": true },
1000
- doc_id: { type: "string" },
1001
- vector: TypedArraySchema(),
1002
- metadata: { type: "object", format: "metadata", additionalProperties: true }
1003
- },
1004
- required: ["chunk_id", "doc_id", "vector", "metadata"],
1005
- additionalProperties: false
1006
- };
1007
- var DocumentChunkPrimaryKey = ["chunk_id"];
1008
903
  export {
1009
904
  traverseDepthFirst,
1010
- registerDocumentDataset,
1011
- registerDocumentChunkDataset,
905
+ registerKnowledgeBase,
1012
906
  hasChildren,
1013
907
  getNodePath,
1014
- getGlobalDocumentDatasets,
1015
- getGlobalDocumentChunkDataset,
908
+ getKnowledgeBase,
909
+ getGlobalKnowledgeBases,
1016
910
  getDocumentRange,
1017
- getDocumentDataset,
1018
- getDocumentChunkDataset,
1019
911
  getChildren,
1020
912
  estimateTokens,
913
+ createKnowledgeBase,
1021
914
  TypeTabularStorage,
1022
- TypeDocumentDataset,
1023
- TypeDocumentChunkDataset,
915
+ TypeKnowledgeBase,
1024
916
  TopicNodeSchema,
1025
917
  TokenBudgetSchema,
1026
918
  StructuralParser,
@@ -1030,26 +922,20 @@ export {
1030
922
  NodeRangeSchema,
1031
923
  NodeKind,
1032
924
  NodeEnrichmentSchema,
925
+ KnowledgeBase,
926
+ KNOWLEDGE_BASES,
1033
927
  EntitySchema,
1034
- EnrichedChunkMetadataSchema,
1035
- EnrichedChunkMetadataArraySchema,
1036
928
  DocumentStorageSchema,
1037
929
  DocumentStorageKey,
1038
930
  DocumentRootNodeSchema,
1039
931
  DocumentNodeSchema,
1040
932
  DocumentNodeBaseSchema,
1041
933
  DocumentMetadataSchema,
1042
- DocumentDataset,
1043
- DocumentChunkSchema,
1044
- DocumentChunkPrimaryKey,
1045
- DocumentChunkDataset,
1046
934
  Document,
1047
- DOCUMENT_DATASETS,
1048
- DOCUMENT_CHUNK_DATASET,
1049
- ChunkNodeSchema,
1050
- ChunkMetadataSchema,
1051
- ChunkMetadataArraySchema,
1052
- ChunkEnrichmentSchema
935
+ ChunkVectorStorageSchema,
936
+ ChunkVectorPrimaryKey,
937
+ ChunkRecordSchema,
938
+ ChunkRecordArraySchema
1053
939
  };
1054
940
 
1055
- //# debugId=6A205B9D3594B93F64756E2164756E21
941
+ //# debugId=22C0D4D10EDCEF7064756E2164756E21