@workglow/dataset 0.0.86

This diff represents the content of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (38)
  1. package/LICENSE +201 -0
  2. package/README.md +1134 -0
  3. package/dist/browser.js +1053 -0
  4. package/dist/browser.js.map +20 -0
  5. package/dist/bun.js +1054 -0
  6. package/dist/bun.js.map +20 -0
  7. package/dist/common-server.d.ts +7 -0
  8. package/dist/common-server.d.ts.map +1 -0
  9. package/dist/common.d.ts +17 -0
  10. package/dist/common.d.ts.map +1 -0
  11. package/dist/document/Document.d.ts +50 -0
  12. package/dist/document/Document.d.ts.map +1 -0
  13. package/dist/document/DocumentDataset.d.ts +79 -0
  14. package/dist/document/DocumentDataset.d.ts.map +1 -0
  15. package/dist/document/DocumentDatasetRegistry.d.ts +29 -0
  16. package/dist/document/DocumentDatasetRegistry.d.ts.map +1 -0
  17. package/dist/document/DocumentNode.d.ts +31 -0
  18. package/dist/document/DocumentNode.d.ts.map +1 -0
  19. package/dist/document/DocumentSchema.d.ts +1668 -0
  20. package/dist/document/DocumentSchema.d.ts.map +1 -0
  21. package/dist/document/DocumentStorageSchema.d.ts +43 -0
  22. package/dist/document/DocumentStorageSchema.d.ts.map +1 -0
  23. package/dist/document/StructuralParser.d.ts +30 -0
  24. package/dist/document/StructuralParser.d.ts.map +1 -0
  25. package/dist/document-chunk/DocumentChunkDataset.d.ts +79 -0
  26. package/dist/document-chunk/DocumentChunkDataset.d.ts.map +1 -0
  27. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +29 -0
  28. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +1 -0
  29. package/dist/document-chunk/DocumentChunkSchema.d.ts +55 -0
  30. package/dist/document-chunk/DocumentChunkSchema.d.ts.map +1 -0
  31. package/dist/node.js +1053 -0
  32. package/dist/node.js.map +20 -0
  33. package/dist/types.d.ts +7 -0
  34. package/dist/types.d.ts.map +1 -0
  35. package/dist/util/DatasetSchema.d.ts +85 -0
  36. package/dist/util/DatasetSchema.d.ts.map +1 -0
  37. package/package.json +54 -0
  38. package/src/document-chunk/README.md +362 -0
package/dist/bun.js ADDED
@@ -0,0 +1,1054 @@
+ // @bun
+ // src/util/DatasetSchema.ts
+ function TypeTabularStorage(options = {}) {
+   return {
+     title: "Tabular Storage",
+     description: "Storage ID or instance for tabular data storage",
+     ...options,
+     format: "storage:tabular",
+     oneOf: [
+       { type: "string", title: "Storage ID" },
+       { title: "Storage Instance", additionalProperties: true }
+     ]
+   };
+ }
+ function TypeDocumentChunkDataset(options = {}) {
+   return {
+     title: "Document Chunk Dataset",
+     description: "Dataset ID or instance for document chunk data storage",
+     ...options,
+     format: "dataset:document-chunk",
+     anyOf: [
+       { type: "string", title: "Dataset ID" },
+       { title: "Dataset Instance", additionalProperties: true }
+     ]
+   };
+ }
+ function TypeDocumentDataset(options = {}) {
+   return {
+     title: "Document Dataset",
+     description: "Dataset ID or instance for document data storage",
+     ...options,
+     format: "dataset:document",
+     anyOf: [
+       { type: "string", title: "Dataset ID" },
+       { title: "Dataset Instance", additionalProperties: true }
+     ]
+   };
+ }
+ // src/document/Document.ts
+ class Document {
+   doc_id;
+   metadata;
+   root;
+   chunks;
+   constructor(root, metadata, chunks = [], doc_id) {
+     this.doc_id = doc_id;
+     this.root = root;
+     this.metadata = metadata;
+     this.chunks = chunks || [];
+   }
+   setChunks(chunks) {
+     this.chunks = chunks;
+   }
+   getChunks() {
+     return this.chunks;
+   }
+   setDocId(doc_id) {
+     this.doc_id = doc_id;
+   }
+   findChunksByNodeId(nodeId) {
+     return this.chunks.filter((chunk) => chunk.nodePath.includes(nodeId));
+   }
+   toJSON() {
+     return {
+       metadata: this.metadata,
+       root: this.root,
+       chunks: this.chunks
+     };
+   }
+   static fromJSON(json, doc_id) {
+     const obj = JSON.parse(json);
+     return new Document(obj.root, obj.metadata, obj.chunks, doc_id);
+   }
+ }
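
// Usage sketch (illustration only, not part of the published bundle): building a
// Document around a root node and round-tripping it through toJSON/fromJSON. The
// root object follows DocumentRootNodeSchema, defined later in this file;
// "document" is the NodeKind.DOCUMENT discriminator.
const exampleRoot = {
  nodeId: "n-root",
  kind: "document",
  range: { startOffset: 0, endOffset: 5 },
  text: "Intro",
  title: "Intro",
  children: []
};
const exampleDoc = new Document(exampleRoot, { title: "Intro" }, [], "doc-1");
const restoredDoc = Document.fromJSON(JSON.stringify(exampleDoc.toJSON()), exampleDoc.doc_id);
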
+ // src/document/DocumentDataset.ts
+ class DocumentDataset {
+   tabularStorage;
+   vectorStorage;
+   constructor(tabularStorage, vectorStorage) {
+     this.tabularStorage = tabularStorage;
+     this.vectorStorage = vectorStorage;
+   }
+   async upsert(document) {
+     const serialized = JSON.stringify(document.toJSON());
+     const insertEntity = {
+       doc_id: document.doc_id,
+       data: serialized
+     };
+     const entity = await this.tabularStorage.put(insertEntity);
+     if (document.doc_id !== entity.doc_id) {
+       document.setDocId(entity.doc_id);
+     }
+     return document;
+   }
+   async get(doc_id) {
+     const entity = await this.tabularStorage.get({ doc_id });
+     if (!entity) {
+       return;
+     }
+     return Document.fromJSON(entity.data, entity.doc_id);
+   }
+   async delete(doc_id) {
+     await this.tabularStorage.delete({ doc_id });
+   }
+   async getNode(doc_id, nodeId) {
+     const doc = await this.get(doc_id);
+     if (!doc) {
+       return;
+     }
+     const traverse = (node) => {
+       if (node.nodeId === nodeId) {
+         return node;
+       }
+       if (node.children && Array.isArray(node.children)) {
+         for (const child of node.children) {
+           const found = traverse(child);
+           if (found)
+             return found;
+         }
+       }
+       return;
+     };
+     return traverse(doc.root);
+   }
+   async getAncestors(doc_id, nodeId) {
+     const doc = await this.get(doc_id);
+     if (!doc) {
+       return [];
+     }
+     const path = [];
+     const findPath = (node) => {
+       path.push(node.nodeId);
+       if (node.nodeId === nodeId) {
+         return true;
+       }
+       if (node.children && Array.isArray(node.children)) {
+         for (const child of node.children) {
+           if (findPath(child)) {
+             return true;
+           }
+         }
+       }
+       path.pop();
+       return false;
+     };
+     if (!findPath(doc.root)) {
+       return [];
+     }
+     const ancestors = [];
+     let currentNode = doc.root;
+     ancestors.push(currentNode);
+     for (let i = 1; i < path.length; i++) {
+       const targetId = path[i];
+       if (currentNode.children && Array.isArray(currentNode.children)) {
+         const found = currentNode.children.find((child) => child.nodeId === targetId);
+         if (found) {
+           currentNode = found;
+           ancestors.push(currentNode);
+         } else {
+           break;
+         }
+       } else {
+         break;
+       }
+     }
+     return ancestors;
+   }
+   async getChunks(doc_id) {
+     const doc = await this.get(doc_id);
+     if (!doc) {
+       return [];
+     }
+     return doc.getChunks();
+   }
+   async findChunksByNodeId(doc_id, nodeId) {
+     const doc = await this.get(doc_id);
+     if (!doc) {
+       return [];
+     }
+     if (doc.findChunksByNodeId) {
+       return doc.findChunksByNodeId(nodeId);
+     }
+     const chunks = doc.getChunks();
+     return chunks.filter((chunk) => chunk.nodePath && chunk.nodePath.includes(nodeId));
+   }
+   async list() {
+     const entities = await this.tabularStorage.getAll();
+     if (!entities) {
+       return [];
+     }
+     return entities.map((e) => e.doc_id);
+   }
+   async search(query, options) {
+     return this.vectorStorage?.similaritySearch(query, options) || [];
+   }
+ }
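
// Usage sketch (illustration only): wiring DocumentDataset to a hypothetical
// in-memory tabular storage. The storage surface (put/get/delete/getAll) is
// inferred from the calls the class above makes; real backends are supplied
// by the application, not by this bundle.
const memoryRows = new Map();
const memoryTabularStorage = {
  async put(entity) {
    const doc_id = entity.doc_id ?? `doc-${memoryRows.size + 1}`;
    const stored = { ...entity, doc_id };
    memoryRows.set(doc_id, stored);
    return stored;
  },
  async get({ doc_id }) {
    return memoryRows.get(doc_id);
  },
  async delete({ doc_id }) {
    memoryRows.delete(doc_id);
  },
  async getAll() {
    return [...memoryRows.values()];
  }
};
const exampleDataset = new DocumentDataset(memoryTabularStorage);
// await exampleDataset.upsert(exampleDoc); await exampleDataset.list(); // ["doc-1"]
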
+ // src/document/DocumentDatasetRegistry.ts
+ import {
+   createServiceToken,
+   globalServiceRegistry,
+   registerInputResolver
+ } from "@workglow/util";
+ var DOCUMENT_DATASETS = createServiceToken("dataset.documents");
+ if (!globalServiceRegistry.has(DOCUMENT_DATASETS)) {
+   globalServiceRegistry.register(DOCUMENT_DATASETS, () => new Map, true);
+ }
+ function getGlobalDocumentDatasets() {
+   return globalServiceRegistry.get(DOCUMENT_DATASETS);
+ }
+ function registerDocumentDataset(id, dataset) {
+   const datasets = getGlobalDocumentDatasets();
+   datasets.set(id, dataset);
+ }
+ function getDocumentDataset(id) {
+   return getGlobalDocumentDatasets().get(id);
+ }
+ async function resolveDocumentDatasetFromRegistry(id, format, registry) {
+   const datasets = registry.has(DOCUMENT_DATASETS) ? registry.get(DOCUMENT_DATASETS) : getGlobalDocumentDatasets();
+   const dataset = datasets.get(id);
+   if (!dataset) {
+     throw new Error(`Document dataset "${id}" not found in registry`);
+   }
+   return dataset;
+ }
+ registerInputResolver("dataset:document", resolveDocumentDatasetFromRegistry);
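
// Illustration only: publishing a dataset instance under a string id so that
// inputs declared with format "dataset:document" can be resolved from it
// (exampleDataset comes from the sketch above).
registerDocumentDataset("docs-main", exampleDataset);
const resolvedDataset = getDocumentDataset("docs-main");
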
+ // src/document/DocumentSchema.ts
+ var NodeKind = {
+   DOCUMENT: "document",
+   SECTION: "section",
+   PARAGRAPH: "paragraph",
+   SENTENCE: "sentence",
+   TOPIC: "topic"
+ };
+ var NodeRangeSchema = {
+   type: "object",
+   properties: {
+     startOffset: {
+       type: "integer",
+       title: "Start Offset",
+       description: "Starting character offset"
+     },
+     endOffset: {
+       type: "integer",
+       title: "End Offset",
+       description: "Ending character offset"
+     }
+   },
+   required: ["startOffset", "endOffset"],
+   additionalProperties: false
+ };
+ var EntitySchema = {
+   type: "object",
+   properties: {
+     text: {
+       type: "string",
+       title: "Text",
+       description: "Entity text"
+     },
+     type: {
+       type: "string",
+       title: "Type",
+       description: "Entity type (e.g., PERSON, ORG, LOC)"
+     },
+     score: {
+       type: "number",
+       title: "Score",
+       description: "Confidence score"
+     }
+   },
+   required: ["text", "type", "score"],
+   additionalProperties: false
+ };
+ var NodeEnrichmentSchema = {
+   type: "object",
+   properties: {
+     summary: {
+       type: "string",
+       title: "Summary",
+       description: "Summary of the node content"
+     },
+     entities: {
+       type: "array",
+       items: EntitySchema,
+       title: "Entities",
+       description: "Named entities extracted from the node"
+     },
+     keywords: {
+       type: "array",
+       items: { type: "string" },
+       title: "Keywords",
+       description: "Keywords associated with the node"
+     }
+   },
+   additionalProperties: false
+ };
+ var DocumentNodeBaseSchema = {
+   type: "object",
+   properties: {
+     nodeId: {
+       type: "string",
+       title: "Node ID",
+       description: "Unique identifier for this node"
+     },
+     kind: {
+       type: "string",
+       enum: Object.values(NodeKind),
+       title: "Kind",
+       description: "Node type discriminator"
+     },
+     range: NodeRangeSchema,
+     text: {
+       type: "string",
+       title: "Text",
+       description: "Text content of the node"
+     },
+     enrichment: NodeEnrichmentSchema
+   },
+   required: ["nodeId", "kind", "range", "text"],
+   additionalProperties: true
+ };
+ var DocumentNodeSchema = {
+   type: "object",
+   title: "Document Node",
+   description: "A node in the hierarchical document tree",
+   properties: {
+     ...DocumentNodeBaseSchema.properties,
+     level: {
+       type: "integer",
+       title: "Level",
+       description: "Header level for section nodes"
+     },
+     title: {
+       type: "string",
+       title: "Title",
+       description: "Section title"
+     },
+     children: {
+       type: "array",
+       title: "Children",
+       description: "Child nodes"
+     }
+   },
+   required: [...DocumentNodeBaseSchema.required],
+   additionalProperties: false
+ };
+ var ParagraphNodeSchema = {
+   type: "object",
+   properties: {
+     ...DocumentNodeBaseSchema.properties,
+     kind: {
+       type: "string",
+       const: NodeKind.PARAGRAPH,
+       title: "Kind",
+       description: "Node type discriminator"
+     }
+   },
+   required: [...DocumentNodeBaseSchema.required],
+   additionalProperties: false
+ };
+ var SentenceNodeSchema = {
+   type: "object",
+   properties: {
+     ...DocumentNodeBaseSchema.properties,
+     kind: {
+       type: "string",
+       const: NodeKind.SENTENCE,
+       title: "Kind",
+       description: "Node type discriminator"
+     }
+   },
+   required: [...DocumentNodeBaseSchema.required],
+   additionalProperties: false
+ };
+ var SectionNodeSchema = {
+   type: "object",
+   properties: {
+     ...DocumentNodeBaseSchema.properties,
+     kind: {
+       type: "string",
+       const: NodeKind.SECTION,
+       title: "Kind",
+       description: "Node type discriminator"
+     },
+     level: {
+       type: "integer",
+       minimum: 1,
+       maximum: 6,
+       title: "Level",
+       description: "Header level (1-6 for markdown)"
+     },
+     title: {
+       type: "string",
+       title: "Title",
+       description: "Section title"
+     },
+     children: {
+       type: "array",
+       items: DocumentNodeSchema,
+       title: "Children",
+       description: "Child nodes"
+     }
+   },
+   required: [...DocumentNodeBaseSchema.required, "level", "title", "children"],
+   additionalProperties: false
+ };
+ var TopicNodeSchema = {
+   type: "object",
+   properties: {
+     ...DocumentNodeBaseSchema.properties,
+     kind: {
+       type: "string",
+       const: NodeKind.TOPIC,
+       title: "Kind",
+       description: "Node type discriminator"
+     },
+     children: {
+       type: "array",
+       items: DocumentNodeSchema,
+       title: "Children",
+       description: "Child nodes"
+     }
+   },
+   required: [...DocumentNodeBaseSchema.required, "children"],
+   additionalProperties: false
+ };
+ var DocumentRootNodeSchema = {
+   type: "object",
+   properties: {
+     ...DocumentNodeBaseSchema.properties,
+     kind: {
+       type: "string",
+       const: NodeKind.DOCUMENT,
+       title: "Kind",
+       description: "Node type discriminator"
+     },
+     title: {
+       type: "string",
+       title: "Title",
+       description: "Document title"
+     },
+     children: {
+       type: "array",
+       items: DocumentNodeSchema,
+       title: "Children",
+       description: "Child nodes"
+     }
+   },
+   required: [...DocumentNodeBaseSchema.required, "title", "children"],
+   additionalProperties: false
+ };
+ var TokenBudgetSchema = {
+   type: "object",
+   properties: {
+     maxTokensPerChunk: {
+       type: "integer",
+       title: "Max Tokens Per Chunk",
+       description: "Maximum tokens allowed per chunk"
+     },
+     overlapTokens: {
+       type: "integer",
+       title: "Overlap Tokens",
+       description: "Number of tokens to overlap between chunks"
+     },
+     reservedTokens: {
+       type: "integer",
+       title: "Reserved Tokens",
+       description: "Tokens reserved for metadata or context"
+     }
+   },
+   required: ["maxTokensPerChunk", "overlapTokens", "reservedTokens"],
+   additionalProperties: false
+ };
+ var ChunkEnrichmentSchema = {
+   type: "object",
+   properties: {
+     summary: {
+       type: "string",
+       title: "Summary",
+       description: "Summary of the chunk content"
+     },
+     entities: {
+       type: "array",
+       items: EntitySchema,
+       title: "Entities",
+       description: "Named entities extracted from the chunk"
+     }
+   },
+   additionalProperties: false
+ };
+ var ChunkNodeSchema = () => ({
+   type: "object",
+   properties: {
+     chunkId: {
+       type: "string",
+       title: "Chunk ID",
+       description: "Unique identifier for this chunk"
+     },
+     doc_id: {
+       type: "string",
+       title: "Document ID",
+       description: "ID of the parent document"
+     },
+     text: {
+       type: "string",
+       title: "Text",
+       description: "Text content of the chunk"
+     },
+     nodePath: {
+       type: "array",
+       items: { type: "string" },
+       title: "Node Path",
+       description: "Node IDs from root to leaf"
+     },
+     depth: {
+       type: "integer",
+       title: "Depth",
+       description: "Depth in the document tree"
+     },
+     enrichment: ChunkEnrichmentSchema
+   },
+   required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
+   additionalProperties: false
+ });
+ var ChunkMetadataSchema = {
+   type: "object",
+   properties: {
+     doc_id: {
+       type: "string",
+       title: "Document ID",
+       description: "ID of the parent document"
+     },
+     chunkId: {
+       type: "string",
+       title: "Chunk ID",
+       description: "Unique identifier for this chunk"
+     },
+     leafNodeId: {
+       type: "string",
+       title: "Leaf Node ID",
+       description: "ID of the leaf node this chunk belongs to"
+     },
+     depth: {
+       type: "integer",
+       title: "Depth",
+       description: "Depth in the document tree"
+     },
+     text: {
+       type: "string",
+       title: "Text",
+       description: "Text content of the chunk"
+     },
+     nodePath: {
+       type: "array",
+       items: { type: "string" },
+       title: "Node Path",
+       description: "Node IDs from root to leaf"
+     },
+     summary: {
+       type: "string",
+       title: "Summary",
+       description: "Summary of the chunk content"
+     },
+     entities: {
+       type: "array",
+       items: EntitySchema,
+       title: "Entities",
+       description: "Named entities extracted from the chunk"
+     }
+   },
+   required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
+   additionalProperties: true
+ };
+ var ChunkMetadataArraySchema = {
+   type: "array",
+   items: ChunkMetadataSchema,
+   title: "Chunk Metadata",
+   description: "Metadata for each chunk"
+ };
+ var EnrichedChunkMetadataSchema = {
+   type: "object",
+   properties: {
+     doc_id: {
+       type: "string",
+       title: "Document ID",
+       description: "ID of the parent document"
+     },
+     chunkId: {
+       type: "string",
+       title: "Chunk ID",
+       description: "Unique identifier for this chunk"
+     },
+     leafNodeId: {
+       type: "string",
+       title: "Leaf Node ID",
+       description: "ID of the leaf node this chunk belongs to"
+     },
+     depth: {
+       type: "integer",
+       title: "Depth",
+       description: "Depth in the document tree"
+     },
+     text: {
+       type: "string",
+       title: "Text",
+       description: "Text content of the chunk"
+     },
+     nodePath: {
+       type: "array",
+       items: { type: "string" },
+       title: "Node Path",
+       description: "Node IDs from root to leaf"
+     },
+     summary: {
+       type: "string",
+       title: "Summary",
+       description: "Summary of the chunk content"
+     },
+     entities: {
+       type: "array",
+       items: EntitySchema,
+       title: "Entities",
+       description: "Named entities (rolled up from hierarchy)"
+     },
+     parentSummaries: {
+       type: "array",
+       items: { type: "string" },
+       title: "Parent Summaries",
+       description: "Summaries from ancestor nodes"
+     },
+     sectionTitles: {
+       type: "array",
+       items: { type: "string" },
+       title: "Section Titles",
+       description: "Titles of ancestor section nodes"
+     }
+   },
+   required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
+   additionalProperties: true
+ };
+ var EnrichedChunkMetadataArraySchema = {
+   type: "array",
+   items: EnrichedChunkMetadataSchema,
+   title: "Enriched Metadata",
+   description: "Metadata enriched with hierarchy information"
+ };
+ var DocumentMetadataSchema = {
+   type: "object",
+   properties: {
+     title: {
+       type: "string",
+       title: "Title",
+       description: "Document title"
+     },
+     sourceUri: {
+       type: "string",
+       title: "Source URI",
+       description: "Original source URI of the document"
+     },
+     createdAt: {
+       type: "string",
+       title: "Created At",
+       description: "ISO timestamp of creation"
+     }
+   },
+   required: ["title"],
+   additionalProperties: true
+ };
+
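
// Illustration only: a chunk record shaped to satisfy the required fields of
// the ChunkNodeSchema factory above (chunkId, doc_id, text, nodePath, depth).
const exampleChunk = {
  chunkId: "chunk-1",
  doc_id: "doc-1",
  text: "Intro",
  nodePath: ["n-root"],
  depth: 0
};
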
+ // src/document/DocumentNode.ts
+ function estimateTokens(text) {
+   return Math.ceil(text.length / 4);
+ }
+ function hasChildren(node) {
+   return node.kind === NodeKind.DOCUMENT || node.kind === NodeKind.SECTION || node.kind === NodeKind.TOPIC;
+ }
+ function getChildren(node) {
+   if (hasChildren(node)) {
+     return node.children;
+   }
+   return [];
+ }
+ function* traverseDepthFirst(node) {
+   yield node;
+   if (hasChildren(node)) {
+     for (const child of node.children) {
+       yield* traverseDepthFirst(child);
+     }
+   }
+ }
+ function getNodePath(root, targetNodeId) {
+   const path = [];
+   function search(node) {
+     path.push(node.nodeId);
+     if (node.nodeId === targetNodeId) {
+       return true;
+     }
+     if (hasChildren(node)) {
+       for (const child of node.children) {
+         if (search(child)) {
+           return true;
+         }
+       }
+     }
+     path.pop();
+     return false;
+   }
+   return search(root) ? path : undefined;
+ }
+ function getDocumentRange(root, nodePath) {
+   let currentNode = root;
+   for (let i = 1; i < nodePath.length; i++) {
+     const targetId = nodePath[i];
+     const children = currentNode.children;
+     let found;
+     for (let j = 0; j < children.length; j++) {
+       if (children[j].nodeId === targetId) {
+         found = children[j];
+         break;
+       }
+     }
+     if (!found) {
+       throw new Error(`Node with id ${targetId} not found in path`);
+     }
+     currentNode = found;
+   }
+   return currentNode.range;
+ }
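
// Illustration only: the helpers above are pure functions over a node tree,
// e.g. collecting every nodeId depth-first or locating the path to a node
// (exampleRoot comes from the Document sketch earlier).
const allNodeIds = [...traverseDepthFirst(exampleRoot)].map((n) => n.nodeId);
const rootPath = getNodePath(exampleRoot, "n-root"); // ["n-root"]
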
+ // src/document/DocumentStorageSchema.ts
+ var DocumentStorageSchema = {
+   type: "object",
+   properties: {
+     doc_id: {
+       type: "string",
+       "x-auto-generated": true,
+       title: "Document ID",
+       description: "Unique identifier for the document"
+     },
+     data: {
+       type: "string",
+       title: "Document Data",
+       description: "JSON-serialized document"
+     },
+     metadata: {
+       type: "object",
+       title: "Metadata",
+       description: "Metadata of the document"
+     }
+   },
+   required: ["doc_id", "data"],
+   additionalProperties: true
+ };
+ var DocumentStorageKey = ["doc_id"];
+ // src/document/StructuralParser.ts
+ import { uuid4 } from "@workglow/util";
+ class StructuralParser {
+   static async parseMarkdown(doc_id, text, title) {
+     const lines = text.split(`
+ `);
+     let currentOffset = 0;
+     const root = {
+       nodeId: uuid4(),
+       kind: NodeKind.DOCUMENT,
+       range: { startOffset: 0, endOffset: text.length },
+       text: title,
+       title,
+       children: []
+     };
+     let currentParentStack = [root];
+     let textBuffer = [];
+     let textBufferStartOffset = 0;
+     const flushTextBuffer = async () => {
+       if (textBuffer.length > 0) {
+         const content = textBuffer.join(`
+ `).trim();
+         if (content) {
+           const paragraphStartOffset = textBufferStartOffset;
+           const paragraphEndOffset = currentOffset;
+           const paragraph = {
+             nodeId: uuid4(),
+             kind: NodeKind.PARAGRAPH,
+             range: {
+               startOffset: paragraphStartOffset,
+               endOffset: paragraphEndOffset
+             },
+             text: content
+           };
+           currentParentStack[currentParentStack.length - 1].children.push(paragraph);
+         }
+         textBuffer = [];
+       }
+     };
+     for (const line of lines) {
+       const lineLength = line.length + 1;
+       const headerMatch = line.match(/^(#{1,6})\s+(.*)$/);
+       if (headerMatch) {
+         await flushTextBuffer();
+         const level = headerMatch[1].length;
+         const headerTitle = headerMatch[2];
+         while (currentParentStack.length > 1 && currentParentStack[currentParentStack.length - 1].kind === NodeKind.SECTION && currentParentStack[currentParentStack.length - 1].level >= level) {
+           const poppedSection = currentParentStack.pop();
+           const updatedSection = {
+             ...poppedSection,
+             range: {
+               ...poppedSection.range,
+               endOffset: currentOffset
+             }
+           };
+           const parent = currentParentStack[currentParentStack.length - 1];
+           parent.children[parent.children.length - 1] = updatedSection;
+         }
+         const sectionStartOffset = currentOffset;
+         const section = {
+           nodeId: uuid4(),
+           kind: NodeKind.SECTION,
+           level,
+           title: headerTitle,
+           range: {
+             startOffset: sectionStartOffset,
+             endOffset: text.length
+           },
+           text: headerTitle,
+           children: []
+         };
+         currentParentStack[currentParentStack.length - 1].children.push(section);
+         currentParentStack.push(section);
+       } else {
+         if (textBuffer.length === 0) {
+           textBufferStartOffset = currentOffset;
+         }
+         textBuffer.push(line);
+       }
+       currentOffset += lineLength;
+     }
+     await flushTextBuffer();
+     while (currentParentStack.length > 1) {
+       const section = currentParentStack.pop();
+       const updatedSection = {
+         ...section,
+         range: {
+           ...section.range,
+           endOffset: text.length
+         }
+       };
+       const parent = currentParentStack[currentParentStack.length - 1];
+       parent.children[parent.children.length - 1] = updatedSection;
+     }
+     return root;
+   }
+   static async parsePlainText(doc_id, text, title) {
+     const root = {
+       nodeId: uuid4(),
+       kind: NodeKind.DOCUMENT,
+       range: { startOffset: 0, endOffset: text.length },
+       text: title,
+       title,
+       children: []
+     };
+     const paragraphRegex = /\n\s*\n/g;
+     let lastIndex = 0;
+     let paragraphIndex = 0;
+     let match;
+     while ((match = paragraphRegex.exec(text)) !== null) {
+       const rawParagraph = text.slice(lastIndex, match.index);
+       const paragraphText = rawParagraph.trim();
+       if (paragraphText.length > 0) {
+         const trimmedRelativeStart = rawParagraph.indexOf(paragraphText);
+         const startOffset = lastIndex + trimmedRelativeStart;
+         const endOffset = startOffset + paragraphText.length;
+         const paragraph = {
+           nodeId: uuid4(),
+           kind: NodeKind.PARAGRAPH,
+           range: {
+             startOffset,
+             endOffset
+           },
+           text: paragraphText
+         };
+         root.children.push(paragraph);
+         paragraphIndex++;
+       }
+       lastIndex = paragraphRegex.lastIndex;
+     }
+     if (lastIndex < text.length) {
+       const rawParagraph = text.slice(lastIndex);
+       const paragraphText = rawParagraph.trim();
+       if (paragraphText.length > 0) {
+         const trimmedRelativeStart = rawParagraph.indexOf(paragraphText);
+         const startOffset = lastIndex + trimmedRelativeStart;
+         const endOffset = startOffset + paragraphText.length;
+         const paragraph = {
+           nodeId: uuid4(),
+           kind: NodeKind.PARAGRAPH,
+           range: {
+             startOffset,
+             endOffset
+           },
+           text: paragraphText
+         };
+         root.children.push(paragraph);
+       }
+     }
+     return root;
+   }
+   static parse(doc_id, text, title, format) {
+     if (format === "markdown" || !format && this.looksLikeMarkdown(text)) {
+       return this.parseMarkdown(doc_id, text, title);
+     }
+     return this.parsePlainText(doc_id, text, title);
+   }
+   static looksLikeMarkdown(text) {
+     return /^#{1,6}\s/m.test(text);
+   }
+ }
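
// Usage sketch (illustration only): parse() routes to parseMarkdown when the
// text contains ATX headers, producing a document root whose children are
// nested section and paragraph nodes.
const sketchTree = await StructuralParser.parse(
  "doc-1",
  "# Title\n\nFirst paragraph.\n\n## Sub\n\nSecond paragraph.",
  "Example"
);
// sketchTree.children[0] is the level-1 "Title" section.
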
+ // src/document-chunk/DocumentChunkDataset.ts
+ class DocumentChunkDataset {
+   storage;
+   constructor(storage) {
+     this.storage = storage;
+   }
+   getStorage() {
+     return this.storage;
+   }
+   async put(chunk) {
+     return this.storage.put(chunk);
+   }
+   async putBulk(chunks) {
+     return this.storage.putBulk(chunks);
+   }
+   async get(chunk_id) {
+     const key = { chunk_id };
+     return this.storage.get(key);
+   }
+   async delete(chunk_id) {
+     const key = { chunk_id };
+     return this.storage.delete(key);
+   }
+   async similaritySearch(query, options) {
+     return this.storage.similaritySearch(query, options);
+   }
+   async hybridSearch(query, options) {
+     if (this.storage.hybridSearch) {
+       return this.storage.hybridSearch(query, options);
+     }
+     throw new Error("Hybrid search not supported by this storage backend");
+   }
+   async getAll() {
+     return this.storage.getAll();
+   }
+   async size() {
+     return this.storage.size();
+   }
+   async clear() {
+     return this.storage.clear();
+   }
+   destroy() {
+     return this.storage.destroy();
+   }
+   async setupDatabase() {
+     return this.storage.setupDatabase();
+   }
+   getVectorDimensions() {
+     return this.storage.getVectorDimensions();
+   }
+ }
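
// Illustration only: DocumentChunkDataset is a thin delegate, so any object
// implementing the vector-storage surface it calls (put, putBulk, get,
// similaritySearch, ...) can back it. The names below (someVectorStorage,
// queryVector, topK) are hypothetical placeholders, not part of this package.
// const chunkDataset = new DocumentChunkDataset(someVectorStorage);
// await chunkDataset.put({ doc_id: "doc-1", vector: queryVector, metadata: { text: "Intro" } });
// const hits = await chunkDataset.similaritySearch(queryVector, { topK: 5 });
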
+ // src/document-chunk/DocumentChunkDatasetRegistry.ts
+ import {
+   createServiceToken as createServiceToken2,
+   globalServiceRegistry as globalServiceRegistry2,
+   registerInputResolver as registerInputResolver2
+ } from "@workglow/util";
+ var DOCUMENT_CHUNK_DATASET = createServiceToken2("dataset.document-chunk");
+ if (!globalServiceRegistry2.has(DOCUMENT_CHUNK_DATASET)) {
+   globalServiceRegistry2.register(DOCUMENT_CHUNK_DATASET, () => new Map, true);
+ }
+ function getGlobalDocumentChunkDataset() {
+   return globalServiceRegistry2.get(DOCUMENT_CHUNK_DATASET);
+ }
+ function registerDocumentChunkDataset(id, dataset) {
+   const datasets = getGlobalDocumentChunkDataset();
+   datasets.set(id, dataset);
+ }
+ function getDocumentChunkDataset(id) {
+   return getGlobalDocumentChunkDataset().get(id);
+ }
+ async function resolveDocumentChunkDatasetFromRegistry(id, format, registry) {
+   const datasets = registry.has(DOCUMENT_CHUNK_DATASET) ? registry.get(DOCUMENT_CHUNK_DATASET) : getGlobalDocumentChunkDataset();
+   const dataset = datasets.get(id);
+   if (!dataset) {
+     throw new Error(`Document chunk dataset "${id}" not found in registry`);
+   }
+   return dataset;
+ }
+ registerInputResolver2("dataset:document-chunk", resolveDocumentChunkDatasetFromRegistry);
+ // src/document-chunk/DocumentChunkSchema.ts
+ import { TypedArraySchema } from "@workglow/util";
+ var DocumentChunkSchema = {
+   type: "object",
+   properties: {
+     chunk_id: { type: "string", "x-auto-generated": true },
+     doc_id: { type: "string" },
+     vector: TypedArraySchema(),
+     metadata: { type: "object", format: "metadata", additionalProperties: true }
+   },
+   additionalProperties: false
+ };
+ var DocumentChunkPrimaryKey = ["chunk_id"];
+ export {
+   traverseDepthFirst,
+   registerDocumentDataset,
+   registerDocumentChunkDataset,
+   hasChildren,
+   getNodePath,
+   getGlobalDocumentDatasets,
+   getGlobalDocumentChunkDataset,
+   getDocumentRange,
+   getDocumentDataset,
+   getDocumentChunkDataset,
+   getChildren,
+   estimateTokens,
+   TypeTabularStorage,
+   TypeDocumentDataset,
+   TypeDocumentChunkDataset,
+   TopicNodeSchema,
+   TokenBudgetSchema,
+   StructuralParser,
+   SentenceNodeSchema,
+   SectionNodeSchema,
+   ParagraphNodeSchema,
+   NodeRangeSchema,
+   NodeKind,
+   NodeEnrichmentSchema,
+   EntitySchema,
+   EnrichedChunkMetadataSchema,
+   EnrichedChunkMetadataArraySchema,
+   DocumentStorageSchema,
+   DocumentStorageKey,
+   DocumentRootNodeSchema,
+   DocumentNodeSchema,
+   DocumentNodeBaseSchema,
+   DocumentMetadataSchema,
+   DocumentDataset,
+   DocumentChunkSchema,
+   DocumentChunkPrimaryKey,
+   DocumentChunkDataset,
+   Document,
+   DOCUMENT_DATASETS,
+   DOCUMENT_CHUNK_DATASET,
+   ChunkNodeSchema,
+   ChunkMetadataSchema,
+   ChunkMetadataArraySchema,
+   ChunkEnrichmentSchema
+ };
+
+ //# debugId=1AB5FF88396B3AD064756E2164756E21
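
The names above are exported from the bundle entry points, so a consumer-side sketch (illustrative; it assumes the dist/*.js bundles are the package entry points and that a storage backend is supplied by the application) looks like:

  import { Document, DocumentDataset, StructuralParser } from "@workglow/dataset";

  const root = await StructuralParser.parse("doc-1", "# Hello\n\nWorld.", "Hello");
  const doc = new Document(root, { title: "Hello" }, [], "doc-1");
  // const dataset = new DocumentDataset(tabularStorage); // tabularStorage: your backend
  // await dataset.upsert(doc);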