@workglow/dataset 0.0.86
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +1134 -0
- package/dist/browser.js +1053 -0
- package/dist/browser.js.map +20 -0
- package/dist/bun.js +1054 -0
- package/dist/bun.js.map +20 -0
- package/dist/common-server.d.ts +7 -0
- package/dist/common-server.d.ts.map +1 -0
- package/dist/common.d.ts +17 -0
- package/dist/common.d.ts.map +1 -0
- package/dist/document/Document.d.ts +50 -0
- package/dist/document/Document.d.ts.map +1 -0
- package/dist/document/DocumentDataset.d.ts +79 -0
- package/dist/document/DocumentDataset.d.ts.map +1 -0
- package/dist/document/DocumentDatasetRegistry.d.ts +29 -0
- package/dist/document/DocumentDatasetRegistry.d.ts.map +1 -0
- package/dist/document/DocumentNode.d.ts +31 -0
- package/dist/document/DocumentNode.d.ts.map +1 -0
- package/dist/document/DocumentSchema.d.ts +1668 -0
- package/dist/document/DocumentSchema.d.ts.map +1 -0
- package/dist/document/DocumentStorageSchema.d.ts +43 -0
- package/dist/document/DocumentStorageSchema.d.ts.map +1 -0
- package/dist/document/StructuralParser.d.ts +30 -0
- package/dist/document/StructuralParser.d.ts.map +1 -0
- package/dist/document-chunk/DocumentChunkDataset.d.ts +79 -0
- package/dist/document-chunk/DocumentChunkDataset.d.ts.map +1 -0
- package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +29 -0
- package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +1 -0
- package/dist/document-chunk/DocumentChunkSchema.d.ts +55 -0
- package/dist/document-chunk/DocumentChunkSchema.d.ts.map +1 -0
- package/dist/node.js +1053 -0
- package/dist/node.js.map +20 -0
- package/dist/types.d.ts +7 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/util/DatasetSchema.d.ts +85 -0
- package/dist/util/DatasetSchema.d.ts.map +1 -0
- package/package.json +54 -0
- package/src/document-chunk/README.md +362 -0
package/dist/browser.js
ADDED
|
@@ -0,0 +1,1053 @@
|
|
|
1
|
+
// src/util/DatasetSchema.ts
|
|
2
|
+
/**
 * Builds a JSON-schema fragment for an input that accepts either a tabular
 * storage ID (string) or a storage instance.
 *
 * `options` is spread after the default `title`/`description`, so callers can
 * override those; `format` and `anyOf` are written last and always win.
 *
 * @param {object} [options={}] Extra schema keywords merged into the result.
 * @returns {object} A new schema object tagged with format "storage:tabular".
 */
function TypeTabularStorage(options = {}) {
  return {
    title: "Tabular Storage",
    description: "Storage ID or instance for tabular data storage",
    ...options,
    format: "storage:tabular",
    // `anyOf` rather than `oneOf`: the "Storage Instance" alternative has no
    // `type` restriction, so a plain string ID satisfies BOTH alternatives and
    // would be rejected by `oneOf` (exactly-one semantics). This also matches
    // the sibling dataset schema builders in this file.
    anyOf: [
      { type: "string", title: "Storage ID" },
      { title: "Storage Instance", additionalProperties: true }
    ]
  };
}
|
|
14
|
+
/**
 * Schema builder for an input that is either a document-chunk dataset ID or a
 * dataset instance.
 *
 * Caller-supplied `options` may replace the default title/description, but
 * `format` and `anyOf` are assigned afterwards and therefore cannot be
 * overridden.
 *
 * @param {object} [options={}] Additional schema keywords.
 * @returns {object} A fresh schema object with format "dataset:document-chunk".
 */
function TypeDocumentChunkDataset(options = {}) {
  const schema = {
    title: "Document Chunk Dataset",
    description: "Dataset ID or instance for document chunk data storage",
    ...options
  };
  schema.format = "dataset:document-chunk";
  schema.anyOf = [
    { type: "string", title: "Dataset ID" },
    { title: "Dataset Instance", additionalProperties: true }
  ];
  return schema;
}
|
|
26
|
+
/**
 * Schema builder for an input that is either a document dataset ID or a
 * dataset instance.
 *
 * Merge order is defaults, then `options`, then the fixed keywords, so
 * `options` can change title/description but never `format` or `anyOf`.
 *
 * @param {object} [options={}] Additional schema keywords.
 * @returns {object} A fresh schema object with format "dataset:document".
 */
function TypeDocumentDataset(options = {}) {
  const defaults = {
    title: "Document Dataset",
    description: "Dataset ID or instance for document data storage"
  };
  const fixed = {
    format: "dataset:document",
    anyOf: [
      { type: "string", title: "Dataset ID" },
      { title: "Dataset Instance", additionalProperties: true }
    ]
  };
  return { ...defaults, ...options, ...fixed };
}
|
|
38
|
+
// src/document/Document.ts
|
|
39
|
+
/**
 * A parsed document: its node tree (`root`), descriptive `metadata`, the
 * chunks derived from it, and an optional storage identifier (`doc_id`).
 */
class Document {
  doc_id;
  metadata;
  root;
  chunks;
  /**
   * @param {object} root Root node of the document tree.
   * @param {object} metadata Document metadata.
   * @param {Array<object>} [chunks=[]] Chunks belonging to the document; a
   *   null/undefined value is normalised to an empty array.
   * @param {string} [doc_id] Identifier of the document, if already known.
   */
  constructor(root, metadata, chunks = [], doc_id) {
    this.doc_id = doc_id;
    this.root = root;
    this.metadata = metadata;
    this.chunks = chunks || [];
  }
  /**
   * Replaces the chunk list. As in the constructor, a null/undefined argument
   * becomes an empty array so later chunk lookups never hit a missing list.
   * @param {Array<object>} chunks
   */
  setChunks(chunks) {
    this.chunks = chunks || [];
  }
  /** @returns {Array<object>} The current chunk list. */
  getChunks() {
    return this.chunks;
  }
  /** @param {string} doc_id New document identifier. */
  setDocId(doc_id) {
    this.doc_id = doc_id;
  }
  /**
   * Returns the chunks whose `nodePath` contains `nodeId`.
   * Chunks without a `nodePath` are skipped instead of raising a TypeError.
   * @param {string} nodeId
   * @returns {Array<object>}
   */
  findChunksByNodeId(nodeId) {
    return this.chunks.filter((chunk) => chunk.nodePath?.includes(nodeId) ?? false);
  }
  /**
   * Serialisable snapshot of the document. `doc_id` is deliberately not part
   * of it; `fromJSON` takes the id as a separate argument.
   * @returns {{metadata: object, root: object, chunks: Array<object>}}
   */
  toJSON() {
    return {
      metadata: this.metadata,
      root: this.root,
      chunks: this.chunks
    };
  }
  /**
   * Rebuilds a Document from a JSON string holding `root`, `metadata` and
   * `chunks` (the shape produced by `toJSON`).
   * @param {string} json JSON text.
   * @param {string} [doc_id] Identifier to attach to the new instance.
   * @returns {Document}
   * @throws {SyntaxError} If `json` is not valid JSON.
   */
  static fromJSON(json, doc_id) {
    const obj = JSON.parse(json);
    return new Document(obj.root, obj.metadata, obj.chunks, doc_id);
  }
}
|
|
74
|
+
// src/document/DocumentDataset.ts
|
|
75
|
+
/**
 * Persists `Document` objects in a tabular storage (one row per document,
 * serialised as JSON) and offers tree/chunk lookups on stored documents.
 * An optional vector storage backs `search`.
 */
class DocumentDataset {
  tabularStorage;
  vectorStorage;
  /**
   * @param {object} tabularStorage Storage exposing `put`, `get`, `delete`
   *   and `getAll`, as called by the methods below.
   * @param {object} [vectorStorage] Storage exposing `similaritySearch`.
   */
  constructor(tabularStorage, vectorStorage) {
    this.tabularStorage = tabularStorage;
    this.vectorStorage = vectorStorage;
  }
  /**
   * Stores the document as `{ doc_id, data }` where `data` is the JSON text of
   * `document.toJSON()`. If the entity returned by the storage carries a
   * different `doc_id`, it is copied back onto the document.
   * @param {Document} document
   * @returns {Promise<Document>} The same document instance.
   */
  async upsert(document) {
    const serialized = JSON.stringify(document.toJSON());
    const insertEntity = {
      doc_id: document.doc_id,
      data: serialized
    };
    const entity = await this.tabularStorage.put(insertEntity);
    if (document.doc_id !== entity.doc_id) {
      document.setDocId(entity.doc_id);
    }
    return document;
  }
  /**
   * Loads and deserialises a document.
   * @param {string} doc_id
   * @returns {Promise<Document|undefined>} `undefined` when nothing is stored.
   */
  async get(doc_id) {
    const entity = await this.tabularStorage.get({ doc_id });
    if (!entity) {
      return;
    }
    return Document.fromJSON(entity.data, entity.doc_id);
  }
  /**
   * Removes the stored row for `doc_id`.
   * @param {string} doc_id
   * @returns {Promise<void>}
   */
  async delete(doc_id) {
    await this.tabularStorage.delete({ doc_id });
  }
  /**
   * Depth-first search for the node with the given id.
   * @param {string} doc_id
   * @param {string} nodeId
   * @returns {Promise<object|undefined>} The node, or `undefined` if the
   *   document or the node does not exist.
   */
  async getNode(doc_id, nodeId) {
    const doc = await this.get(doc_id);
    if (!doc) {
      return;
    }
    const traverse = (node) => {
      if (node.nodeId === nodeId) {
        return node;
      }
      if (Array.isArray(node.children)) {
        for (const child of node.children) {
          const found = traverse(child);
          if (found) {
            return found;
          }
        }
      }
      return;
    };
    return traverse(doc.root);
  }
  /**
   * Returns the chain of nodes from the root down to (and including) the node
   * with the given id, or an empty array if the document or node is missing.
   *
   * The node objects are collected during the depth-first search itself, so
   * the result is exactly the branch on which the target was found. This
   * avoids a second walk that re-resolves the path by id and could follow the
   * wrong sibling when ids repeat.
   * @param {string} doc_id
   * @param {string} nodeId
   * @returns {Promise<Array<object>>}
   */
  async getAncestors(doc_id, nodeId) {
    const doc = await this.get(doc_id);
    if (!doc) {
      return [];
    }
    const trail = [];
    const descend = (node) => {
      trail.push(node);
      if (node.nodeId === nodeId) {
        return true;
      }
      if (Array.isArray(node.children)) {
        for (const child of node.children) {
          if (descend(child)) {
            return true;
          }
        }
      }
      // Dead end: this node is not on the path to the target.
      trail.pop();
      return false;
    };
    return descend(doc.root) ? trail : [];
  }
  /**
   * @param {string} doc_id
   * @returns {Promise<Array<object>>} The document's chunks, or `[]` if the
   *   document is not stored.
   */
  async getChunks(doc_id) {
    const doc = await this.get(doc_id);
    if (!doc) {
      return [];
    }
    return doc.getChunks();
  }
  /**
   * Chunks of the document whose `nodePath` contains `nodeId`.
   * Delegates to the document's own `findChunksByNodeId` when it has one and
   * otherwise filters `getChunks()` directly.
   * @param {string} doc_id
   * @param {string} nodeId
   * @returns {Promise<Array<object>>}
   */
  async findChunksByNodeId(doc_id, nodeId) {
    const doc = await this.get(doc_id);
    if (!doc) {
      return [];
    }
    if (doc.findChunksByNodeId) {
      return doc.findChunksByNodeId(nodeId);
    }
    const chunks = doc.getChunks();
    return chunks.filter((chunk) => chunk.nodePath && chunk.nodePath.includes(nodeId));
  }
  /**
   * @returns {Promise<Array<string>>} The `doc_id` of every stored row, or
   *   `[]` when the storage returns nothing.
   */
  async list() {
    const entities = await this.tabularStorage.getAll();
    if (!entities) {
      return [];
    }
    return entities.map((e) => e.doc_id);
  }
  /**
   * Forwards a similarity search to the vector storage.
   * @param {*} query Passed through unchanged.
   * @param {object} [options] Passed through unchanged.
   * @returns {Promise<Array>} Search results; `[]` when no vector storage is
   *   configured or the storage yields a falsy result.
   */
  async search(query, options) {
    if (!this.vectorStorage) {
      return [];
    }
    // Await first so the `[]` fallback applies to the resolved value, not to
    // the (always truthy) promise object.
    return (await this.vectorStorage.similaritySearch(query, options)) || [];
  }
}
|
|
196
|
+
// src/document/DocumentDatasetRegistry.ts
|
|
197
|
+
import {
|
|
198
|
+
createServiceToken,
|
|
199
|
+
globalServiceRegistry,
|
|
200
|
+
registerInputResolver
|
|
201
|
+
} from "@workglow/util";
|
|
202
|
+
// Service token under which the id -> document-dataset map is kept.
var DOCUMENT_DATASETS = createServiceToken("dataset.documents");
// Register an empty Map for the token unless the global registry already has
// one, so an existing registration is left untouched.
if (!globalServiceRegistry.has(DOCUMENT_DATASETS)) {
  const createEmptyDatasetMap = () => new Map();
  // NOTE(review): the meaning of the third argument (`true`) is defined by
  // @workglow/util's registry — confirm there.
  globalServiceRegistry.register(DOCUMENT_DATASETS, createEmptyDatasetMap, true);
}
|
|
206
|
+
/**
 * Looks up the value registered under DOCUMENT_DATASETS in the global service
 * registry.
 * @returns {*} Whatever the registry returns for that token.
 */
function getGlobalDocumentDatasets() {
  const datasetsById = globalServiceRegistry.get(DOCUMENT_DATASETS);
  return datasetsById;
}
|
|
209
|
+
/**
 * Stores `dataset` under `id` in the global document-dataset collection,
 * replacing any entry already stored under that id.
 * @param {string} id
 * @param {object} dataset
 */
function registerDocumentDataset(id, dataset) {
  getGlobalDocumentDatasets().set(id, dataset);
}
|
|
213
|
+
/**
 * Fetches the dataset registered under `id` from the global document-dataset
 * collection.
 * @param {string} id
 * @returns {*} The result of the collection's `get(id)`.
 */
function getDocumentDataset(id) {
  const datasetsById = getGlobalDocumentDatasets();
  return datasetsById.get(id);
}
|
|
216
|
+
/**
 * Resolves a document dataset by id. The lookup uses the collection held by
 * `registry` when it has the DOCUMENT_DATASETS token, and the global
 * collection otherwise.
 * @param {string} id Dataset id to resolve.
 * @param {string} format Not read by this function.
 * @param {object} registry Object exposing `has(token)` and `get(token)`.
 * @returns {Promise<object>} The registered dataset.
 * @throws {Error} If no dataset is registered under `id`.
 */
async function resolveDocumentDatasetFromRegistry(id, format, registry) {
  let datasetsById;
  if (registry.has(DOCUMENT_DATASETS)) {
    datasetsById = registry.get(DOCUMENT_DATASETS);
  } else {
    datasetsById = getGlobalDocumentDatasets();
  }
  const match = datasetsById.get(id);
  if (match) {
    return match;
  }
  throw new Error(`Document dataset "${id}" not found in registry`);
}
|
|
224
|
+
registerInputResolver("dataset:document", resolveDocumentDatasetFromRegistry);
|
|
225
|
+
// src/document/DocumentSchema.ts
|
|
226
|
+
var NodeKind = {
|
|
227
|
+
DOCUMENT: "document",
|
|
228
|
+
SECTION: "section",
|
|
229
|
+
PARAGRAPH: "paragraph",
|
|
230
|
+
SENTENCE: "sentence",
|
|
231
|
+
TOPIC: "topic"
|
|
232
|
+
};
|
|
233
|
+
var NodeRangeSchema = {
|
|
234
|
+
type: "object",
|
|
235
|
+
properties: {
|
|
236
|
+
startOffset: {
|
|
237
|
+
type: "integer",
|
|
238
|
+
title: "Start Offset",
|
|
239
|
+
description: "Starting character offset"
|
|
240
|
+
},
|
|
241
|
+
endOffset: {
|
|
242
|
+
type: "integer",
|
|
243
|
+
title: "End Offset",
|
|
244
|
+
description: "Ending character offset"
|
|
245
|
+
}
|
|
246
|
+
},
|
|
247
|
+
required: ["startOffset", "endOffset"],
|
|
248
|
+
additionalProperties: false
|
|
249
|
+
};
|
|
250
|
+
var EntitySchema = {
|
|
251
|
+
type: "object",
|
|
252
|
+
properties: {
|
|
253
|
+
text: {
|
|
254
|
+
type: "string",
|
|
255
|
+
title: "Text",
|
|
256
|
+
description: "Entity text"
|
|
257
|
+
},
|
|
258
|
+
type: {
|
|
259
|
+
type: "string",
|
|
260
|
+
title: "Type",
|
|
261
|
+
description: "Entity type (e.g., PERSON, ORG, LOC)"
|
|
262
|
+
},
|
|
263
|
+
score: {
|
|
264
|
+
type: "number",
|
|
265
|
+
title: "Score",
|
|
266
|
+
description: "Confidence score"
|
|
267
|
+
}
|
|
268
|
+
},
|
|
269
|
+
required: ["text", "type", "score"],
|
|
270
|
+
additionalProperties: false
|
|
271
|
+
};
|
|
272
|
+
var NodeEnrichmentSchema = {
|
|
273
|
+
type: "object",
|
|
274
|
+
properties: {
|
|
275
|
+
summary: {
|
|
276
|
+
type: "string",
|
|
277
|
+
title: "Summary",
|
|
278
|
+
description: "Summary of the node content"
|
|
279
|
+
},
|
|
280
|
+
entities: {
|
|
281
|
+
type: "array",
|
|
282
|
+
items: EntitySchema,
|
|
283
|
+
title: "Entities",
|
|
284
|
+
description: "Named entities extracted from the node"
|
|
285
|
+
},
|
|
286
|
+
keywords: {
|
|
287
|
+
type: "array",
|
|
288
|
+
items: { type: "string" },
|
|
289
|
+
title: "Keywords",
|
|
290
|
+
description: "Keywords associated with the node"
|
|
291
|
+
}
|
|
292
|
+
},
|
|
293
|
+
additionalProperties: false
|
|
294
|
+
};
|
|
295
|
+
var DocumentNodeBaseSchema = {
|
|
296
|
+
type: "object",
|
|
297
|
+
properties: {
|
|
298
|
+
nodeId: {
|
|
299
|
+
type: "string",
|
|
300
|
+
title: "Node ID",
|
|
301
|
+
description: "Unique identifier for this node"
|
|
302
|
+
},
|
|
303
|
+
kind: {
|
|
304
|
+
type: "string",
|
|
305
|
+
enum: Object.values(NodeKind),
|
|
306
|
+
title: "Kind",
|
|
307
|
+
description: "Node type discriminator"
|
|
308
|
+
},
|
|
309
|
+
range: NodeRangeSchema,
|
|
310
|
+
text: {
|
|
311
|
+
type: "string",
|
|
312
|
+
title: "Text",
|
|
313
|
+
description: "Text content of the node"
|
|
314
|
+
},
|
|
315
|
+
enrichment: NodeEnrichmentSchema
|
|
316
|
+
},
|
|
317
|
+
required: ["nodeId", "kind", "range", "text"],
|
|
318
|
+
additionalProperties: true
|
|
319
|
+
};
|
|
320
|
+
var DocumentNodeSchema = {
|
|
321
|
+
type: "object",
|
|
322
|
+
title: "Document Node",
|
|
323
|
+
description: "A node in the hierarchical document tree",
|
|
324
|
+
properties: {
|
|
325
|
+
...DocumentNodeBaseSchema.properties,
|
|
326
|
+
level: {
|
|
327
|
+
type: "integer",
|
|
328
|
+
title: "Level",
|
|
329
|
+
description: "Header level for section nodes"
|
|
330
|
+
},
|
|
331
|
+
title: {
|
|
332
|
+
type: "string",
|
|
333
|
+
title: "Title",
|
|
334
|
+
description: "Section title"
|
|
335
|
+
},
|
|
336
|
+
children: {
|
|
337
|
+
type: "array",
|
|
338
|
+
title: "Children",
|
|
339
|
+
description: "Child nodes"
|
|
340
|
+
}
|
|
341
|
+
},
|
|
342
|
+
required: [...DocumentNodeBaseSchema.required],
|
|
343
|
+
additionalProperties: false
|
|
344
|
+
};
|
|
345
|
+
var ParagraphNodeSchema = {
|
|
346
|
+
type: "object",
|
|
347
|
+
properties: {
|
|
348
|
+
...DocumentNodeBaseSchema.properties,
|
|
349
|
+
kind: {
|
|
350
|
+
type: "string",
|
|
351
|
+
const: NodeKind.PARAGRAPH,
|
|
352
|
+
title: "Kind",
|
|
353
|
+
description: "Node type discriminator"
|
|
354
|
+
}
|
|
355
|
+
},
|
|
356
|
+
required: [...DocumentNodeBaseSchema.required],
|
|
357
|
+
additionalProperties: false
|
|
358
|
+
};
|
|
359
|
+
var SentenceNodeSchema = {
|
|
360
|
+
type: "object",
|
|
361
|
+
properties: {
|
|
362
|
+
...DocumentNodeBaseSchema.properties,
|
|
363
|
+
kind: {
|
|
364
|
+
type: "string",
|
|
365
|
+
const: NodeKind.SENTENCE,
|
|
366
|
+
title: "Kind",
|
|
367
|
+
description: "Node type discriminator"
|
|
368
|
+
}
|
|
369
|
+
},
|
|
370
|
+
required: [...DocumentNodeBaseSchema.required],
|
|
371
|
+
additionalProperties: false
|
|
372
|
+
};
|
|
373
|
+
var SectionNodeSchema = {
|
|
374
|
+
type: "object",
|
|
375
|
+
properties: {
|
|
376
|
+
...DocumentNodeBaseSchema.properties,
|
|
377
|
+
kind: {
|
|
378
|
+
type: "string",
|
|
379
|
+
const: NodeKind.SECTION,
|
|
380
|
+
title: "Kind",
|
|
381
|
+
description: "Node type discriminator"
|
|
382
|
+
},
|
|
383
|
+
level: {
|
|
384
|
+
type: "integer",
|
|
385
|
+
minimum: 1,
|
|
386
|
+
maximum: 6,
|
|
387
|
+
title: "Level",
|
|
388
|
+
description: "Header level (1-6 for markdown)"
|
|
389
|
+
},
|
|
390
|
+
title: {
|
|
391
|
+
type: "string",
|
|
392
|
+
title: "Title",
|
|
393
|
+
description: "Section title"
|
|
394
|
+
},
|
|
395
|
+
children: {
|
|
396
|
+
type: "array",
|
|
397
|
+
items: DocumentNodeSchema,
|
|
398
|
+
title: "Children",
|
|
399
|
+
description: "Child nodes"
|
|
400
|
+
}
|
|
401
|
+
},
|
|
402
|
+
required: [...DocumentNodeBaseSchema.required, "level", "title", "children"],
|
|
403
|
+
additionalProperties: false
|
|
404
|
+
};
|
|
405
|
+
var TopicNodeSchema = {
|
|
406
|
+
type: "object",
|
|
407
|
+
properties: {
|
|
408
|
+
...DocumentNodeBaseSchema.properties,
|
|
409
|
+
kind: {
|
|
410
|
+
type: "string",
|
|
411
|
+
const: NodeKind.TOPIC,
|
|
412
|
+
title: "Kind",
|
|
413
|
+
description: "Node type discriminator"
|
|
414
|
+
},
|
|
415
|
+
children: {
|
|
416
|
+
type: "array",
|
|
417
|
+
items: DocumentNodeSchema,
|
|
418
|
+
title: "Children",
|
|
419
|
+
description: "Child nodes"
|
|
420
|
+
}
|
|
421
|
+
},
|
|
422
|
+
required: [...DocumentNodeBaseSchema.required, "children"],
|
|
423
|
+
additionalProperties: false
|
|
424
|
+
};
|
|
425
|
+
var DocumentRootNodeSchema = {
|
|
426
|
+
type: "object",
|
|
427
|
+
properties: {
|
|
428
|
+
...DocumentNodeBaseSchema.properties,
|
|
429
|
+
kind: {
|
|
430
|
+
type: "string",
|
|
431
|
+
const: NodeKind.DOCUMENT,
|
|
432
|
+
title: "Kind",
|
|
433
|
+
description: "Node type discriminator"
|
|
434
|
+
},
|
|
435
|
+
title: {
|
|
436
|
+
type: "string",
|
|
437
|
+
title: "Title",
|
|
438
|
+
description: "Document title"
|
|
439
|
+
},
|
|
440
|
+
children: {
|
|
441
|
+
type: "array",
|
|
442
|
+
items: DocumentNodeSchema,
|
|
443
|
+
title: "Children",
|
|
444
|
+
description: "Child nodes"
|
|
445
|
+
}
|
|
446
|
+
},
|
|
447
|
+
required: [...DocumentNodeBaseSchema.required, "title", "children"],
|
|
448
|
+
additionalProperties: false
|
|
449
|
+
};
|
|
450
|
+
var TokenBudgetSchema = {
|
|
451
|
+
type: "object",
|
|
452
|
+
properties: {
|
|
453
|
+
maxTokensPerChunk: {
|
|
454
|
+
type: "integer",
|
|
455
|
+
title: "Max Tokens Per Chunk",
|
|
456
|
+
description: "Maximum tokens allowed per chunk"
|
|
457
|
+
},
|
|
458
|
+
overlapTokens: {
|
|
459
|
+
type: "integer",
|
|
460
|
+
title: "Overlap Tokens",
|
|
461
|
+
description: "Number of tokens to overlap between chunks"
|
|
462
|
+
},
|
|
463
|
+
reservedTokens: {
|
|
464
|
+
type: "integer",
|
|
465
|
+
title: "Reserved Tokens",
|
|
466
|
+
description: "Tokens reserved for metadata or context"
|
|
467
|
+
}
|
|
468
|
+
},
|
|
469
|
+
required: ["maxTokensPerChunk", "overlapTokens", "reservedTokens"],
|
|
470
|
+
additionalProperties: false
|
|
471
|
+
};
|
|
472
|
+
var ChunkEnrichmentSchema = {
|
|
473
|
+
type: "object",
|
|
474
|
+
properties: {
|
|
475
|
+
summary: {
|
|
476
|
+
type: "string",
|
|
477
|
+
title: "Summary",
|
|
478
|
+
description: "Summary of the chunk content"
|
|
479
|
+
},
|
|
480
|
+
entities: {
|
|
481
|
+
type: "array",
|
|
482
|
+
items: EntitySchema,
|
|
483
|
+
title: "Entities",
|
|
484
|
+
description: "Named entities extracted from the chunk"
|
|
485
|
+
}
|
|
486
|
+
},
|
|
487
|
+
additionalProperties: false
|
|
488
|
+
};
|
|
489
|
+
/**
 * Returns a newly built JSON schema for a chunk node on every call.
 * The `enrichment` property refers to the shared ChunkEnrichmentSchema object.
 */
var ChunkNodeSchema = () => {
  const properties = {};
  properties.chunkId = {
    type: "string",
    title: "Chunk ID",
    description: "Unique identifier for this chunk"
  };
  properties.doc_id = {
    type: "string",
    title: "Document ID",
    description: "ID of the parent document"
  };
  properties.text = {
    type: "string",
    title: "Text",
    description: "Text content of the chunk"
  };
  properties.nodePath = {
    type: "array",
    items: { type: "string" },
    title: "Node Path",
    description: "Node IDs from root to leaf"
  };
  properties.depth = {
    type: "integer",
    title: "Depth",
    description: "Depth in the document tree"
  };
  properties.enrichment = ChunkEnrichmentSchema;
  return {
    type: "object",
    properties,
    required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
    additionalProperties: false
  };
};
|
|
523
|
+
var ChunkMetadataSchema = {
|
|
524
|
+
type: "object",
|
|
525
|
+
properties: {
|
|
526
|
+
doc_id: {
|
|
527
|
+
type: "string",
|
|
528
|
+
title: "Document ID",
|
|
529
|
+
description: "ID of the parent document"
|
|
530
|
+
},
|
|
531
|
+
chunkId: {
|
|
532
|
+
type: "string",
|
|
533
|
+
title: "Chunk ID",
|
|
534
|
+
description: "Unique identifier for this chunk"
|
|
535
|
+
},
|
|
536
|
+
leafNodeId: {
|
|
537
|
+
type: "string",
|
|
538
|
+
title: "Leaf Node ID",
|
|
539
|
+
description: "ID of the leaf node this chunk belongs to"
|
|
540
|
+
},
|
|
541
|
+
depth: {
|
|
542
|
+
type: "integer",
|
|
543
|
+
title: "Depth",
|
|
544
|
+
description: "Depth in the document tree"
|
|
545
|
+
},
|
|
546
|
+
text: {
|
|
547
|
+
type: "string",
|
|
548
|
+
title: "Text",
|
|
549
|
+
description: "Text content of the chunk"
|
|
550
|
+
},
|
|
551
|
+
nodePath: {
|
|
552
|
+
type: "array",
|
|
553
|
+
items: { type: "string" },
|
|
554
|
+
title: "Node Path",
|
|
555
|
+
description: "Node IDs from root to leaf"
|
|
556
|
+
},
|
|
557
|
+
summary: {
|
|
558
|
+
type: "string",
|
|
559
|
+
title: "Summary",
|
|
560
|
+
description: "Summary of the chunk content"
|
|
561
|
+
},
|
|
562
|
+
entities: {
|
|
563
|
+
type: "array",
|
|
564
|
+
items: EntitySchema,
|
|
565
|
+
title: "Entities",
|
|
566
|
+
description: "Named entities extracted from the chunk"
|
|
567
|
+
}
|
|
568
|
+
},
|
|
569
|
+
required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
|
|
570
|
+
additionalProperties: true
|
|
571
|
+
};
|
|
572
|
+
var ChunkMetadataArraySchema = {
|
|
573
|
+
type: "array",
|
|
574
|
+
items: ChunkMetadataSchema,
|
|
575
|
+
title: "Chunk Metadata",
|
|
576
|
+
description: "Metadata for each chunk"
|
|
577
|
+
};
|
|
578
|
+
var EnrichedChunkMetadataSchema = {
|
|
579
|
+
type: "object",
|
|
580
|
+
properties: {
|
|
581
|
+
doc_id: {
|
|
582
|
+
type: "string",
|
|
583
|
+
title: "Document ID",
|
|
584
|
+
description: "ID of the parent document"
|
|
585
|
+
},
|
|
586
|
+
chunkId: {
|
|
587
|
+
type: "string",
|
|
588
|
+
title: "Chunk ID",
|
|
589
|
+
description: "Unique identifier for this chunk"
|
|
590
|
+
},
|
|
591
|
+
leafNodeId: {
|
|
592
|
+
type: "string",
|
|
593
|
+
title: "Leaf Node ID",
|
|
594
|
+
description: "ID of the leaf node this chunk belongs to"
|
|
595
|
+
},
|
|
596
|
+
depth: {
|
|
597
|
+
type: "integer",
|
|
598
|
+
title: "Depth",
|
|
599
|
+
description: "Depth in the document tree"
|
|
600
|
+
},
|
|
601
|
+
text: {
|
|
602
|
+
type: "string",
|
|
603
|
+
title: "Text",
|
|
604
|
+
description: "Text content of the chunk"
|
|
605
|
+
},
|
|
606
|
+
nodePath: {
|
|
607
|
+
type: "array",
|
|
608
|
+
items: { type: "string" },
|
|
609
|
+
title: "Node Path",
|
|
610
|
+
description: "Node IDs from root to leaf"
|
|
611
|
+
},
|
|
612
|
+
summary: {
|
|
613
|
+
type: "string",
|
|
614
|
+
title: "Summary",
|
|
615
|
+
description: "Summary of the chunk content"
|
|
616
|
+
},
|
|
617
|
+
entities: {
|
|
618
|
+
type: "array",
|
|
619
|
+
items: EntitySchema,
|
|
620
|
+
title: "Entities",
|
|
621
|
+
description: "Named entities (rolled up from hierarchy)"
|
|
622
|
+
},
|
|
623
|
+
parentSummaries: {
|
|
624
|
+
type: "array",
|
|
625
|
+
items: { type: "string" },
|
|
626
|
+
title: "Parent Summaries",
|
|
627
|
+
description: "Summaries from ancestor nodes"
|
|
628
|
+
},
|
|
629
|
+
sectionTitles: {
|
|
630
|
+
type: "array",
|
|
631
|
+
items: { type: "string" },
|
|
632
|
+
title: "Section Titles",
|
|
633
|
+
description: "Titles of ancestor section nodes"
|
|
634
|
+
}
|
|
635
|
+
},
|
|
636
|
+
required: ["doc_id", "chunkId", "leafNodeId", "depth", "text", "nodePath"],
|
|
637
|
+
additionalProperties: true
|
|
638
|
+
};
|
|
639
|
+
var EnrichedChunkMetadataArraySchema = {
|
|
640
|
+
type: "array",
|
|
641
|
+
items: EnrichedChunkMetadataSchema,
|
|
642
|
+
title: "Enriched Metadata",
|
|
643
|
+
description: "Metadata enriched with hierarchy information"
|
|
644
|
+
};
|
|
645
|
+
var DocumentMetadataSchema = {
|
|
646
|
+
type: "object",
|
|
647
|
+
properties: {
|
|
648
|
+
title: {
|
|
649
|
+
type: "string",
|
|
650
|
+
title: "Title",
|
|
651
|
+
description: "Document title"
|
|
652
|
+
},
|
|
653
|
+
sourceUri: {
|
|
654
|
+
type: "string",
|
|
655
|
+
title: "Source URI",
|
|
656
|
+
description: "Original source URI of the document"
|
|
657
|
+
},
|
|
658
|
+
createdAt: {
|
|
659
|
+
type: "string",
|
|
660
|
+
title: "Created At",
|
|
661
|
+
description: "ISO timestamp of creation"
|
|
662
|
+
}
|
|
663
|
+
},
|
|
664
|
+
required: ["title"],
|
|
665
|
+
additionalProperties: true
|
|
666
|
+
};
|
|
667
|
+
|
|
668
|
+
// src/document/DocumentNode.ts
|
|
669
|
+
/**
 * Estimates a token count from the text length: one token per four UTF-16
 * code units, rounded up.
 * @param {string} text
 * @returns {number} 0 for an empty string.
 */
function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|
|
672
|
+
/**
 * Tells whether a node's `kind` is one of the container kinds
 * (document, section or topic).
 * @param {{kind: string}} node
 * @returns {boolean}
 */
function hasChildren(node) {
  switch (node.kind) {
    case NodeKind.DOCUMENT:
    case NodeKind.SECTION:
    case NodeKind.TOPIC:
      return true;
    default:
      return false;
  }
}
|
|
675
|
+
/**
 * Returns `node.children` for container nodes (as decided by `hasChildren`)
 * and a new empty array for every other node.
 * @param {object} node
 * @returns {Array<object>}
 */
function getChildren(node) {
  return hasChildren(node) ? node.children : [];
}
|
|
681
|
+
/**
 * Generator that yields `node` first and then, recursively, each of its
 * descendants in pre-order. Only nodes accepted by `hasChildren` are
 * descended into.
 * @param {object} node Starting node.
 * @yields {object} Each visited node.
 */
function* traverseDepthFirst(node) {
  yield node;
  if (!hasChildren(node)) {
    return;
  }
  for (const descendantRoot of node.children) {
    yield* traverseDepthFirst(descendantRoot);
  }
}
|
|
689
|
+
/**
 * Finds the chain of node ids leading from `root` to the node whose id is
 * `targetNodeId`, searching depth-first through container nodes.
 * @param {object} root Node to start from.
 * @param {string} targetNodeId
 * @returns {Array<string>|undefined} Ids from root to target inclusive, or
 *   `undefined` when the target is not reachable.
 */
function getNodePath(root, targetNodeId) {
  const trail = [];
  const visit = (node) => {
    trail.push(node.nodeId);
    if (node.nodeId === targetNodeId) {
      return true;
    }
    if (hasChildren(node)) {
      for (const child of node.children) {
        if (visit(child)) {
          return true;
        }
      }
    }
    // Not on the way to the target: take this id off the trail again.
    trail.pop();
    return false;
  };
  if (visit(root)) {
    return trail;
  }
  return undefined;
}
|
|
708
|
+
/**
 * Follows `nodePath` from `root` and returns the `range` of the node it ends
 * at. The first path entry stands for the root itself and is not checked;
 * every later entry must be the `nodeId` of a child of the previous node.
 * An empty or single-entry path yields the root's range.
 *
 * @param {object} root Root node of the tree.
 * @param {Array<string>} nodePath Node ids from root to the wanted node.
 * @returns {object} The `range` of the node at the end of the path.
 * @throws {Error} If a path entry is not found among the current node's
 *   children — including when the current node has no `children` array.
 */
function getDocumentRange(root, nodePath) {
  let currentNode = root;
  for (const targetId of nodePath.slice(1)) {
    // Nodes without a children array are treated as having none, so a path
    // that continues past them fails with the descriptive error below rather
    // than a TypeError.
    const children = Array.isArray(currentNode.children) ? currentNode.children : [];
    const next = children.find((child) => child.nodeId === targetId);
    if (!next) {
      throw new Error(`Node with id ${targetId} not found in path`);
    }
    currentNode = next;
  }
  return currentNode.range;
}
|
|
727
|
+
// src/document/DocumentStorageSchema.ts
|
|
728
|
+
var DocumentStorageSchema = {
|
|
729
|
+
type: "object",
|
|
730
|
+
properties: {
|
|
731
|
+
doc_id: {
|
|
732
|
+
type: "string",
|
|
733
|
+
"x-auto-generated": true,
|
|
734
|
+
title: "Document ID",
|
|
735
|
+
description: "Unique identifier for the document"
|
|
736
|
+
},
|
|
737
|
+
data: {
|
|
738
|
+
type: "string",
|
|
739
|
+
title: "Document Data",
|
|
740
|
+
description: "JSON-serialized document"
|
|
741
|
+
},
|
|
742
|
+
metadata: {
|
|
743
|
+
type: "object",
|
|
744
|
+
title: "Metadata",
|
|
745
|
+
description: "Metadata of the document"
|
|
746
|
+
}
|
|
747
|
+
},
|
|
748
|
+
required: ["doc_id", "data"],
|
|
749
|
+
additionalProperties: true
|
|
750
|
+
};
|
|
751
|
+
var DocumentStorageKey = ["doc_id"];
|
|
752
|
+
// src/document/StructuralParser.ts
|
|
753
|
+
import { uuid4 } from "@workglow/util";
|
|
754
|
+
/**
 * Turns raw text into a document node tree. Markdown input becomes nested
 * section nodes (one per `#`..`######` header) holding paragraph nodes; plain
 * text becomes a flat list of paragraph nodes under the document root.
 * All ranges are character offsets into the original `text`.
 */
class StructuralParser {
  /**
   * Parses markdown into a tree of section and paragraph nodes.
   *
   * @param {string} doc_id Not read by this method.
   * @param {string} text Markdown source.
   * @param {string} title Used as both `title` and `text` of the root node.
   * @returns {Promise<object>} The document root node.
   */
  static async parseMarkdown(doc_id, text, title) {
    const lines = text.split("\n");
    const root = {
      nodeId: uuid4(),
      kind: NodeKind.DOCUMENT,
      range: { startOffset: 0, endOffset: text.length },
      text: title,
      title,
      children: []
    };
    // Stack of currently open containers; the last entry receives new nodes.
    const openNodes = [root];
    let currentOffset = 0;
    let textBuffer = [];
    let textBufferStartOffset = 0;
    // Emits the buffered non-header lines as one paragraph (if non-blank).
    const flushTextBuffer = () => {
      if (textBuffer.length === 0) {
        return;
      }
      const content = textBuffer.join("\n").trim();
      if (content) {
        openNodes[openNodes.length - 1].children.push({
          nodeId: uuid4(),
          kind: NodeKind.PARAGRAPH,
          range: {
            startOffset: textBufferStartOffset,
            // Every line is counted as length + 1 for its newline, but the
            // last line has none, so after the final line the running offset
            // is text.length + 1. Clamp so the range stays inside the text.
            endOffset: Math.min(currentOffset, text.length)
          },
          text: content
        });
      }
      textBuffer = [];
    };
    for (const line of lines) {
      // `.` never matches "\r", so a CRLF-terminated header would fail the
      // `(.*)$` part. Strip one trailing "\r" for matching only; offsets below
      // still use the untouched line length.
      const headerMatch = line.replace(/\r$/, "").match(/^(#{1,6})\s+(.*)$/);
      if (headerMatch) {
        flushTextBuffer();
        const level = headerMatch[1].length;
        const headerTitle = headerMatch[2];
        // Close every open section at the same or a deeper level: it ends
        // where this header starts.
        while (
          openNodes.length > 1 &&
          openNodes[openNodes.length - 1].kind === NodeKind.SECTION &&
          openNodes[openNodes.length - 1].level >= level
        ) {
          openNodes.pop().range.endOffset = currentOffset;
        }
        const section = {
          nodeId: uuid4(),
          kind: NodeKind.SECTION,
          level,
          title: headerTitle,
          range: {
            startOffset: currentOffset,
            // Provisional end; shortened above if a later header closes it.
            endOffset: text.length
          },
          text: headerTitle,
          children: []
        };
        openNodes[openNodes.length - 1].children.push(section);
        openNodes.push(section);
      } else {
        if (textBuffer.length === 0) {
          textBufferStartOffset = currentOffset;
        }
        textBuffer.push(line);
      }
      currentOffset += line.length + 1;
    }
    flushTextBuffer();
    // Sections still open here keep their provisional endOffset of
    // text.length, so no further closing pass is needed.
    return root;
  }
  /**
   * Parses plain text into paragraph nodes, splitting on blank lines
   * (a newline, optional whitespace, another newline). Each paragraph's
   * range covers its trimmed text.
   *
   * @param {string} doc_id Not read by this method.
   * @param {string} text Plain-text source.
   * @param {string} title Used as both `title` and `text` of the root node.
   * @returns {Promise<object>} The document root node.
   */
  static async parsePlainText(doc_id, text, title) {
    const root = {
      nodeId: uuid4(),
      kind: NodeKind.DOCUMENT,
      range: { startOffset: 0, endOffset: text.length },
      text: title,
      title,
      children: []
    };
    // Adds a paragraph for the raw slice starting at `rawStart`, skipping
    // slices that are empty after trimming.
    const appendParagraph = (rawStart, rawParagraph) => {
      const paragraphText = rawParagraph.trim();
      if (paragraphText.length === 0) {
        return;
      }
      const startOffset = rawStart + rawParagraph.indexOf(paragraphText);
      root.children.push({
        nodeId: uuid4(),
        kind: NodeKind.PARAGRAPH,
        range: {
          startOffset,
          endOffset: startOffset + paragraphText.length
        },
        text: paragraphText
      });
    };
    const paragraphRegex = /\n\s*\n/g;
    let lastIndex = 0;
    let match;
    while ((match = paragraphRegex.exec(text)) !== null) {
      appendParagraph(lastIndex, text.slice(lastIndex, match.index));
      lastIndex = paragraphRegex.lastIndex;
    }
    // Text after the last blank-line separator (or the whole text if none).
    if (lastIndex < text.length) {
      appendParagraph(lastIndex, text.slice(lastIndex));
    }
    return root;
  }
  /**
   * Chooses a parser: markdown when `format` is "markdown", or when no
   * format is given and the text looks like markdown; plain text otherwise.
   *
   * @param {string} doc_id Forwarded to the chosen parser.
   * @param {string} text Source text.
   * @param {string} title Document title.
   * @param {string} [format] Explicit format name.
   * @returns {Promise<object>} The document root node.
   */
  static parse(doc_id, text, title, format) {
    if (format === "markdown" || !format && this.looksLikeMarkdown(text)) {
      return this.parseMarkdown(doc_id, text, title);
    }
    return this.parsePlainText(doc_id, text, title);
  }
  /**
   * @param {string} text
   * @returns {boolean} True if any line starts with 1-6 `#` characters
   *   followed by whitespace.
   */
  static looksLikeMarkdown(text) {
    return /^#{1,6}\s/m.test(text);
  }
}
|
|
913
|
+
// src/document-chunk/DocumentChunkDataset.ts
|
|
914
|
+
/**
 * Facade over a chunk storage backend. Every method forwards to the
 * `storage` object supplied at construction; chunk lookups and deletions
 * wrap the id in a `{ chunk_id }` key object first.
 */
class DocumentChunkDataset {
  /** The backend every call is delegated to. */
  storage;

  /**
   * @param {object} storage - Backend providing the methods forwarded below.
   */
  constructor(storage) {
    this.storage = storage;
  }

  /** @returns {object} The backend passed to the constructor. */
  getStorage() {
    const { storage } = this;
    return storage;
  }

  /** Store one chunk; resolves to the backend's result. */
  async put(chunk) {
    const { storage } = this;
    return storage.put(chunk);
  }

  /** Store several chunks; resolves to the backend's result. */
  async putBulk(chunks) {
    const { storage } = this;
    return storage.putBulk(chunks);
  }

  /** Look up a chunk by id using a `{ chunk_id }` key. */
  async get(chunk_id) {
    return this.storage.get({ chunk_id });
  }

  /** Delete a chunk by id using a `{ chunk_id }` key. */
  async delete(chunk_id) {
    return this.storage.delete({ chunk_id });
  }

  /** Forward a similarity search to the backend. */
  async similaritySearch(query, options) {
    const { storage } = this;
    return storage.similaritySearch(query, options);
  }

  /**
   * Forward a hybrid search to the backend when it provides one.
   * Rejects with an Error when the backend has no `hybridSearch`.
   */
  async hybridSearch(query, options) {
    const { storage } = this;
    if (!storage.hybridSearch) {
      throw new Error("Hybrid search not supported by this storage backend");
    }
    return storage.hybridSearch(query, options);
  }

  /** Resolve to whatever the backend's `getAll` returns. */
  async getAll() {
    const { storage } = this;
    return storage.getAll();
  }

  /** Resolve to whatever the backend's `size` returns. */
  async size() {
    const { storage } = this;
    return storage.size();
  }

  /** Forward `clear` to the backend. */
  async clear() {
    const { storage } = this;
    return storage.clear();
  }

  /** Forward `destroy` to the backend (synchronous pass-through). */
  destroy() {
    const { storage } = this;
    return storage.destroy();
  }

  /** Forward `setupDatabase` to the backend. */
  async setupDatabase() {
    const { storage } = this;
    return storage.setupDatabase();
  }

  /** @returns {*} The backend's `getVectorDimensions()` result. */
  getVectorDimensions() {
    const { storage } = this;
    return storage.getVectorDimensions();
  }
}
|
|
964
|
+
// src/document-chunk/DocumentChunkDatasetRegistry.ts
|
|
965
|
+
import {
|
|
966
|
+
createServiceToken as createServiceToken2,
|
|
967
|
+
globalServiceRegistry as globalServiceRegistry2,
|
|
968
|
+
registerInputResolver as registerInputResolver2
|
|
969
|
+
} from "@workglow/util";
|
|
970
|
+
// Service token under which the id -> dataset collection is looked up.
var DOCUMENT_CHUNK_DATASET = createServiceToken2("dataset.document-chunk");

// Install a default Map-producing factory only when nothing is registered
// under the token yet, so an existing registration is left untouched.
// NOTE(review): the trailing `true` is presumably a singleton flag; confirm
// against the @workglow/util service registry API.
if (!globalServiceRegistry2.has(DOCUMENT_CHUNK_DATASET)) {
  globalServiceRegistry2.register(DOCUMENT_CHUNK_DATASET, () => new Map(), true);
}
|
|
974
|
+
/**
 * Fetch the value registered under DOCUMENT_CHUNK_DATASET in the global
 * service registry. The default factory installed by this module yields a Map.
 *
 * @returns {*} The registered id -> dataset collection.
 */
function getGlobalDocumentChunkDataset() {
  const datasetsById = globalServiceRegistry2.get(DOCUMENT_CHUNK_DATASET);
  return datasetsById;
}
|
|
977
|
+
/**
 * Store `dataset` under `id` in the global document-chunk dataset
 * collection, replacing any entry already stored under that id.
 *
 * @param {string} id - Key to register the dataset under.
 * @param {object} dataset - The dataset instance to register.
 * @returns {void}
 */
function registerDocumentChunkDataset(id, dataset) {
  getGlobalDocumentChunkDataset().set(id, dataset);
}
|
|
981
|
+
/**
 * Look up a dataset by id in the global document-chunk dataset collection.
 *
 * @param {string} id - Key the dataset was registered under.
 * @returns {*} The collection's `get(id)` result (undefined for a Map miss).
 */
function getDocumentChunkDataset(id) {
  const registered = getGlobalDocumentChunkDataset();
  return registered.get(id);
}
|
|
984
|
+
/**
 * Resolve a dataset id to a registered document-chunk dataset.
 *
 * The collection registered under DOCUMENT_CHUNK_DATASET in the supplied
 * `registry` is used when that registry has the token; otherwise the
 * global collection is used.
 *
 * @param {string} id - Dataset id to resolve.
 * @param {*} format - Not read by this resolver.
 * @param {object} registry - Service registry exposing `has` and `get`.
 * @returns {Promise<object>} The dataset stored under `id`.
 * @throws {Error} When no (truthy) dataset is stored under `id`.
 */
async function resolveDocumentChunkDatasetFromRegistry(id, format, registry) {
  let datasets;
  if (registry.has(DOCUMENT_CHUNK_DATASET)) {
    datasets = registry.get(DOCUMENT_CHUNK_DATASET);
  } else {
    datasets = getGlobalDocumentChunkDataset();
  }

  const dataset = datasets.get(id);
  if (dataset) {
    return dataset;
  }
  throw new Error(`Document chunk dataset "${id}" not found in registry`);
}
|
|
992
|
+
// Register the registry lookup as the input resolver for the
// "dataset:document-chunk" key, as a side effect of loading this module.
registerInputResolver2("dataset:document-chunk", resolveDocumentChunkDatasetFromRegistry);
|
|
993
|
+
// src/document-chunk/DocumentChunkSchema.ts
|
|
994
|
+
import { TypedArraySchema } from "@workglow/util";
|
|
995
|
+
// Schema describing one stored document chunk. Properties outside the four
// listed here are rejected (`additionalProperties: false`), while the
// `metadata` object itself accepts arbitrary keys.
var DocumentChunkSchema = {
  type: "object",
  properties: {
    // Marked auto-generated via the "x-auto-generated" extension keyword.
    chunk_id: { type: "string", "x-auto-generated": true },
    doc_id: { type: "string" },
    vector: TypedArraySchema(),
    metadata: { type: "object", format: "metadata", additionalProperties: true },
  },
  additionalProperties: false,
};

// Column list forming the primary key: the chunk id alone.
var DocumentChunkPrimaryKey = ["chunk_id"];
|
|
1006
|
+
export {
|
|
1007
|
+
traverseDepthFirst,
|
|
1008
|
+
registerDocumentDataset,
|
|
1009
|
+
registerDocumentChunkDataset,
|
|
1010
|
+
hasChildren,
|
|
1011
|
+
getNodePath,
|
|
1012
|
+
getGlobalDocumentDatasets,
|
|
1013
|
+
getGlobalDocumentChunkDataset,
|
|
1014
|
+
getDocumentRange,
|
|
1015
|
+
getDocumentDataset,
|
|
1016
|
+
getDocumentChunkDataset,
|
|
1017
|
+
getChildren,
|
|
1018
|
+
estimateTokens,
|
|
1019
|
+
TypeTabularStorage,
|
|
1020
|
+
TypeDocumentDataset,
|
|
1021
|
+
TypeDocumentChunkDataset,
|
|
1022
|
+
TopicNodeSchema,
|
|
1023
|
+
TokenBudgetSchema,
|
|
1024
|
+
StructuralParser,
|
|
1025
|
+
SentenceNodeSchema,
|
|
1026
|
+
SectionNodeSchema,
|
|
1027
|
+
ParagraphNodeSchema,
|
|
1028
|
+
NodeRangeSchema,
|
|
1029
|
+
NodeKind,
|
|
1030
|
+
NodeEnrichmentSchema,
|
|
1031
|
+
EntitySchema,
|
|
1032
|
+
EnrichedChunkMetadataSchema,
|
|
1033
|
+
EnrichedChunkMetadataArraySchema,
|
|
1034
|
+
DocumentStorageSchema,
|
|
1035
|
+
DocumentStorageKey,
|
|
1036
|
+
DocumentRootNodeSchema,
|
|
1037
|
+
DocumentNodeSchema,
|
|
1038
|
+
DocumentNodeBaseSchema,
|
|
1039
|
+
DocumentMetadataSchema,
|
|
1040
|
+
DocumentDataset,
|
|
1041
|
+
DocumentChunkSchema,
|
|
1042
|
+
DocumentChunkPrimaryKey,
|
|
1043
|
+
DocumentChunkDataset,
|
|
1044
|
+
Document,
|
|
1045
|
+
DOCUMENT_DATASETS,
|
|
1046
|
+
DOCUMENT_CHUNK_DATASET,
|
|
1047
|
+
ChunkNodeSchema,
|
|
1048
|
+
ChunkMetadataSchema,
|
|
1049
|
+
ChunkMetadataArraySchema,
|
|
1050
|
+
ChunkEnrichmentSchema
|
|
1051
|
+
};
|
|
1052
|
+
|
|
1053
|
+
//# debugId=10C8E3D0DA933AA664756E2164756E21
|