@workglow/knowledge-base 0.0.115
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +670 -0
- package/dist/browser.js +1071 -0
- package/dist/browser.js.map +23 -0
- package/dist/bun.js +1072 -0
- package/dist/bun.js.map +23 -0
- package/dist/chunk/ChunkSchema.d.ts +206 -0
- package/dist/chunk/ChunkSchema.d.ts.map +1 -0
- package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
- package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
- package/dist/common-server.d.ts +7 -0
- package/dist/common-server.d.ts.map +1 -0
- package/dist/common.d.ts +20 -0
- package/dist/common.d.ts.map +1 -0
- package/dist/document/Document.d.ts +51 -0
- package/dist/document/Document.d.ts.map +1 -0
- package/dist/document/DocumentNode.d.ts +32 -0
- package/dist/document/DocumentNode.d.ts.map +1 -0
- package/dist/document/DocumentSchema.d.ts +1203 -0
- package/dist/document/DocumentSchema.d.ts.map +1 -0
- package/dist/document/DocumentStorageSchema.d.ts +43 -0
- package/dist/document/DocumentStorageSchema.d.ts.map +1 -0
- package/dist/document/StructuralParser.d.ts +30 -0
- package/dist/document/StructuralParser.d.ts.map +1 -0
- package/dist/knowledge-base/InMemoryKnowledgeBaseRepository.d.ts +13 -0
- package/dist/knowledge-base/InMemoryKnowledgeBaseRepository.d.ts.map +1 -0
- package/dist/knowledge-base/KnowledgeBase.d.ts +123 -0
- package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
- package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +38 -0
- package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
- package/dist/knowledge-base/KnowledgeBaseRepository.d.ts +74 -0
- package/dist/knowledge-base/KnowledgeBaseRepository.d.ts.map +1 -0
- package/dist/knowledge-base/KnowledgeBaseSchema.d.ts +50 -0
- package/dist/knowledge-base/KnowledgeBaseSchema.d.ts.map +1 -0
- package/dist/knowledge-base/createKnowledgeBase.d.ts +30 -0
- package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
- package/dist/node.js +1071 -0
- package/dist/node.js.map +23 -0
- package/dist/types.d.ts +7 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/util/DatasetSchema.d.ts +40 -0
- package/dist/util/DatasetSchema.d.ts.map +1 -0
- package/package.json +55 -0
package/dist/node.js
ADDED
|
@@ -0,0 +1,1071 @@
|
|
|
1
|
+
// src/document/DocumentSchema.ts
|
|
2
|
+
// Discriminator values used in the `kind` field of document tree nodes.
// Each key is the upper-cased form of its value (e.g. DOCUMENT -> "document").
var NodeKind = Object.fromEntries(
  ["document", "section", "paragraph", "sentence", "topic"].map((kind) => [kind.toUpperCase(), kind])
);
|
|
9
|
+
// JSON Schema for the character range covered by a node: two required integer
// offsets, no other properties allowed.
var NodeRangeSchema = (() => {
  // Both offsets share the same shape; only title/description differ.
  const offset = (title, description) => ({ type: "integer", title, description });
  return {
    type: "object",
    properties: {
      startOffset: offset("Start Offset", "Starting character offset"),
      endOffset: offset("End Offset", "Ending character offset")
    },
    required: ["startOffset", "endOffset"],
    additionalProperties: false
  };
})();
|
|
26
|
+
// JSON Schema for a named entity: its text, its type label and a numeric
// confidence score. All three fields are required.
var EntitySchema = (() => {
  const field = (type, title, description) => ({ type, title, description });
  const properties = {
    text: field("string", "Text", "Entity text"),
    type: field("string", "Type", "Entity type (e.g., PERSON, ORG, LOC)"),
    score: field("number", "Score", "Confidence score")
  };
  return {
    type: "object",
    properties,
    // Every declared property is mandatory, in declaration order.
    required: Object.keys(properties),
    additionalProperties: false
  };
})();
|
|
48
|
+
// JSON Schema for the optional enrichment attached to a node: a summary,
// extracted entities and keywords. No field is required.
var NodeEnrichmentSchema = (() => {
  const listOf = (items, title, description) => ({ type: "array", items, title, description });
  return {
    type: "object",
    properties: {
      summary: {
        type: "string",
        title: "Summary",
        description: "Summary of the node content"
      },
      entities: listOf(EntitySchema, "Entities", "Named entities extracted from the node"),
      keywords: listOf({ type: "string" }, "Keywords", "Keywords associated with the node")
    },
    additionalProperties: false
  };
})();
|
|
71
|
+
// JSON Schema shared by every node kind: id, kind discriminator, character
// range and text are required; enrichment is optional. Extra properties are
// allowed here so the per-kind schemas can extend it.
var DocumentNodeBaseSchema = (() => {
  const text = (title, description) => ({ type: "string", title, description });
  return {
    type: "object",
    properties: {
      nodeId: text("Node ID", "Unique identifier for this node"),
      kind: {
        type: "string",
        // Any of the values declared in NodeKind.
        enum: Object.values(NodeKind),
        title: "Kind",
        description: "Node type discriminator"
      },
      range: NodeRangeSchema,
      text: text("Text", "Text content of the node"),
      enrichment: NodeEnrichmentSchema
    },
    required: ["nodeId", "kind", "range", "text"],
    additionalProperties: true
  };
})();
|
|
96
|
+
// JSON Schema for a generic node of the document tree. It adds the optional
// `level`, `title` and `children` fields to the base properties; `children`
// is declared as a plain array with no `items` constraint.
var DocumentNodeSchema = {
  type: "object",
  title: "Document Node",
  description: "A node in the hierarchical document tree",
  properties: Object.assign({}, DocumentNodeBaseSchema.properties, {
    level: {
      type: "integer",
      title: "Level",
      description: "Header level for section nodes"
    },
    title: {
      type: "string",
      title: "Title",
      description: "Section title"
    },
    children: {
      type: "array",
      title: "Children",
      description: "Child nodes"
    }
  }),
  // Copy so the base schema's array is not shared.
  required: DocumentNodeBaseSchema.required.slice(),
  additionalProperties: false
};
|
|
121
|
+
// JSON Schema for a paragraph node: the base node properties with `kind`
// pinned to NodeKind.PARAGRAPH.
var ParagraphNodeSchema = {
  type: "object",
  properties: Object.assign({}, DocumentNodeBaseSchema.properties, {
    kind: {
      type: "string",
      const: NodeKind.PARAGRAPH,
      title: "Kind",
      description: "Node type discriminator"
    }
  }),
  required: DocumentNodeBaseSchema.required.slice(),
  additionalProperties: false
};
|
|
135
|
+
// JSON Schema for a sentence node: the base node properties with `kind`
// pinned to NodeKind.SENTENCE.
var SentenceNodeSchema = {
  type: "object",
  properties: Object.assign({}, DocumentNodeBaseSchema.properties, {
    kind: {
      type: "string",
      const: NodeKind.SENTENCE,
      title: "Kind",
      description: "Node type discriminator"
    }
  }),
  required: DocumentNodeBaseSchema.required.slice(),
  additionalProperties: false
};
|
|
149
|
+
// JSON Schema for a section node: base properties plus a header level (1-6),
// a title and a list of child nodes, all of which are required.
var SectionNodeSchema = {
  type: "object",
  properties: Object.assign({}, DocumentNodeBaseSchema.properties, {
    kind: {
      type: "string",
      const: NodeKind.SECTION,
      title: "Kind",
      description: "Node type discriminator"
    },
    level: {
      type: "integer",
      minimum: 1,
      maximum: 6,
      title: "Level",
      description: "Header level (1-6 for markdown)"
    },
    title: {
      type: "string",
      title: "Title",
      description: "Section title"
    },
    children: {
      type: "array",
      items: DocumentNodeSchema,
      title: "Children",
      description: "Child nodes"
    }
  }),
  required: DocumentNodeBaseSchema.required.concat("level", "title", "children"),
  additionalProperties: false
};
|
|
181
|
+
// JSON Schema for a topic node: base properties with `kind` pinned to
// NodeKind.TOPIC and a required list of child nodes.
var TopicNodeSchema = {
  type: "object",
  properties: Object.assign({}, DocumentNodeBaseSchema.properties, {
    kind: {
      type: "string",
      const: NodeKind.TOPIC,
      title: "Kind",
      description: "Node type discriminator"
    },
    children: {
      type: "array",
      items: DocumentNodeSchema,
      title: "Children",
      description: "Child nodes"
    }
  }),
  required: DocumentNodeBaseSchema.required.concat("children"),
  additionalProperties: false
};
|
|
201
|
+
// JSON Schema for the root node of a document: base properties with `kind`
// pinned to NodeKind.DOCUMENT, plus a required title and child list.
var DocumentRootNodeSchema = {
  type: "object",
  properties: Object.assign({}, DocumentNodeBaseSchema.properties, {
    kind: {
      type: "string",
      const: NodeKind.DOCUMENT,
      title: "Kind",
      description: "Node type discriminator"
    },
    title: {
      type: "string",
      title: "Title",
      description: "Document title"
    },
    children: {
      type: "array",
      items: DocumentNodeSchema,
      title: "Children",
      description: "Child nodes"
    }
  }),
  required: DocumentNodeBaseSchema.required.concat("title", "children"),
  additionalProperties: false
};
|
|
226
|
+
// JSON Schema for a chunking token budget: three required integer counts.
var TokenBudgetSchema = (() => {
  const count = (title, description) => ({ type: "integer", title, description });
  const properties = {
    maxTokensPerChunk: count("Max Tokens Per Chunk", "Maximum tokens allowed per chunk"),
    overlapTokens: count("Overlap Tokens", "Number of tokens to overlap between chunks"),
    reservedTokens: count("Reserved Tokens", "Tokens reserved for metadata or context")
  };
  return {
    type: "object",
    properties,
    // All declared counts are mandatory, in declaration order.
    required: Object.keys(properties),
    additionalProperties: false
  };
})();
|
|
248
|
+
// JSON Schema for document metadata. Only `title` is required; additional
// properties beyond the three declared ones are permitted.
var DocumentMetadataSchema = (() => {
  const text = (title, description) => ({ type: "string", title, description });
  return {
    type: "object",
    properties: {
      title: text("Title", "Document title"),
      sourceUri: text("Source URI", "Original source URI of the document"),
      createdAt: text("Created At", "ISO timestamp of creation")
    },
    required: ["title"],
    additionalProperties: true
  };
})();
|
|
270
|
+
|
|
271
|
+
// src/chunk/ChunkSchema.ts
|
|
272
|
+
// Factory returning a fresh JSON Schema object for a chunk record on every
// call. Chunk id, document id, text, node path and depth are required; the
// remaining fields are optional and extra properties are allowed.
var ChunkRecordSchema = () => {
  const text = (title, description) => ({ type: "string", title, description });
  const textList = (title, description) => ({
    type: "array",
    items: { type: "string" },
    title,
    description
  });
  return {
    type: "object",
    properties: {
      chunkId: text("Chunk ID", "Unique identifier for this chunk"),
      doc_id: text("Document ID", "ID of the parent document"),
      text: text("Text", "Text content of the chunk"),
      nodePath: textList("Node Path", "Node IDs from root to leaf"),
      depth: {
        type: "integer",
        title: "Depth",
        description: "Depth in the document tree"
      },
      leafNodeId: text("Leaf Node ID", "ID of the leaf node this chunk belongs to"),
      summary: text("Summary", "Summary of the chunk content"),
      entities: {
        type: "array",
        items: EntitySchema,
        title: "Entities",
        description: "Named entities extracted from the chunk"
      },
      parentSummaries: textList("Parent Summaries", "Summaries from ancestor nodes"),
      sectionTitles: textList("Section Titles", "Titles of ancestor section nodes"),
      doc_title: text("Document Title", "Title of the parent document")
    },
    required: ["chunkId", "doc_id", "text", "nodePath", "depth"],
    additionalProperties: true
  };
};
|
|
338
|
+
// JSON Schema for an array of chunk records. The item schema is created once,
// when this module is evaluated.
var ChunkRecordArraySchema = (() => {
  const items = ChunkRecordSchema();
  return {
    type: "array",
    items,
    title: "Chunk Records",
    description: "Array of chunk records"
  };
})();
|
|
344
|
+
// src/chunk/ChunkVectorStorageSchema.ts
|
|
345
|
+
import { TypedArraySchema } from "@workglow/util";
|
|
346
|
+
// JSON Schema for a row in the chunk vector store: chunk id, owning document
// id, the vector and a free-form metadata object. All four are required.
// NOTE(review): the meaning of "x-auto-generated" and of the "metadata"
// format is defined by the storage layer, not here -- confirm there.
var ChunkVectorStorageSchema = (() => {
  const properties = {
    chunk_id: { type: "string", "x-auto-generated": true },
    doc_id: { type: "string" },
    vector: TypedArraySchema(),
    metadata: { type: "object", format: "metadata", additionalProperties: true }
  };
  return {
    type: "object",
    properties,
    // Every declared column is mandatory, in declaration order.
    required: Object.keys(properties),
    additionalProperties: false
  };
})();
// Primary key column(s) of the chunk vector store.
var ChunkVectorPrimaryKey = ["chunk_id"];
|
|
358
|
+
// src/document/Document.ts
|
|
359
|
+
/**
 * In-memory document: a node tree (`root`), its metadata, the chunks derived
 * from it and an optional storage id.
 */
class Document {
  doc_id;
  metadata;
  root;
  chunks;
  /**
   * @param root - Root node of the document tree.
   * @param metadata - Document metadata object.
   * @param chunks - Chunk records; `undefined`/`null` become an empty list.
   * @param doc_id - Optional document id (may be assigned later via setDocId).
   */
  constructor(root, metadata, chunks = [], doc_id) {
    this.doc_id = doc_id;
    this.root = root;
    this.metadata = metadata;
    // The default covers `undefined`; this also maps `null` to an empty list.
    this.chunks = chunks || [];
  }
  /** Replaces the chunk list. */
  setChunks(chunks) {
    this.chunks = chunks;
  }
  /** Returns the current chunk list (the internal array, not a copy). */
  getChunks() {
    return this.chunks;
  }
  /** Sets the document id. */
  setDocId(doc_id) {
    this.doc_id = doc_id;
  }
  /** Returns the chunks whose `nodePath` contains `nodeId`. */
  findChunksByNodeId(nodeId) {
    return this.chunks.filter((chunk) => chunk.nodePath.includes(nodeId));
  }
  /** Serializable form of the document; `doc_id` is intentionally not part of it. */
  toJSON() {
    return {
      metadata: this.metadata,
      root: this.root,
      chunks: this.chunks
    };
  }
  /**
   * Rebuilds a Document from its serialized form.
   *
   * @param json - Either a JSON string (as produced by
   *   `JSON.stringify(doc.toJSON())`) or the already-parsed object.
   * @param doc_id - Id to assign to the rebuilt document.
   * @throws {SyntaxError} When `json` is a string that is not valid JSON.
   */
  static fromJSON(json, doc_id) {
    // Accept pre-parsed input too: JSON.parse on an object would throw.
    const parsed = typeof json === "string" ? JSON.parse(json) : json;
    return new Document(parsed.root, parsed.metadata, parsed.chunks, doc_id);
  }
}
|
|
394
|
+
|
|
395
|
+
// src/knowledge-base/KnowledgeBase.ts
|
|
396
|
+
/**
 * Facade over two storages: a tabular storage holding serialized documents
 * and a chunk (vector) storage holding embedded chunks.
 *
 * The tabular storage is used through put/get/delete/getAll/setupDatabase/
 * destroy; the chunk storage through put/putBulk/get/getAll/query/
 * deleteSearch/deleteAll/size/similaritySearch/getVectorDimensions/
 * setupDatabase/destroy and, optionally, hybridSearch.
 */
class KnowledgeBase {
  name;
  title;
  description;
  tabularStorage;
  chunkStorage;
  /**
   * @param name - Knowledge base name; also the default title.
   * @param documentStorage - Tabular storage for serialized documents.
   * @param chunkStorage - Vector storage for chunks.
   * @param title - Optional display title (defaults to `name`).
   * @param description - Optional description (defaults to "").
   */
  constructor(name, documentStorage, chunkStorage, title, description) {
    this.name = name;
    this.title = title ?? name;
    this.description = description ?? "";
    this.tabularStorage = documentStorage;
    this.chunkStorage = chunkStorage;
  }
  /**
   * Serializes and stores a document. If the storage returns a different
   * `doc_id` than the document carries, the document is updated with it.
   * @returns The same document instance.
   */
  async upsertDocument(document) {
    const stored = await this.tabularStorage.put({
      doc_id: document.doc_id,
      data: JSON.stringify(document.toJSON())
    });
    if (document.doc_id !== stored.doc_id) {
      document.setDocId(stored.doc_id);
    }
    return document;
  }
  /** Loads a document by id; resolves to `undefined` when it is not stored. */
  async getDocument(doc_id) {
    const entity = await this.tabularStorage.get({ doc_id });
    if (!entity) {
      return;
    }
    return Document.fromJSON(entity.data, entity.doc_id);
  }
  /** Deletes the document's chunks first, then the document row. */
  async deleteDocument(doc_id) {
    await this.deleteChunksForDocument(doc_id);
    await this.tabularStorage.delete({ doc_id });
  }
  /** Lists the ids of all stored documents. */
  async listDocuments() {
    const entities = await this.tabularStorage.getAll();
    if (!entities) {
      return [];
    }
    return entities.map((entity) => entity.doc_id);
  }
  /**
   * Finds a node by id inside a stored document (depth-first, pre-order).
   * Resolves to `undefined` when the document or the node does not exist.
   */
  async getNode(doc_id, nodeId) {
    const doc = await this.getDocument(doc_id);
    if (!doc) {
      return;
    }
    const locate = (node) => {
      if (node.nodeId === nodeId) {
        return node;
      }
      if ("children" in node && Array.isArray(node.children)) {
        for (const child of node.children) {
          const hit = locate(child);
          if (hit) {
            return hit;
          }
        }
      }
      return;
    };
    return locate(doc.root);
  }
  /**
   * Returns the chain of nodes from the document root down to (and
   * including) the node with `nodeId`, or `[]` when the document or node is
   * missing.
   *
   * The chain is collected during a single depth-first traversal, so it is
   * always the exact branch on which the node was found (re-resolving the
   * branch by id afterwards could follow the wrong sibling when ids collide).
   */
  async getAncestors(doc_id, nodeId) {
    const doc = await this.getDocument(doc_id);
    if (!doc) {
      return [];
    }
    const trail = [];
    const descend = (node) => {
      trail.push(node);
      if (node.nodeId === nodeId) {
        return true;
      }
      if ("children" in node && Array.isArray(node.children)) {
        for (const child of node.children) {
          if (descend(child)) {
            return true;
          }
        }
      }
      // Dead end: this node is not on the path to the target.
      trail.pop();
      return false;
    };
    return descend(doc.root) ? trail : [];
  }
  /**
   * Stores one chunk after checking its vector length.
   * @throws {Error} When the vector length differs from the storage's dimensions.
   */
  async upsertChunk(chunk) {
    const expected = this.getVectorDimensions();
    if (chunk.vector.length !== expected) {
      throw new Error(`Vector dimension mismatch: expected ${expected}, got ${chunk.vector.length}.`);
    }
    return this.chunkStorage.put(chunk);
  }
  /**
   * Stores many chunks; every vector is validated before anything is written.
   * @throws {Error} On the first vector whose length differs from the storage's dimensions.
   */
  async upsertChunksBulk(chunks) {
    const expected = this.getVectorDimensions();
    for (const chunk of chunks) {
      if (chunk.vector.length !== expected) {
        throw new Error(`Vector dimension mismatch: expected ${expected}, got ${chunk.vector.length}.`);
      }
    }
    return this.chunkStorage.putBulk(chunks);
  }
  /** Deletes all chunks whose `doc_id` matches. */
  async deleteChunksForDocument(doc_id) {
    await this.chunkStorage.deleteSearch({ doc_id });
  }
  /** Queries the chunk storage for a document's chunks; never resolves to null/undefined. */
  async getChunksForDocument(doc_id) {
    const results = await this.chunkStorage.query({ doc_id });
    return results ?? [];
  }
  /** Delegates to the chunk storage's similarity search. */
  async similaritySearch(query, options) {
    return this.chunkStorage.similaritySearch(query, options);
  }
  /**
   * Delegates to the chunk storage's hybrid search.
   * @throws {Error} When the chunk storage does not implement `hybridSearch`.
   */
  async hybridSearch(query, options) {
    if (typeof this.chunkStorage.hybridSearch !== "function") {
      throw new Error("Hybrid search is not supported by the configured chunk storage backend. Please use a vector storage implementation that provides `hybridSearch`.");
    }
    return this.chunkStorage.hybridSearch(query, options);
  }
  /**
   * Loads a document and deletes its chunks so it can be re-chunked.
   * Resolves to `undefined` (and deletes nothing) when the document is missing.
   */
  async prepareReindex(doc_id) {
    const doc = await this.getDocument(doc_id);
    if (!doc) {
      return;
    }
    await this.deleteChunksForDocument(doc_id);
    return doc;
  }
  /** Sets up the tabular storage, then the chunk storage. */
  async setupDatabase() {
    await this.tabularStorage.setupDatabase();
    await this.chunkStorage.setupDatabase();
  }
  /** Destroys both storages (results are not awaited). */
  destroy() {
    this.tabularStorage.destroy();
    this.chunkStorage.destroy();
  }
  /** Fetches a single chunk by id from the chunk storage. */
  async getChunk(chunk_id) {
    return this.chunkStorage.get({ chunk_id });
  }
  /** Raw passthrough to the chunk storage; unlike upsertChunk, no dimension check. */
  async put(chunk) {
    return this.chunkStorage.put(chunk);
  }
  /** Raw passthrough to the chunk storage; unlike upsertChunksBulk, no dimension check. */
  async putBulk(chunks) {
    return this.chunkStorage.putBulk(chunks);
  }
  /** Returns whatever the chunk storage's `getAll` yields. */
  async getAllChunks() {
    return this.chunkStorage.getAll();
  }
  /** Number of chunks as reported by the chunk storage. */
  async chunkCount() {
    return this.chunkStorage.size();
  }
  /** Deletes every chunk in the chunk storage. */
  async clearChunks() {
    return this.chunkStorage.deleteAll();
  }
  /** Vector dimensionality as reported by the chunk storage. */
  getVectorDimensions() {
    return this.chunkStorage.getVectorDimensions();
  }
  /** Chunks embedded in the stored document itself, or `[]` when it is missing. */
  async getDocumentChunks(doc_id) {
    const doc = await this.getDocument(doc_id);
    if (!doc) {
      return [];
    }
    return doc.getChunks();
  }
  /** Document-embedded chunks whose node path contains `nodeId`, or `[]` when the document is missing. */
  async findChunksByNodeId(doc_id, nodeId) {
    const doc = await this.getDocument(doc_id);
    if (!doc) {
      return [];
    }
    return doc.findChunksByNodeId(nodeId);
  }
}
|
|
585
|
+
// src/knowledge-base/KnowledgeBaseSchema.ts
|
|
586
|
+
// JSON Schema for a persisted knowledge-base record. Every column is required
// and no additional properties are allowed.
var KnowledgeBaseRecordSchema = (() => {
  // Column name -> JSON type, in declaration order.
  const columns = [
    ["kb_id", "string"],
    ["title", "string"],
    ["description", "string"],
    ["vector_dimensions", "integer"],
    ["document_table", "string"],
    ["chunk_table", "string"],
    ["created_at", "string"],
    ["updated_at", "string"]
  ];
  const properties = {};
  for (const [column, type] of columns) {
    properties[column] = { type };
  }
  return {
    type: "object",
    properties,
    required: columns.map(([column]) => column),
    additionalProperties: false
  };
})();
// Primary key column(s) of the knowledge-base record table.
var KnowledgeBasePrimaryKeyNames = ["kb_id"];
|
|
611
|
+
/**
 * Derives the document and chunk table names for a knowledge-base id.
 * Every character outside [a-zA-Z0-9_] is replaced with "_", so distinct ids
 * such as "a-b" and "a_b" map to the same table names.
 *
 * @param kbId - Knowledge-base id (string).
 * @returns `{ documentTable, chunkTable }`.
 */
function knowledgeBaseTableNames(kbId) {
  const sanitizedId = kbId.replace(/[^a-zA-Z0-9_]/g, "_");
  const [documentTable, chunkTable] = ["kb_docs_", "kb_chunks_"].map((prefix) => `${prefix}${sanitizedId}`);
  return { documentTable, chunkTable };
}
|
|
618
|
+
// src/knowledge-base/KnowledgeBaseRepository.ts
|
|
619
|
+
import { EventEmitter } from "@workglow/util";
|
|
620
|
+
|
|
621
|
+
/**
 * Repository of knowledge-base records on top of a tabular storage. Emits
 * "knowledge_base_added" / "knowledge_base_removed" events through an
 * internal EventEmitter.
 */
class KnowledgeBaseRepository {
  storage;
  events = new EventEmitter();
  /** @param storage - Tabular storage used to persist the records. */
  constructor(storage) {
    this.storage = storage;
  }
  /** Runs the storage's `setupDatabase` when the storage provides one. */
  async setupDatabase() {
    await this.storage.setupDatabase?.();
  }
  /** Subscribes `fn` to event `name`. */
  on(name, fn) {
    this.events.on(name, fn);
  }
  /** Unsubscribes `fn` from event `name`. */
  off(name, fn) {
    this.events.off(name, fn);
  }
  /** Subscribes `fn` to the next occurrence of event `name` only. */
  once(name, fn) {
    this.events.once(name, fn);
  }
  /** Returns the emitter's `waitOn(name)` result. */
  waitOn(name) {
    return this.events.waitOn(name);
  }
  /** Stores the record, then emits "knowledge_base_added"; resolves to the record. */
  async addKnowledgeBase(record) {
    await this.storage.put(record);
    this.events.emit("knowledge_base_added", record);
    return record;
  }
  /**
   * Deletes the record with the given id and emits "knowledge_base_removed".
   * @throws {Error} When no record with that id exists.
   */
  async removeKnowledgeBase(kb_id) {
    const key = { kb_id };
    const existing = await this.storage.get(key);
    if (!existing) {
      throw new Error(`KnowledgeBase with id "${kb_id}" not found`);
    }
    await this.storage.delete({ kb_id });
    this.events.emit("knowledge_base_removed", existing);
  }
  /** Looks up a record by id; resolves to `undefined` for non-string ids or misses. */
  async getKnowledgeBase(kb_id) {
    if (typeof kb_id === "string") {
      const found = await this.storage.get({ kb_id });
      return found ?? undefined;
    }
    return undefined;
  }
  /** Resolves to all stored records, or `[]` when there are none. */
  async enumerateAll() {
    const all = await this.storage.getAll();
    const isEmpty = !all || all.length === 0;
    return isEmpty ? [] : all;
  }
  /** Number of stored records as reported by the storage. */
  async size() {
    const count = await this.storage.size();
    return count;
  }
}
|
|
671
|
+
// src/knowledge-base/InMemoryKnowledgeBaseRepository.ts
|
|
672
|
+
import { InMemoryTabularStorage } from "@workglow/storage";
|
|
673
|
+
/**
 * KnowledgeBaseRepository backed by an InMemoryTabularStorage built from the
 * knowledge-base record schema and its primary key.
 */
class InMemoryKnowledgeBaseRepository extends KnowledgeBaseRepository {
  constructor() {
    const memoryStorage = new InMemoryTabularStorage(KnowledgeBaseRecordSchema, KnowledgeBasePrimaryKeyNames);
    super(memoryStorage);
  }
}
|
|
678
|
+
// src/knowledge-base/KnowledgeBaseRegistry.ts
|
|
679
|
+
import {
|
|
680
|
+
createServiceToken,
|
|
681
|
+
globalServiceRegistry,
|
|
682
|
+
registerInputResolver
|
|
683
|
+
} from "@workglow/util";
|
|
684
|
+
// Service tokens for the live knowledge-base map and the record repository.
var KNOWLEDGE_BASES = createServiceToken("knowledge-base.registry");
var KNOWLEDGE_BASE_REPOSITORY = createServiceToken("knowledge-base.repository");
// Register default factories only when nothing is registered yet, so an
// earlier registration is never overwritten.
// NOTE(review): the third `register` argument is presumably a singleton
// flag -- confirm against the service registry API.
for (const [token, factory] of [
  [KNOWLEDGE_BASES, () => new Map()],
  [KNOWLEDGE_BASE_REPOSITORY, () => new InMemoryKnowledgeBaseRepository()]
]) {
  if (!globalServiceRegistry.has(token)) {
    globalServiceRegistry.register(token, factory, true);
  }
}
|
|
692
|
+
/** Returns the service registered under KNOWLEDGE_BASES in the global registry. */
function getGlobalKnowledgeBases() {
  const knowledgeBases = globalServiceRegistry.get(KNOWLEDGE_BASES);
  return knowledgeBases;
}
|
|
695
|
+
/** Returns the service registered under KNOWLEDGE_BASE_REPOSITORY in the global registry. */
function getGlobalKnowledgeBaseRepository() {
  const repository = globalServiceRegistry.get(KNOWLEDGE_BASE_REPOSITORY);
  return repository;
}
|
|
698
|
+
/**
 * Registers `repository` as the instance for KNOWLEDGE_BASE_REPOSITORY in the
 * global service registry.
 */
function setGlobalKnowledgeBaseRepository(repository) {
  const token = KNOWLEDGE_BASE_REPOSITORY;
  globalServiceRegistry.registerInstance(token, repository);
}
|
|
701
|
+
/**
 * Registers a live knowledge base under `id` and persists a matching record
 * in the global repository. `created_at` and `updated_at` are both set to
 * the current time.
 *
 * @param id - Knowledge-base id; also used to derive the table names.
 * @param kb - Knowledge base providing `title`, `description` and `getVectorDimensions()`.
 */
async function registerKnowledgeBase(id, kb) {
  // The live instance is registered first, before the record is persisted.
  getGlobalKnowledgeBases().set(id, kb);
  const timestamp = new Date().toISOString();
  const { documentTable, chunkTable } = knowledgeBaseTableNames(id);
  const record = {
    kb_id: id,
    title: kb.title,
    description: kb.description,
    vector_dimensions: kb.getVectorDimensions(),
    document_table: documentTable,
    chunk_table: chunkTable,
    created_at: timestamp,
    updated_at: timestamp
  };
  const repository = getGlobalKnowledgeBaseRepository();
  await repository.addKnowledgeBase(record);
}
|
|
719
|
+
/** Looks up a registered knowledge base by id in the global map. */
function getKnowledgeBase(id) {
  const registered = getGlobalKnowledgeBases();
  return registered.get(id);
}
|
|
722
|
+
/**
 * Input resolver for the "knowledge-base" format: maps an id to a registered
 * knowledge base. Uses the supplied registry's map when it has one,
 * otherwise the global map.
 *
 * @param id - Knowledge-base id to resolve.
 * @param format - Unused here.
 * @param registry - Service registry to consult first.
 * @throws {Error} When no knowledge base is registered under `id`.
 */
async function resolveKnowledgeBaseFromRegistry(id, format, registry) {
  let candidates;
  if (registry.has(KNOWLEDGE_BASES)) {
    candidates = registry.get(KNOWLEDGE_BASES);
  } else {
    candidates = getGlobalKnowledgeBases();
  }
  const match = candidates.get(id);
  if (match) {
    return match;
  }
  throw new Error(`Knowledge base "${id}" not found in registry`);
}
// Hook the resolver into input resolution for the "knowledge-base" format.
registerInputResolver("knowledge-base", resolveKnowledgeBaseFromRegistry);
|
|
731
|
+
// src/knowledge-base/createKnowledgeBase.ts
|
|
732
|
+
import { InMemoryTabularStorage as InMemoryTabularStorage2, InMemoryVectorStorage } from "@workglow/storage";
|
|
733
|
+
|
|
734
|
+
// src/document/DocumentStorageSchema.ts
|
|
735
|
+
// JSON Schema for a stored document row: an id, the JSON-serialized document
// and an optional metadata object. Additional properties are allowed.
// NOTE(review): "x-auto-generated" is interpreted by the storage layer --
// confirm its exact semantics there.
var DocumentStorageSchema = (() => {
  const described = (type, title, description) => ({ type, title, description });
  return {
    type: "object",
    properties: {
      doc_id: {
        type: "string",
        "x-auto-generated": true,
        title: "Document ID",
        description: "Unique identifier for the document"
      },
      data: described("string", "Document Data", "JSON-serialized document"),
      metadata: described("object", "Metadata", "Metadata of the document")
    },
    required: ["doc_id", "data"],
    additionalProperties: true
  };
})();
// Primary key column(s) of the document table.
var DocumentStorageKey = ["doc_id"];
|
|
759
|
+
|
|
760
|
+
// src/knowledge-base/createKnowledgeBase.ts
|
|
761
|
+
/**
 * Creates a knowledge base backed by in-memory document and vector storages.
 *
 * @param options.name - Knowledge-base name (also the registration id).
 * @param options.vectorDimensions - Vector dimensionality for the vector storage.
 * @param options.vectorType - Typed-array constructor; defaults to Float32Array.
 * @param options.register - Whether to register the result globally; defaults to true.
 * @param options.title - Optional display title.
 * @param options.description - Optional description.
 * @returns The created KnowledgeBase.
 */
async function createKnowledgeBase(options) {
  const { name, vectorDimensions, title, description } = options;
  // Defaults apply only to `undefined`, matching destructuring-default semantics.
  const vectorType = options.vectorType === undefined ? Float32Array : options.vectorType;
  const shouldRegister = options.register === undefined ? true : options.register;

  const documents = new InMemoryTabularStorage2(DocumentStorageSchema, DocumentStorageKey);
  await documents.setupDatabase();

  const vectors = new InMemoryVectorStorage(ChunkVectorStorageSchema, ChunkVectorPrimaryKey, [], vectorDimensions, vectorType);
  await vectors.setupDatabase();

  const knowledgeBase = new KnowledgeBase(name, documents, vectors, title, description);
  if (shouldRegister) {
    await registerKnowledgeBase(name, knowledgeBase);
  }
  return knowledgeBase;
}
|
|
780
|
+
// src/util/DatasetSchema.ts
|
|
781
|
+
/**
 * Builds a schema fragment for a "tabular storage" input that may be either
 * a storage id (string) or a storage instance.
 *
 * `options` may override `title`/`description` and add further keys;
 * `format` and `oneOf` are always set by this function.
 *
 * NOTE(review): the second `oneOf` branch has no `type`, so under standard
 * JSON Schema semantics a string would match both branches; the sibling
 * TypeKnowledgeBase uses `anyOf`. Confirm whether `oneOf` is intended here.
 */
function TypeTabularStorage(options = {}) {
  const defaults = {
    title: "Tabular Storage",
    description: "Storage ID or instance for tabular data storage"
  };
  const fixed = {
    format: "storage:tabular",
    oneOf: [
      { type: "string", title: "Storage ID" },
      { title: "Storage Instance", additionalProperties: true }
    ]
  };
  return Object.assign(defaults, options, fixed);
}
|
|
793
|
+
/**
 * Builds a schema fragment for a "knowledge base" input that may be either a
 * knowledge-base id (string) or a knowledge-base instance.
 *
 * `options` may override `title`/`description` and add further keys;
 * `format` and `anyOf` are always set by this function.
 */
function TypeKnowledgeBase(options = {}) {
  const defaults = {
    title: "Knowledge Base",
    description: "Knowledge base ID or instance"
  };
  const fixed = {
    format: "knowledge-base",
    anyOf: [
      { type: "string", title: "Knowledge Base ID" },
      { title: "Knowledge Base Instance", additionalProperties: true }
    ]
  };
  return Object.assign(defaults, options, fixed);
}
|
|
805
|
+
// src/document/DocumentNode.ts
|
|
806
|
+
/**
 * Rough token estimate: one token per four UTF-16 code units of `text`,
 * rounded up.
 */
function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
|
|
809
|
+
/**
 * Tells whether a node's kind is one of the container kinds (document,
 * section or topic). Only `node.kind` is inspected, not `node.children`.
 */
function hasChildren(node) {
  const containerKinds = [NodeKind.DOCUMENT, NodeKind.SECTION, NodeKind.TOPIC];
  return containerKinds.includes(node.kind);
}
|
|
812
|
+
/**
 * Returns `node.children` for container kinds, otherwise a fresh empty array.
 */
function getChildren(node) {
  return hasChildren(node) ? node.children : [];
}
|
|
818
|
+
/**
 * Generator yielding `node` and then, for container kinds, every descendant
 * in depth-first pre-order.
 */
function* traverseDepthFirst(node) {
  yield node;
  if (!hasChildren(node)) {
    return;
  }
  for (const descendant of node.children) {
    yield* traverseDepthFirst(descendant);
  }
}
|
|
826
|
+
/**
 * Finds the list of node ids leading from `root` down to the node whose id
 * is `targetNodeId` (both ends included), searching depth-first.
 *
 * @returns The id path, or `undefined` when no such node exists.
 */
function getNodePath(root, targetNodeId) {
  const trail = [];
  const descend = (node) => {
    trail.push(node.nodeId);
    if (node.nodeId === targetNodeId) {
      return true;
    }
    if (hasChildren(node)) {
      for (const child of node.children) {
        if (descend(child)) {
          return true;
        }
      }
    }
    // Not on the path to the target: undo this step.
    trail.pop();
    return false;
  };
  if (!descend(root)) {
    return undefined;
  }
  return trail;
}
|
|
845
|
+
/**
 * Follows `nodePath` (a list of node ids starting with the root's id) from
 * `root` and returns the `range` of the node it ends at.
 *
 * The first path entry is not checked against `root`; traversal starts at
 * the second entry.
 *
 * @param root - Root node of the tree.
 * @param nodePath - Node ids from root to the target node.
 * @returns The target node's `range`.
 * @throws {Error} When a path entry is not among the current node's
 *   children, including when the current node has no `children` array.
 */
function getDocumentRange(root, nodePath) {
  let currentNode = root;
  for (const targetId of nodePath.slice(1)) {
    // Leaf nodes carry no `children`; treat that as "no match" so the caller
    // gets the documented Error instead of a TypeError.
    const children = Array.isArray(currentNode.children) ? currentNode.children : [];
    const next = children.find((child) => child.nodeId === targetId);
    if (!next) {
      throw new Error(`Node with id ${targetId} not found in path`);
    }
    currentNode = next;
  }
  return currentNode.range;
}
|
|
864
|
+
// src/document/StructuralParser.ts
|
|
865
|
+
import { uuid4 } from "@workglow/util";
|
|
866
|
+
/**
 * Turns raw document text into a tree of nodes (document root, sections and
 * paragraphs), each carrying a `range` of character offsets into the input.
 *
 * All parse methods return a promise for the root node. The `doc_id`
 * argument is accepted by every method but is not used by this class.
 */
class StructuralParser {
  /**
   * Parses Markdown by ATX headers (`#` … `######`).
   *
   * Each header opens a section nested under the nearest open section with a
   * smaller level; all non-header lines between two headers are collected
   * into one paragraph whose text is trimmed. Ranges cover whole lines
   * (untrimmed), and a section's range runs to the start of the header that
   * closes it, or to the end of the text.
   *
   * @param {*} doc_id Unused.
   * @param {string} text Markdown source.
   * @param {string} title Title stored on the root node (also used as its text).
   * @returns {Promise<object>} Root node of kind `NodeKind.DOCUMENT`.
   */
  static async parseMarkdown(doc_id, text, title) {
    // The optional trailing "\r" lets headers match in CRLF documents, where
    // splitting on "\n" leaves a carriage return at the end of every line.
    const headerPattern = /^(#{1,6})\s+(.*)\r?$/;
    const lines = text.split("\n");
    const root = {
      nodeId: uuid4(),
      kind: NodeKind.DOCUMENT,
      range: { startOffset: 0, endOffset: text.length },
      text: title,
      title,
      children: []
    };
    const parentStack = [root];
    let currentOffset = 0;
    let textBuffer = [];
    let textBufferStartOffset = 0;

    // Emits the buffered non-header lines as one paragraph under the
    // innermost open container, then clears the buffer.
    const flushTextBuffer = () => {
      if (textBuffer.length === 0) {
        return;
      }
      const content = textBuffer.join("\n").trim();
      textBuffer = [];
      if (!content) {
        return;
      }
      parentStack[parentStack.length - 1].children.push({
        nodeId: uuid4(),
        kind: NodeKind.PARAGRAPH,
        range: {
          startOffset: textBufferStartOffset,
          // After the last line the running offset has counted a newline
          // that does not exist, so never report past the end of the text.
          endOffset: Math.min(currentOffset, text.length)
        },
        text: content
      });
    };

    // Pops the innermost section and replaces it in its parent (always the
    // parent's last child) with a copy whose range ends at `endOffset`.
    const closeTopSection = (endOffset) => {
      const closed = parentStack.pop();
      const parent = parentStack[parentStack.length - 1];
      parent.children[parent.children.length - 1] = {
        ...closed,
        range: { ...closed.range, endOffset }
      };
    };

    for (const line of lines) {
      const headerMatch = line.match(headerPattern);
      if (headerMatch) {
        flushTextBuffer();
        const level = headerMatch[1].length;
        const headerTitle = headerMatch[2];
        // Close every open section at the same or a deeper level.
        while (
          parentStack.length > 1 &&
          parentStack[parentStack.length - 1].kind === NodeKind.SECTION &&
          parentStack[parentStack.length - 1].level >= level
        ) {
          closeTopSection(currentOffset);
        }
        const section = {
          nodeId: uuid4(),
          kind: NodeKind.SECTION,
          level,
          title: headerTitle,
          range: {
            startOffset: currentOffset,
            // Provisional; narrowed when the section is closed.
            endOffset: text.length
          },
          text: headerTitle,
          children: []
        };
        parentStack[parentStack.length - 1].children.push(section);
        parentStack.push(section);
      } else {
        if (textBuffer.length === 0) {
          textBufferStartOffset = currentOffset;
        }
        textBuffer.push(line);
      }
      // +1 accounts for the "\n" removed by split.
      currentOffset += line.length + 1;
    }

    flushTextBuffer();
    while (parentStack.length > 1) {
      closeTopSection(text.length);
    }
    return root;
  }

  /**
   * Parses plain text into a flat list of paragraphs separated by blank
   * lines (a newline, optional whitespace, and another newline).
   *
   * Paragraph text is trimmed and its range covers exactly the trimmed text.
   * Whitespace-only segments produce no node.
   *
   * @param {*} doc_id Unused.
   * @param {string} text Plain-text source.
   * @param {string} title Title stored on the root node (also used as its text).
   * @returns {Promise<object>} Root node of kind `NodeKind.DOCUMENT`.
   */
  static async parsePlainText(doc_id, text, title) {
    const root = {
      nodeId: uuid4(),
      kind: NodeKind.DOCUMENT,
      range: { startOffset: 0, endOffset: text.length },
      text: title,
      title,
      children: []
    };

    // Adds a paragraph for text[sliceStart, sliceEnd) unless it is blank.
    const appendParagraph = (sliceStart, sliceEnd) => {
      const rawParagraph = text.slice(sliceStart, sliceEnd);
      const paragraphText = rawParagraph.trim();
      if (paragraphText.length === 0) {
        return;
      }
      const startOffset = sliceStart + rawParagraph.indexOf(paragraphText);
      root.children.push({
        nodeId: uuid4(),
        kind: NodeKind.PARAGRAPH,
        range: {
          startOffset,
          endOffset: startOffset + paragraphText.length
        },
        text: paragraphText
      });
    };

    const paragraphBreak = /\n\s*\n/g;
    let lastIndex = 0;
    let match;
    while ((match = paragraphBreak.exec(text)) !== null) {
      appendParagraph(lastIndex, match.index);
      lastIndex = paragraphBreak.lastIndex;
    }
    // Whatever follows the final separator (or the whole text if none).
    appendParagraph(lastIndex, text.length);
    return root;
  }

  /**
   * Dispatches to the Markdown or plain-text parser.
   *
   * Markdown is chosen when `format` is `"markdown"`, or when `format` is
   * falsy and the text looks like Markdown; otherwise plain text is used.
   *
   * @param {*} doc_id Passed through to the chosen parser.
   * @param {string} text Source text.
   * @param {string} title Title for the root node.
   * @param {string} [format] Explicit format selector.
   * @returns {Promise<object>} Root node from the chosen parser.
   */
  static parse(doc_id, text, title, format) {
    const useMarkdown = format === "markdown" || (!format && this.looksLikeMarkdown(text));
    return useMarkdown
      ? this.parseMarkdown(doc_id, text, title)
      : this.parsePlainText(doc_id, text, title);
  }

  /**
   * Heuristic: does any line start with one to six `#` followed by whitespace?
   *
   * @param {string} text Text to inspect.
   * @returns {boolean} `true` when such a line exists.
   */
  static looksLikeMarkdown(text) {
    return /^#{1,6}\s/m.test(text);
  }
}
|
|
1025
|
+
export {
|
|
1026
|
+
traverseDepthFirst,
|
|
1027
|
+
setGlobalKnowledgeBaseRepository,
|
|
1028
|
+
registerKnowledgeBase,
|
|
1029
|
+
knowledgeBaseTableNames,
|
|
1030
|
+
hasChildren,
|
|
1031
|
+
getNodePath,
|
|
1032
|
+
getKnowledgeBase,
|
|
1033
|
+
getGlobalKnowledgeBases,
|
|
1034
|
+
getGlobalKnowledgeBaseRepository,
|
|
1035
|
+
getDocumentRange,
|
|
1036
|
+
getChildren,
|
|
1037
|
+
estimateTokens,
|
|
1038
|
+
createKnowledgeBase,
|
|
1039
|
+
TypeTabularStorage,
|
|
1040
|
+
TypeKnowledgeBase,
|
|
1041
|
+
TopicNodeSchema,
|
|
1042
|
+
TokenBudgetSchema,
|
|
1043
|
+
StructuralParser,
|
|
1044
|
+
SentenceNodeSchema,
|
|
1045
|
+
SectionNodeSchema,
|
|
1046
|
+
ParagraphNodeSchema,
|
|
1047
|
+
NodeRangeSchema,
|
|
1048
|
+
NodeKind,
|
|
1049
|
+
NodeEnrichmentSchema,
|
|
1050
|
+
KnowledgeBaseRepository,
|
|
1051
|
+
KnowledgeBaseRecordSchema,
|
|
1052
|
+
KnowledgeBasePrimaryKeyNames,
|
|
1053
|
+
KnowledgeBase,
|
|
1054
|
+
KNOWLEDGE_BASE_REPOSITORY,
|
|
1055
|
+
KNOWLEDGE_BASES,
|
|
1056
|
+
InMemoryKnowledgeBaseRepository,
|
|
1057
|
+
EntitySchema,
|
|
1058
|
+
DocumentStorageSchema,
|
|
1059
|
+
DocumentStorageKey,
|
|
1060
|
+
DocumentRootNodeSchema,
|
|
1061
|
+
DocumentNodeSchema,
|
|
1062
|
+
DocumentNodeBaseSchema,
|
|
1063
|
+
DocumentMetadataSchema,
|
|
1064
|
+
Document,
|
|
1065
|
+
ChunkVectorStorageSchema,
|
|
1066
|
+
ChunkVectorPrimaryKey,
|
|
1067
|
+
ChunkRecordSchema,
|
|
1068
|
+
ChunkRecordArraySchema
|
|
1069
|
+
};
|
|
1070
|
+
|
|
1071
|
+
//# debugId=5A4C1CAE3E58975C64756E2164756E21
|