@workglow/knowledge-base 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +166 -0
- package/dist/browser.js +330 -25
- package/dist/browser.js.map +15 -12
- package/dist/bun.js +330 -25
- package/dist/bun.js.map +15 -12
- package/dist/chunk/ChunkSchema.d.ts.map +1 -1
- package/dist/chunk/ChunkVectorStorageSchema.d.ts +1 -1
- package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -1
- package/dist/common.d.ts +3 -0
- package/dist/common.d.ts.map +1 -1
- package/dist/document/Document.d.ts.map +1 -1
- package/dist/document/DocumentNode.d.ts +1 -1
- package/dist/document/DocumentNode.d.ts.map +1 -1
- package/dist/document/DocumentSchema.d.ts.map +1 -1
- package/dist/document/DocumentStorageSchema.d.ts +2 -1
- package/dist/document/DocumentStorageSchema.d.ts.map +1 -1
- package/dist/document/StructuralParser.d.ts +1 -1
- package/dist/document/StructuralParser.d.ts.map +1 -1
- package/dist/knowledge-base/KnowledgeBase.d.ts +2 -0
- package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -1
- package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +17 -1
- package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -1
- package/dist/knowledge-base/KnowledgeBaseRepository.d.ts +5 -3
- package/dist/knowledge-base/KnowledgeBaseRepository.d.ts.map +1 -1
- package/dist/knowledge-base/KnowledgeBaseSchema.d.ts +4 -0
- package/dist/knowledge-base/KnowledgeBaseSchema.d.ts.map +1 -1
- package/dist/knowledge-base/ScopedTabularStorage.d.ts +46 -0
- package/dist/knowledge-base/ScopedTabularStorage.d.ts.map +1 -0
- package/dist/knowledge-base/ScopedVectorStorage.d.ts +27 -0
- package/dist/knowledge-base/ScopedVectorStorage.d.ts.map +1 -0
- package/dist/knowledge-base/SharedTableSchemas.d.ts +93 -0
- package/dist/knowledge-base/SharedTableSchemas.d.ts.map +1 -0
- package/dist/node.js +330 -25
- package/dist/node.js.map +15 -12
- package/dist/util/DatasetSchema.d.ts.map +1 -1
- package/package.json +10 -5
package/README.md
CHANGED
|
@@ -21,6 +21,13 @@ Document management, hierarchical chunking, and knowledge base infrastructure fo
|
|
|
21
21
|
- [Tree Traversal](#tree-traversal)
|
|
22
22
|
- [Lifecycle Management](#lifecycle-management)
|
|
23
23
|
- [Registry](#registry)
|
|
24
|
+
- [Shared-Table Mode](#shared-table-mode)
|
|
25
|
+
- [Overview](#overview-1)
|
|
26
|
+
- [Setting Up Shared Storage](#setting-up-shared-storage)
|
|
27
|
+
- [Scoped Wrappers](#scoped-wrappers)
|
|
28
|
+
- [Registering with Shared Tables](#registering-with-shared-tables)
|
|
29
|
+
- [Schemas and Indexes](#schemas-and-indexes)
|
|
30
|
+
- [When to Use Shared Tables](#when-to-use-shared-tables)
|
|
24
31
|
- [Data Flow](#data-flow)
|
|
25
32
|
- [Ingestion Pipeline](#ingestion-pipeline)
|
|
26
33
|
- [Retrieval Pipeline](#retrieval-pipeline)
|
|
@@ -28,6 +35,8 @@ Document management, hierarchical chunking, and knowledge base infrastructure fo
|
|
|
28
35
|
- [Document](#document)
|
|
29
36
|
- [KnowledgeBase](#knowledgebase-1)
|
|
30
37
|
- [createKnowledgeBase](#createknowledgebase)
|
|
38
|
+
- [ScopedTabularStorage](#scopedtabularstorage)
|
|
39
|
+
- [ScopedVectorStorage](#scopedvectorstorage)
|
|
31
40
|
- [StructuralParser](#structuralparser)
|
|
32
41
|
- [Type Helpers](#type-helpers)
|
|
33
42
|
- [License](#license)
|
|
@@ -442,6 +451,134 @@ await task.run({ knowledgeBase: kb }); // Direct instance
|
|
|
442
451
|
await task.run({ knowledgeBase: "my-kb" }); // Resolved from registry
|
|
443
452
|
```
|
|
444
453
|
|
|
454
|
+
## Shared-Table Mode
|
|
455
|
+
|
|
456
|
+
### Overview
|
|
457
|
+
|
|
458
|
+
By default, each `KnowledgeBase` gets its own document table and chunk table. **Shared-table mode** lets multiple knowledge bases share the same underlying storage tables, partitioned by a `kb_id` column. This is useful when you have many knowledge bases and want to reduce table proliferation in your database.
|
|
459
|
+
|
|
460
|
+
```
|
|
461
|
+
Default mode (per-KB tables): Shared-table mode:
|
|
462
|
+
┌──────────────────────┐ ┌──────────────────────────┐
|
|
463
|
+
│ kb_docs_my_kb │ │ shared_documents │
|
|
464
|
+
│ (doc_id, data) │ │ (doc_id, kb_id, data) │
|
|
465
|
+
├──────────────────────┤ │ ├─ kb_id = "kb-1" rows │
|
|
466
|
+
│ kb_chunks_my_kb │ │ └─ kb_id = "kb-2" rows │
|
|
467
|
+
│ (chunk_id, vector..) │ ├──────────────────────────┤
|
|
468
|
+
├──────────────────────┤ │ shared_chunks │
|
|
469
|
+
│ kb_docs_other_kb │ │ (chunk_id, kb_id, vec..) │
|
|
470
|
+
│ (doc_id, data) │ │ ├─ kb_id = "kb-1" rows │
|
|
471
|
+
├──────────────────────┤ │ └─ kb_id = "kb-2" rows │
|
|
472
|
+
│ kb_chunks_other_kb │ └──────────────────────────┘
|
|
473
|
+
│ (chunk_id, vector..) │
|
|
474
|
+
└──────────────────────┘
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
The `KnowledgeBase` class itself is unchanged — shared-table mode is implemented via thin wrapper classes (`ScopedTabularStorage`, `ScopedVectorStorage`) that inject `kb_id` on writes and filter by `kb_id` on reads.
|
|
478
|
+
|
|
479
|
+
### Setting Up Shared Storage
|
|
480
|
+
|
|
481
|
+
Create the shared storage instances once, globally:
|
|
482
|
+
|
|
483
|
+
```typescript
|
|
484
|
+
import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
|
|
485
|
+
import {
|
|
486
|
+
SharedDocumentStorageSchema,
|
|
487
|
+
SharedChunkVectorStorageSchema,
|
|
488
|
+
SharedDocumentIndexes,
|
|
489
|
+
SharedChunkIndexes,
|
|
490
|
+
SHARED_DOCUMENT_TABLE,
|
|
491
|
+
SHARED_CHUNK_TABLE,
|
|
492
|
+
DocumentStorageKey,
|
|
493
|
+
ChunkVectorPrimaryKey,
|
|
494
|
+
} from "@workglow/knowledge-base";
|
|
495
|
+
|
|
496
|
+
const sharedDocStorage = new InMemoryTabularStorage(
|
|
497
|
+
SharedDocumentStorageSchema,
|
|
498
|
+
DocumentStorageKey,
|
|
499
|
+
SharedDocumentIndexes
|
|
500
|
+
);
|
|
501
|
+
|
|
502
|
+
const sharedChunkStorage = new InMemoryVectorStorage(
|
|
503
|
+
SharedChunkVectorStorageSchema,
|
|
504
|
+
ChunkVectorPrimaryKey,
|
|
505
|
+
SharedChunkIndexes,
|
|
506
|
+
1024 // vector dimensions
|
|
507
|
+
);
|
|
508
|
+
```
|
|
509
|
+
|
|
510
|
+
For SQL backends (SQLite, PostgreSQL), replace `InMemoryTabularStorage` / `InMemoryVectorStorage` with the corresponding storage implementations. The exported index arrays passed to the constructors keep scoped queries efficient: `SharedDocumentIndexes` indexes `kb_id`, and `SharedChunkIndexes` indexes `kb_id` and `[kb_id, doc_id]`.
|
|
511
|
+
|
|
512
|
+
### Scoped Wrappers
|
|
513
|
+
|
|
514
|
+
For each knowledge base, create scoped wrappers that filter to that KB's data:
|
|
515
|
+
|
|
516
|
+
```typescript
|
|
517
|
+
import {
|
|
518
|
+
ScopedTabularStorage,
|
|
519
|
+
ScopedVectorStorage,
|
|
520
|
+
KnowledgeBase,
|
|
521
|
+
} from "@workglow/knowledge-base";
|
|
522
|
+
|
|
523
|
+
// KB 1
|
|
524
|
+
const scopedDocs1 = new ScopedTabularStorage(sharedDocStorage, "kb-1");
|
|
525
|
+
const scopedChunks1 = new ScopedVectorStorage(sharedChunkStorage, "kb-1");
|
|
526
|
+
const kb1 = new KnowledgeBase("kb-1", scopedDocs1, scopedChunks1);
|
|
527
|
+
|
|
528
|
+
// KB 2
|
|
529
|
+
const scopedDocs2 = new ScopedTabularStorage(sharedDocStorage, "kb-2");
|
|
530
|
+
const scopedChunks2 = new ScopedVectorStorage(sharedChunkStorage, "kb-2");
|
|
531
|
+
const kb2 = new KnowledgeBase("kb-2", scopedDocs2, scopedChunks2);
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
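Each `KnowledgeBase` instance exposes the same API as in default mode — all CRUD, search, and lifecycle operations are transparently scoped to the KB's data. Two differences are worth knowing: vector searches over-fetch from the shared table and are then post-filtered by `kb_id`, so a search may return fewer than `topK` results when other knowledge bases dominate the nearest matches; and `destroy()` on a scoped wrapper is a no-op, because the shared storage's lifecycle is managed externally.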
|
|
535
|
+
|
|
536
|
+
### Registering with Shared Tables
|
|
537
|
+
|
|
538
|
+
Pass `{ sharedTables: true }` when registering so that the metadata record uses the shared table names:
|
|
539
|
+
|
|
540
|
+
```typescript
|
|
541
|
+
import { registerKnowledgeBase } from "@workglow/knowledge-base";
|
|
542
|
+
|
|
543
|
+
await registerKnowledgeBase("kb-1", kb1, { sharedTables: true });
|
|
544
|
+
await registerKnowledgeBase("kb-2", kb2, { sharedTables: true });
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
You can check whether a persisted record uses shared tables with the `isSharedTableMode` helper:
|
|
548
|
+
|
|
549
|
+
```typescript
|
|
550
|
+
import { isSharedTableMode } from "@workglow/knowledge-base";
|
|
551
|
+
|
|
552
|
+
const record = await repo.getKnowledgeBase("kb-1");
|
|
553
|
+
if (isSharedTableMode(record)) {
|
|
554
|
+
// reconstruct using scoped wrappers
|
|
555
|
+
}
|
|
556
|
+
```
|
|
557
|
+
|
|
558
|
+
### Schemas and Indexes
|
|
559
|
+
|
|
560
|
+
The shared schemas augment the standard schemas with a `kb_id` column:
|
|
561
|
+
|
|
562
|
+
| Schema | Base Schema | Added Column |
|
|
563
|
+
| ------------------------------- | -------------------------- | ------------ |
|
|
564
|
+
| `SharedDocumentStorageSchema` | `DocumentStorageSchema` | `kb_id: string` |
|
|
565
|
+
| `SharedChunkVectorStorageSchema`| `ChunkVectorStorageSchema` | `kb_id: string` |
|
|
566
|
+
|
|
567
|
+
Default shared table names: `SHARED_DOCUMENT_TABLE = "shared_documents"`, `SHARED_CHUNK_TABLE = "shared_chunks"`.
|
|
568
|
+
|
|
569
|
+
Pre-defined index arrays for efficient queries:
|
|
570
|
+
- `SharedDocumentIndexes` — `[["kb_id"]]`
|
|
571
|
+
- `SharedChunkIndexes` — `[["kb_id"], ["kb_id", "doc_id"]]`
|
|
572
|
+
|
|
573
|
+
### When to Use Shared Tables
|
|
574
|
+
|
|
575
|
+
| Scenario | Recommendation |
|
|
576
|
+
| --- | --- |
|
|
577
|
+
| Few knowledge bases, each large | Default (per-KB tables) — simpler, no `kb_id` overhead |
|
|
578
|
+
| Many knowledge bases (e.g., per-user, per-tenant) | Shared tables — avoids table proliferation |
|
|
579
|
+
| Need cross-KB queries | Shared tables — query the shared storage directly |
|
|
580
|
+
| Using managed databases with table limits | Shared tables |
|
|
581
|
+
|
|
445
582
|
## Data Flow
|
|
446
583
|
|
|
447
584
|
### Ingestion Pipeline
|
|
@@ -640,6 +777,35 @@ interface CreateKnowledgeBaseOptions {
|
|
|
640
777
|
}
|
|
641
778
|
```
|
|
642
779
|
|
|
780
|
+
### ScopedTabularStorage
|
|
781
|
+
|
|
782
|
+
```typescript
|
|
783
|
+
class ScopedTabularStorage<Schema, PrimaryKeyNames, Entity, PrimaryKey, InsertType>
|
|
784
|
+
implements ITabularStorage<Schema, PrimaryKeyNames, Entity, PrimaryKey, InsertType>
|
|
785
|
+
{
|
|
786
|
+
constructor(inner: AnyTabularStorage, kbId: string);
|
|
787
|
+
|
|
788
|
+
// All ITabularStorage methods are implemented.
|
|
789
|
+
// Writes inject kb_id, reads filter by kb_id, results strip kb_id.
|
|
790
|
+
// setupDatabase() and destroy() are no-ops (shared storage lifecycle is external).
|
|
791
|
+
}
|
|
792
|
+
```
|
|
793
|
+
|
|
794
|
+
### ScopedVectorStorage
|
|
795
|
+
|
|
796
|
+
```typescript
|
|
797
|
+
class ScopedVectorStorage<Metadata, Schema, Entity, PrimaryKeyNames>
|
|
798
|
+
extends ScopedTabularStorage<Schema, PrimaryKeyNames, Entity>
|
|
799
|
+
implements IVectorStorage<Metadata, Schema, Entity, PrimaryKeyNames>
|
|
800
|
+
{
|
|
801
|
+
constructor(inner: AnyVectorStorage, kbId: string, overFetchMultiplier?: number); // overFetchMultiplier defaults to 3
|
|
802
|
+
|
|
803
|
+
getVectorDimensions(): number; // Delegates to inner
|
|
804
|
+
similaritySearch(query, options?): Promise<(Entity & { score })[]>; // Over-fetches topK × overFetchMultiplier, then post-filters by kb_id
|
|
805
|
+
hybridSearch?(query, options): Promise<(Entity & { score })[]>; // Over-fetches topK × overFetchMultiplier, then post-filters by kb_id
|
|
806
|
+
}
|
|
807
|
+
```
|
|
808
|
+
|
|
643
809
|
### StructuralParser
|
|
644
810
|
|
|
645
811
|
```typescript
|
package/dist/browser.js
CHANGED
|
@@ -342,9 +342,7 @@ var ChunkRecordArraySchema = {
|
|
|
342
342
|
description: "Array of chunk records"
|
|
343
343
|
};
|
|
344
344
|
// src/chunk/ChunkVectorStorageSchema.ts
|
|
345
|
-
import {
|
|
346
|
-
TypedArraySchema
|
|
347
|
-
} from "@workglow/util/schema";
|
|
345
|
+
import { TypedArraySchema } from "@workglow/util/schema";
|
|
348
346
|
var ChunkVectorStorageSchema = {
|
|
349
347
|
type: "object",
|
|
350
348
|
properties: {
|
|
@@ -563,6 +561,12 @@ class KnowledgeBase {
|
|
|
563
561
|
this.tabularStorage.destroy();
|
|
564
562
|
this.chunkStorage.destroy();
|
|
565
563
|
}
|
|
564
|
+
async[Symbol.asyncDispose]() {
|
|
565
|
+
this.destroy();
|
|
566
|
+
}
|
|
567
|
+
[Symbol.dispose]() {
|
|
568
|
+
this.destroy();
|
|
569
|
+
}
|
|
566
570
|
async getChunk(chunk_id) {
|
|
567
571
|
return this.chunkStorage.get({ chunk_id });
|
|
568
572
|
}
|
|
@@ -599,6 +603,55 @@ class KnowledgeBase {
|
|
|
599
603
|
return doc.findChunksByNodeId(nodeId);
|
|
600
604
|
}
|
|
601
605
|
}
|
|
606
|
+
// src/knowledge-base/SharedTableSchemas.ts
|
|
607
|
+
import { TypedArraySchema as TypedArraySchema2 } from "@workglow/util/schema";
|
|
608
|
+
var SHARED_DOCUMENT_TABLE = "shared_documents";
|
|
609
|
+
var SHARED_CHUNK_TABLE = "shared_chunks";
|
|
610
|
+
var SharedDocumentStorageSchema = {
|
|
611
|
+
type: "object",
|
|
612
|
+
properties: {
|
|
613
|
+
doc_id: {
|
|
614
|
+
type: "string",
|
|
615
|
+
"x-auto-generated": true,
|
|
616
|
+
title: "Document ID",
|
|
617
|
+
description: "Unique identifier for the document"
|
|
618
|
+
},
|
|
619
|
+
kb_id: {
|
|
620
|
+
type: "string",
|
|
621
|
+
title: "Knowledge Base ID",
|
|
622
|
+
description: "Owning knowledge base identifier"
|
|
623
|
+
},
|
|
624
|
+
data: {
|
|
625
|
+
type: "string",
|
|
626
|
+
title: "Document Data",
|
|
627
|
+
description: "JSON-serialized document"
|
|
628
|
+
},
|
|
629
|
+
metadata: {
|
|
630
|
+
type: "object",
|
|
631
|
+
title: "Metadata",
|
|
632
|
+
description: "Metadata of the document"
|
|
633
|
+
}
|
|
634
|
+
},
|
|
635
|
+
required: ["doc_id", "kb_id", "data"],
|
|
636
|
+
additionalProperties: true
|
|
637
|
+
};
|
|
638
|
+
var SharedChunkVectorStorageSchema = {
|
|
639
|
+
type: "object",
|
|
640
|
+
properties: {
|
|
641
|
+
chunk_id: { type: "string", "x-auto-generated": true },
|
|
642
|
+
kb_id: { type: "string" },
|
|
643
|
+
doc_id: { type: "string" },
|
|
644
|
+
vector: TypedArraySchema2(),
|
|
645
|
+
metadata: { type: "object", format: "metadata", additionalProperties: true }
|
|
646
|
+
},
|
|
647
|
+
required: ["chunk_id", "kb_id", "doc_id", "vector", "metadata"],
|
|
648
|
+
additionalProperties: false
|
|
649
|
+
};
|
|
650
|
+
var SharedDocumentPrimaryKey = ["kb_id", "doc_id"];
|
|
651
|
+
var SharedChunkPrimaryKey = ["kb_id", "chunk_id"];
|
|
652
|
+
var SharedDocumentIndexes = [["kb_id"]];
|
|
653
|
+
var SharedChunkIndexes = [["kb_id"], ["kb_id", "doc_id"]];
|
|
654
|
+
|
|
602
655
|
// src/knowledge-base/KnowledgeBaseSchema.ts
|
|
603
656
|
var KnowledgeBaseRecordSchema = {
|
|
604
657
|
type: "object",
|
|
@@ -632,6 +685,9 @@ function knowledgeBaseTableNames(kbId) {
|
|
|
632
685
|
chunkTable: `kb_chunks_${safe}`
|
|
633
686
|
};
|
|
634
687
|
}
|
|
688
|
+
function isSharedTableMode(record) {
|
|
689
|
+
return record.document_table === SHARED_DOCUMENT_TABLE && record.chunk_table === SHARED_CHUNK_TABLE;
|
|
690
|
+
}
|
|
635
691
|
// src/knowledge-base/KnowledgeBaseRepository.ts
|
|
636
692
|
import { EventEmitter } from "@workglow/util";
|
|
637
693
|
|
|
@@ -696,16 +752,13 @@ class InMemoryKnowledgeBaseRepository extends KnowledgeBaseRepository {
|
|
|
696
752
|
import {
|
|
697
753
|
createServiceToken,
|
|
698
754
|
globalServiceRegistry,
|
|
755
|
+
registerInputCompactor,
|
|
699
756
|
registerInputResolver
|
|
700
757
|
} from "@workglow/util";
|
|
701
758
|
var KNOWLEDGE_BASES = createServiceToken("knowledge-base.registry");
|
|
702
759
|
var KNOWLEDGE_BASE_REPOSITORY = createServiceToken("knowledge-base.repository");
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
}
|
|
706
|
-
if (!globalServiceRegistry.has(KNOWLEDGE_BASE_REPOSITORY)) {
|
|
707
|
-
globalServiceRegistry.register(KNOWLEDGE_BASE_REPOSITORY, () => new InMemoryKnowledgeBaseRepository, true);
|
|
708
|
-
}
|
|
760
|
+
globalServiceRegistry.registerIfAbsent(KNOWLEDGE_BASES, () => new Map, true);
|
|
761
|
+
globalServiceRegistry.registerIfAbsent(KNOWLEDGE_BASE_REPOSITORY, () => new InMemoryKnowledgeBaseRepository, true);
|
|
709
762
|
function getGlobalKnowledgeBases() {
|
|
710
763
|
return globalServiceRegistry.get(KNOWLEDGE_BASES);
|
|
711
764
|
}
|
|
@@ -715,23 +768,53 @@ function getGlobalKnowledgeBaseRepository() {
|
|
|
715
768
|
function setGlobalKnowledgeBaseRepository(repository) {
|
|
716
769
|
globalServiceRegistry.registerInstance(KNOWLEDGE_BASE_REPOSITORY, repository);
|
|
717
770
|
}
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
const
|
|
721
|
-
const
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
document_table: tableNames.documentTable,
|
|
728
|
-
chunk_table: tableNames.chunkTable,
|
|
729
|
-
created_at: now,
|
|
730
|
-
updated_at: now
|
|
771
|
+
var pendingOps = new Map;
|
|
772
|
+
function withIdLock(id, fn) {
|
|
773
|
+
const prev = pendingOps.get(id) ?? Promise.resolve();
|
|
774
|
+
const next = prev.then(fn, fn);
|
|
775
|
+
pendingOps.set(id, next);
|
|
776
|
+
const cleanup = () => {
|
|
777
|
+
if (pendingOps.get(id) === next) {
|
|
778
|
+
pendingOps.delete(id);
|
|
779
|
+
}
|
|
731
780
|
};
|
|
781
|
+
next.finally(cleanup);
|
|
782
|
+
return next;
|
|
783
|
+
}
|
|
784
|
+
function registerKnowledgeBase(id, kb, options) {
|
|
785
|
+
return withIdLock(id, async () => {
|
|
786
|
+
const kbs = getGlobalKnowledgeBases();
|
|
787
|
+
const now = new Date().toISOString();
|
|
788
|
+
const useShared = options?.sharedTables === true;
|
|
789
|
+
const tableNames = useShared ? { documentTable: SHARED_DOCUMENT_TABLE, chunkTable: SHARED_CHUNK_TABLE } : knowledgeBaseTableNames(id);
|
|
790
|
+
const record = {
|
|
791
|
+
kb_id: id,
|
|
792
|
+
title: kb.title,
|
|
793
|
+
description: kb.description,
|
|
794
|
+
vector_dimensions: kb.getVectorDimensions(),
|
|
795
|
+
document_table: tableNames.documentTable,
|
|
796
|
+
chunk_table: tableNames.chunkTable,
|
|
797
|
+
created_at: now,
|
|
798
|
+
updated_at: now
|
|
799
|
+
};
|
|
800
|
+
const repo = getGlobalKnowledgeBaseRepository();
|
|
801
|
+
await repo.addKnowledgeBase(record);
|
|
802
|
+
kbs.set(id, kb);
|
|
803
|
+
});
|
|
804
|
+
}
|
|
805
|
+
function unregisterKnowledgeBase(id) {
|
|
806
|
+
return withIdLock(id, async () => {
|
|
807
|
+
const repo = getGlobalKnowledgeBaseRepository();
|
|
808
|
+
await repo.removeKnowledgeBase(id);
|
|
809
|
+
const kbs = getGlobalKnowledgeBases();
|
|
810
|
+
kbs.delete(id);
|
|
811
|
+
});
|
|
812
|
+
}
|
|
813
|
+
async function deregisterKnowledgeBase(id) {
|
|
732
814
|
const repo = getGlobalKnowledgeBaseRepository();
|
|
733
|
-
await repo.
|
|
734
|
-
kbs
|
|
815
|
+
await repo.removeKnowledgeBase(id);
|
|
816
|
+
const kbs = getGlobalKnowledgeBases();
|
|
817
|
+
kbs.delete(id);
|
|
735
818
|
}
|
|
736
819
|
function getKnowledgeBase(id) {
|
|
737
820
|
return getGlobalKnowledgeBases().get(id);
|
|
@@ -745,6 +828,14 @@ async function resolveKnowledgeBaseFromRegistry(id, format, registry) {
|
|
|
745
828
|
return kb;
|
|
746
829
|
}
|
|
747
830
|
registerInputResolver("knowledge-base", resolveKnowledgeBaseFromRegistry);
|
|
831
|
+
registerInputCompactor("knowledge-base", (value, _format, registry) => {
|
|
832
|
+
const kbs = registry.has(KNOWLEDGE_BASES) ? registry.get(KNOWLEDGE_BASES) : getGlobalKnowledgeBases();
|
|
833
|
+
for (const [id, kb] of kbs) {
|
|
834
|
+
if (kb === value)
|
|
835
|
+
return id;
|
|
836
|
+
}
|
|
837
|
+
return;
|
|
838
|
+
});
|
|
748
839
|
// src/knowledge-base/createKnowledgeBase.ts
|
|
749
840
|
import { InMemoryTabularStorage as InMemoryTabularStorage2, InMemoryVectorStorage } from "@workglow/storage";
|
|
750
841
|
|
|
@@ -800,6 +891,207 @@ async function createKnowledgeBase(options) {
|
|
|
800
891
|
}
|
|
801
892
|
return kb;
|
|
802
893
|
}
|
|
894
|
+
// src/knowledge-base/ScopedTabularStorage.ts
|
|
895
|
+
import { EventEmitter as EventEmitter2 } from "@workglow/util";
|
|
896
|
+
|
|
897
|
+
class ScopedTabularStorage {
|
|
898
|
+
inner;
|
|
899
|
+
kbId;
|
|
900
|
+
events = new EventEmitter2;
|
|
901
|
+
constructor(inner, kbId) {
|
|
902
|
+
this.inner = inner;
|
|
903
|
+
this.kbId = kbId;
|
|
904
|
+
}
|
|
905
|
+
inject(value) {
|
|
906
|
+
return { ...value, kb_id: this.kbId };
|
|
907
|
+
}
|
|
908
|
+
strip(entity) {
|
|
909
|
+
if (!entity)
|
|
910
|
+
return entity;
|
|
911
|
+
const { kb_id: _, ...rest } = entity;
|
|
912
|
+
return rest;
|
|
913
|
+
}
|
|
914
|
+
stripArray(entities) {
|
|
915
|
+
if (!entities)
|
|
916
|
+
return;
|
|
917
|
+
return entities.map((e) => this.strip(e));
|
|
918
|
+
}
|
|
919
|
+
async put(value) {
|
|
920
|
+
const result = await this.inner.put(this.inject(value));
|
|
921
|
+
const stripped = this.strip(result);
|
|
922
|
+
this.events.emit("put", stripped);
|
|
923
|
+
return stripped;
|
|
924
|
+
}
|
|
925
|
+
async putBulk(values) {
|
|
926
|
+
const injected = values.map((v) => this.inject(v));
|
|
927
|
+
const results = await this.inner.putBulk(injected);
|
|
928
|
+
const stripped = results.map((r) => this.strip(r));
|
|
929
|
+
for (const entity of stripped) {
|
|
930
|
+
this.events.emit("put", entity);
|
|
931
|
+
}
|
|
932
|
+
return stripped;
|
|
933
|
+
}
|
|
934
|
+
async get(key) {
|
|
935
|
+
const result = await this.inner.get({ ...key, kb_id: this.kbId });
|
|
936
|
+
if (!result)
|
|
937
|
+
return;
|
|
938
|
+
const stripped = this.strip(result);
|
|
939
|
+
this.events.emit("get", key, stripped);
|
|
940
|
+
return stripped;
|
|
941
|
+
}
|
|
942
|
+
async delete(key) {
|
|
943
|
+
await this.inner.deleteSearch({ ...key, kb_id: this.kbId });
|
|
944
|
+
this.events.emit("delete", key);
|
|
945
|
+
}
|
|
946
|
+
async getAll(options) {
|
|
947
|
+
const results = await this.inner.query({ kb_id: this.kbId }, options);
|
|
948
|
+
return this.stripArray(results);
|
|
949
|
+
}
|
|
950
|
+
async deleteAll() {
|
|
951
|
+
await this.inner.deleteSearch({ kb_id: this.kbId });
|
|
952
|
+
this.events.emit("clearall");
|
|
953
|
+
}
|
|
954
|
+
async size() {
|
|
955
|
+
let count = 0;
|
|
956
|
+
const pageSize = 1000;
|
|
957
|
+
let offset = 0;
|
|
958
|
+
while (true) {
|
|
959
|
+
const page = await this.inner.query({ kb_id: this.kbId }, { offset, limit: pageSize });
|
|
960
|
+
if (!page || page.length === 0)
|
|
961
|
+
break;
|
|
962
|
+
count += page.length;
|
|
963
|
+
if (page.length < pageSize)
|
|
964
|
+
break;
|
|
965
|
+
offset += pageSize;
|
|
966
|
+
}
|
|
967
|
+
return count;
|
|
968
|
+
}
|
|
969
|
+
async query(criteria, options) {
|
|
970
|
+
const results = await this.inner.query({ ...criteria, kb_id: this.kbId }, options);
|
|
971
|
+
const stripped = this.stripArray(results);
|
|
972
|
+
this.events.emit("query", criteria, stripped);
|
|
973
|
+
return stripped;
|
|
974
|
+
}
|
|
975
|
+
async deleteSearch(criteria) {
|
|
976
|
+
await this.inner.deleteSearch({ ...criteria, kb_id: this.kbId });
|
|
977
|
+
}
|
|
978
|
+
async getBulk(offset, limit) {
|
|
979
|
+
const results = await this.inner.query({ kb_id: this.kbId }, { offset, limit });
|
|
980
|
+
return this.stripArray(results);
|
|
981
|
+
}
|
|
982
|
+
async* records(pageSize = 100) {
|
|
983
|
+
if (pageSize <= 0) {
|
|
984
|
+
throw new RangeError(`pageSize must be greater than 0, got ${pageSize}`);
|
|
985
|
+
}
|
|
986
|
+
let offset = 0;
|
|
987
|
+
while (true) {
|
|
988
|
+
const page = await this.getBulk(offset, pageSize);
|
|
989
|
+
if (!page || page.length === 0) {
|
|
990
|
+
break;
|
|
991
|
+
}
|
|
992
|
+
for (const entity of page) {
|
|
993
|
+
yield entity;
|
|
994
|
+
}
|
|
995
|
+
if (page.length < pageSize)
|
|
996
|
+
break;
|
|
997
|
+
offset += pageSize;
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
async* pages(pageSize = 100) {
|
|
1001
|
+
if (pageSize <= 0) {
|
|
1002
|
+
throw new RangeError(`pageSize must be greater than 0, got ${pageSize}`);
|
|
1003
|
+
}
|
|
1004
|
+
let offset = 0;
|
|
1005
|
+
while (true) {
|
|
1006
|
+
const page = await this.getBulk(offset, pageSize);
|
|
1007
|
+
if (!page || page.length === 0) {
|
|
1008
|
+
break;
|
|
1009
|
+
}
|
|
1010
|
+
yield page;
|
|
1011
|
+
if (page.length < pageSize)
|
|
1012
|
+
break;
|
|
1013
|
+
offset += pageSize;
|
|
1014
|
+
}
|
|
1015
|
+
}
|
|
1016
|
+
on(name, fn) {
|
|
1017
|
+
this.events.on(name, fn);
|
|
1018
|
+
}
|
|
1019
|
+
off(name, fn) {
|
|
1020
|
+
this.events.off(name, fn);
|
|
1021
|
+
}
|
|
1022
|
+
emit(name, ...args) {
|
|
1023
|
+
this.events.emit(name, ...args);
|
|
1024
|
+
}
|
|
1025
|
+
once(name, fn) {
|
|
1026
|
+
this.events.once(name, fn);
|
|
1027
|
+
}
|
|
1028
|
+
waitOn(name) {
|
|
1029
|
+
return this.events.waitOn(name);
|
|
1030
|
+
}
|
|
1031
|
+
subscribeToChanges(callback, options) {
|
|
1032
|
+
return this.inner.subscribeToChanges((change) => {
|
|
1033
|
+
const newKbId = change.new?.kb_id;
|
|
1034
|
+
const oldKbId = change.old?.kb_id;
|
|
1035
|
+
if (newKbId !== undefined && newKbId !== this.kbId)
|
|
1036
|
+
return;
|
|
1037
|
+
if (oldKbId !== undefined && oldKbId !== this.kbId)
|
|
1038
|
+
return;
|
|
1039
|
+
if (newKbId === undefined && oldKbId === undefined)
|
|
1040
|
+
return;
|
|
1041
|
+
callback({
|
|
1042
|
+
type: change.type,
|
|
1043
|
+
...change.old ? { old: this.strip(change.old) } : {},
|
|
1044
|
+
...change.new ? { new: this.strip(change.new) } : {}
|
|
1045
|
+
});
|
|
1046
|
+
}, options);
|
|
1047
|
+
}
|
|
1048
|
+
async setupDatabase() {}
|
|
1049
|
+
destroy() {}
|
|
1050
|
+
[Symbol.dispose]() {}
|
|
1051
|
+
async[Symbol.asyncDispose]() {}
|
|
1052
|
+
}
|
|
1053
|
+
// src/knowledge-base/ScopedVectorStorage.ts
|
|
1054
|
+
class ScopedVectorStorage extends ScopedTabularStorage {
|
|
1055
|
+
inner;
|
|
1056
|
+
overFetchMultiplier;
|
|
1057
|
+
constructor(inner, kbId, overFetchMultiplier = 3) {
|
|
1058
|
+
super(inner, kbId);
|
|
1059
|
+
this.inner = inner;
|
|
1060
|
+
this.overFetchMultiplier = overFetchMultiplier;
|
|
1061
|
+
}
|
|
1062
|
+
getVectorDimensions() {
|
|
1063
|
+
return this.inner.getVectorDimensions();
|
|
1064
|
+
}
|
|
1065
|
+
filterAndStrip(results, topK, overfetchLimit) {
|
|
1066
|
+
const filtered = results.filter((r) => r.kb_id === this.kbId).slice(0, topK);
|
|
1067
|
+
if (topK && overfetchLimit && results.length >= overfetchLimit && filtered.length < topK) {
|
|
1068
|
+
console.warn(`ScopedVectorStorage: search returned ${filtered.length}/${topK} results after ` + `kb_id filtering. Consider increasing overFetchMultiplier (currently ${this.overFetchMultiplier}).`);
|
|
1069
|
+
}
|
|
1070
|
+
return filtered.map((r) => {
|
|
1071
|
+
const { kb_id: _, ...rest } = r;
|
|
1072
|
+
return rest;
|
|
1073
|
+
});
|
|
1074
|
+
}
|
|
1075
|
+
async similaritySearch(query, options) {
|
|
1076
|
+
const overfetchLimit = options?.topK ? options.topK * this.overFetchMultiplier : undefined;
|
|
1077
|
+
const results = await this.inner.similaritySearch(query, {
|
|
1078
|
+
...options,
|
|
1079
|
+
topK: overfetchLimit
|
|
1080
|
+
});
|
|
1081
|
+
return this.filterAndStrip(results, options?.topK, overfetchLimit);
|
|
1082
|
+
}
|
|
1083
|
+
async hybridSearch(query, options) {
|
|
1084
|
+
if (typeof this.inner.hybridSearch !== "function") {
|
|
1085
|
+
throw new Error("Hybrid search is not supported by the configured chunk storage backend. " + "Please use a vector storage implementation that provides `hybridSearch`.");
|
|
1086
|
+
}
|
|
1087
|
+
const overfetchLimit = options?.topK ? options.topK * this.overFetchMultiplier : undefined;
|
|
1088
|
+
const results = await this.inner.hybridSearch(query, {
|
|
1089
|
+
...options,
|
|
1090
|
+
topK: overfetchLimit
|
|
1091
|
+
});
|
|
1092
|
+
return this.filterAndStrip(results, options?.topK, overfetchLimit);
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
803
1095
|
// src/util/DatasetSchema.ts
|
|
804
1096
|
function TypeTabularStorage(options = {}) {
|
|
805
1097
|
return {
|
|
@@ -1053,10 +1345,12 @@ class StructuralParser {
|
|
|
1053
1345
|
}
|
|
1054
1346
|
}
|
|
1055
1347
|
export {
|
|
1348
|
+
unregisterKnowledgeBase,
|
|
1056
1349
|
traverseDepthFirst,
|
|
1057
1350
|
setGlobalKnowledgeBaseRepository,
|
|
1058
1351
|
registerKnowledgeBase,
|
|
1059
1352
|
knowledgeBaseTableNames,
|
|
1353
|
+
isSharedTableMode,
|
|
1060
1354
|
hasChildren,
|
|
1061
1355
|
getNodePath,
|
|
1062
1356
|
getKnowledgeBase,
|
|
@@ -1065,14 +1359,25 @@ export {
|
|
|
1065
1359
|
getDocumentRange,
|
|
1066
1360
|
getChildren,
|
|
1067
1361
|
estimateTokens,
|
|
1362
|
+
deregisterKnowledgeBase,
|
|
1068
1363
|
createKnowledgeBase,
|
|
1069
1364
|
TypeTabularStorage,
|
|
1070
1365
|
TypeKnowledgeBase,
|
|
1071
1366
|
TopicNodeSchema,
|
|
1072
1367
|
TokenBudgetSchema,
|
|
1073
1368
|
StructuralParser,
|
|
1369
|
+
SharedDocumentStorageSchema,
|
|
1370
|
+
SharedDocumentPrimaryKey,
|
|
1371
|
+
SharedDocumentIndexes,
|
|
1372
|
+
SharedChunkVectorStorageSchema,
|
|
1373
|
+
SharedChunkPrimaryKey,
|
|
1374
|
+
SharedChunkIndexes,
|
|
1074
1375
|
SentenceNodeSchema,
|
|
1075
1376
|
SectionNodeSchema,
|
|
1377
|
+
ScopedVectorStorage,
|
|
1378
|
+
ScopedTabularStorage,
|
|
1379
|
+
SHARED_DOCUMENT_TABLE,
|
|
1380
|
+
SHARED_CHUNK_TABLE,
|
|
1076
1381
|
ParagraphNodeSchema,
|
|
1077
1382
|
NodeRangeSchema,
|
|
1078
1383
|
NodeKind,
|
|
@@ -1098,4 +1403,4 @@ export {
|
|
|
1098
1403
|
ChunkRecordArraySchema
|
|
1099
1404
|
};
|
|
1100
1405
|
|
|
1101
|
-
//# debugId=
|
|
1406
|
+
//# debugId=6A75C04FDEE4B40064756E2164756E21
|