@workglow/dataset 0.0.110 → 0.0.113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +504 -968
  2. package/dist/browser.js +376 -490
  3. package/dist/browser.js.map +13 -13
  4. package/dist/bun.js +376 -490
  5. package/dist/bun.js.map +13 -13
  6. package/dist/chunk/ChunkSchema.d.ts +206 -0
  7. package/dist/chunk/ChunkSchema.d.ts.map +1 -0
  8. package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
  9. package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
  10. package/dist/common.d.ts +5 -5
  11. package/dist/common.d.ts.map +1 -1
  12. package/dist/document/Document.d.ts +7 -6
  13. package/dist/document/Document.d.ts.map +1 -1
  14. package/dist/document/DocumentSchema.d.ts +0 -465
  15. package/dist/document/DocumentSchema.d.ts.map +1 -1
  16. package/dist/knowledge-base/KnowledgeBase.d.ts +122 -0
  17. package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
  18. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +24 -0
  19. package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
  20. package/dist/knowledge-base/createKnowledgeBase.d.ts +28 -0
  21. package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
  22. package/dist/node.js +376 -490
  23. package/dist/node.js.map +13 -13
  24. package/dist/util/DatasetSchema.d.ts +9 -49
  25. package/dist/util/DatasetSchema.d.ts.map +1 -1
  26. package/package.json +5 -5
  27. package/dist/document/DocumentDataset.d.ts +0 -79
  28. package/dist/document/DocumentDataset.d.ts.map +0 -1
  29. package/dist/document/DocumentDatasetRegistry.d.ts +0 -29
  30. package/dist/document/DocumentDatasetRegistry.d.ts.map +0 -1
  31. package/dist/document-chunk/DocumentChunkDataset.d.ts +0 -79
  32. package/dist/document-chunk/DocumentChunkDataset.d.ts.map +0 -1
  33. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +0 -29
  34. package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +0 -1
  35. package/dist/document-chunk/DocumentChunkSchema.d.ts +0 -56
  36. package/dist/document-chunk/DocumentChunkSchema.d.ts.map +0 -1
  37. package/src/document-chunk/README.md +0 -362
@@ -1,79 +0,0 @@
1
- /**
2
- * @license
3
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
4
- * SPDX-License-Identifier: Apache-2.0
5
- */
6
- import type { VectorSearchOptions } from "@workglow/storage";
7
- import type { TypedArray } from "@workglow/util";
8
- import type { DocumentChunk, DocumentChunkStorage } from "../document-chunk/DocumentChunkSchema";
9
- import { Document } from "./Document";
10
- import { ChunkNode, DocumentNode } from "./DocumentSchema";
11
- import { DocumentTabularStorage } from "./DocumentStorageSchema";
12
- /**
13
- * Document dataset that uses TabularStorage for document persistence and VectorStorage for chunk persistence and similarity search.
14
- * This is a unified implementation that composes storage backends rather than using
15
- * inheritance/interface patterns.
16
- */
17
- export declare class DocumentDataset {
18
- private tabularStorage;
19
- private vectorStorage?;
20
- /**
21
- * Creates a new DocumentDataset instance.
22
- *
23
- * @param tabularStorage - Pre-initialized tabular storage for document persistence
24
- * @param vectorStorage - Pre-initialized vector storage for chunk similarity search
25
- *
26
- * @example
27
- * ```typescript
28
- * const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]);
29
- * await tabularStorage.setupDatabase();
30
- *
31
- * const vectorStorage = new InMemoryVectorStorage();
32
- * await vectorStorage.setupDatabase();
33
- *
34
- * const docDataset = new DocumentDataset(tabularStorage, vectorStorage);
35
- * ```
36
- */
37
- constructor(tabularStorage: DocumentTabularStorage, vectorStorage?: DocumentChunkStorage);
38
- /**
39
- * Upsert a document
40
- * @returns The document with the generated doc_id if it was auto-generated
41
- */
42
- upsert(document: Document): Promise<Document>;
43
- /**
44
- * Get a document by ID
45
- */
46
- get(doc_id: string): Promise<Document | undefined>;
47
- /**
48
- * Delete a document
49
- */
50
- delete(doc_id: string): Promise<void>;
51
- /**
52
- * Get a specific node by ID
53
- */
54
- getNode(doc_id: string, nodeId: string): Promise<DocumentNode | undefined>;
55
- /**
56
- * Get ancestors of a node (from root to node)
57
- */
58
- getAncestors(doc_id: string, nodeId: string): Promise<DocumentNode[]>;
59
- /**
60
- * Get chunks for a document
61
- */
62
- getChunks(doc_id: string): Promise<ChunkNode[]>;
63
- /**
64
- * Find chunks that contain a specific nodeId in their path
65
- */
66
- findChunksByNodeId(doc_id: string, nodeId: string): Promise<ChunkNode[]>;
67
- /**
68
- * List all document IDs
69
- */
70
- list(): Promise<string[]>;
71
- /**
72
- * Search for similar vectors using the vector storage
73
- * @param query - Query vector to search for
74
- * @param options - Search options (topK, filter, scoreThreshold)
75
- * @returns Array of search results sorted by similarity
76
- */
77
- search(query: TypedArray, options?: VectorSearchOptions<Record<string, unknown>>): Promise<Array<DocumentChunk<Record<string, unknown>, TypedArray>>>;
78
- }
79
- //# sourceMappingURL=DocumentDataset.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"DocumentDataset.d.ts","sourceRoot":"","sources":["../../src/document/DocumentDataset.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAC7D,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,KAAK,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,uCAAuC,CAAC;AACjG,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAC3D,OAAO,EAEL,sBAAsB,EAEvB,MAAM,yBAAyB,CAAC;AAEjC;;;;GAIG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,cAAc,CAAyB;IAC/C,OAAO,CAAC,aAAa,CAAC,CAAuB;IAE7C;;;;;;;;;;;;;;;;OAgBG;gBACS,cAAc,EAAE,sBAAsB,EAAE,aAAa,CAAC,EAAE,oBAAoB;IAKxF;;;OAGG;IACG,MAAM,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAgBnD;;OAEG;IACG,GAAG,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,GAAG,SAAS,CAAC;IAQxD;;OAEG;IACG,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAI3C;;OAEG;IACG,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,GAAG,SAAS,CAAC;IAuBhF;;OAEG;IACG,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAmD3E;;OAEG;IACG,SAAS,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAQrD;;OAEG;IACG,kBAAkB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAa9E;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IAQ/B;;;;;OAKG;IACG,MAAM,CACV,KAAK,EAAE,UAAU,EACjB,OAAO,CAAC,EAAE,mBAAmB,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GACrD,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;CAGtE"}
@@ -1,29 +0,0 @@
1
- /**
2
- * @license
3
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
4
- * SPDX-License-Identifier: Apache-2.0
5
- */
6
- import type { DocumentDataset } from "./DocumentDataset";
7
- /**
8
- * Service token for the document dataset registry
9
- * Maps dataset IDs to DocumentDataset instances
10
- */
11
- export declare const DOCUMENT_DATASETS: import("@workglow/util").ServiceToken<Map<string, DocumentDataset>>;
12
- /**
13
- * Gets the global document dataset registry
14
- * @returns Map of document dataset ID to instance
15
- */
16
- export declare function getGlobalDocumentDatasets(): Map<string, DocumentDataset>;
17
- /**
18
- * Registers a document dataset globally by ID
19
- * @param id The unique identifier for this dataset
20
- * @param dataset The dataset instance to register
21
- */
22
- export declare function registerDocumentDataset(id: string, dataset: DocumentDataset): void;
23
- /**
24
- * Gets a document dataset by ID from the global registry
25
- * @param id The dataset identifier
26
- * @returns The dataset instance or undefined if not found
27
- */
28
- export declare function getDocumentDataset(id: string): DocumentDataset | undefined;
29
- //# sourceMappingURL=DocumentDatasetRegistry.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"DocumentDatasetRegistry.d.ts","sourceRoot":"","sources":["../../src/document/DocumentDatasetRegistry.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAQH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEzD;;;GAGG;AACH,eAAO,MAAM,iBAAiB,qEACyC,CAAC;AAWxE;;;GAGG;AACH,wBAAgB,yBAAyB,IAAI,GAAG,CAAC,MAAM,EAAE,eAAe,CAAC,CAExE;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,IAAI,CAGlF;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,eAAe,GAAG,SAAS,CAE1E"}
@@ -1,79 +0,0 @@
1
- /**
2
- * @license
3
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
4
- * SPDX-License-Identifier: Apache-2.0
5
- */
6
- import type { VectorSearchOptions } from "@workglow/storage";
7
- import type { TypedArray } from "@workglow/util";
8
- import type { DocumentChunk, DocumentChunkStorage, InsertDocumentChunk } from "./DocumentChunkSchema";
9
- /**
10
- * Document Chunk Dataset
11
- *
12
- * A dataset-specific wrapper around vector storage for document chunks.
13
- * This provides a domain-specific API for working with document chunk embeddings
14
- * in RAG pipelines.
15
- */
16
- export declare class DocumentChunkDataset {
17
- private storage;
18
- constructor(storage: DocumentChunkStorage);
19
- /**
20
- * Get the underlying storage instance
21
- */
22
- getStorage(): DocumentChunkStorage;
23
- /**
24
- * Store a document chunk
25
- */
26
- put(chunk: InsertDocumentChunk): Promise<DocumentChunk>;
27
- /**
28
- * Store multiple document chunks
29
- */
30
- putBulk(chunks: InsertDocumentChunk[]): Promise<DocumentChunk[]>;
31
- /**
32
- * Get a document chunk by ID
33
- */
34
- get(chunk_id: string): Promise<DocumentChunk | undefined>;
35
- /**
36
- * Delete a document chunk
37
- */
38
- delete(chunk_id: string): Promise<void>;
39
- /**
40
- * Search for similar chunks using vector similarity
41
- */
42
- similaritySearch(query: TypedArray, options?: VectorSearchOptions<Record<string, unknown>>): Promise<Array<DocumentChunk & {
43
- score: number;
44
- }>>;
45
- /**
46
- * Hybrid search (vector + full-text)
47
- */
48
- hybridSearch(query: TypedArray, options: VectorSearchOptions<Record<string, unknown>> & {
49
- textQuery: string;
50
- vectorWeight?: number;
51
- }): Promise<Array<DocumentChunk & {
52
- score: number;
53
- }>>;
54
- /**
55
- * Get all chunks
56
- */
57
- getAll(): Promise<DocumentChunk[] | undefined>;
58
- /**
59
- * Get the count of stored chunks
60
- */
61
- size(): Promise<number>;
62
- /**
63
- * Clear all chunks
64
- */
65
- clear(): Promise<void>;
66
- /**
67
- * Destroy the storage
68
- */
69
- destroy(): void;
70
- /**
71
- * Setup the database/storage
72
- */
73
- setupDatabase(): Promise<void>;
74
- /**
75
- * Get the vector dimensions
76
- */
77
- getVectorDimensions(): number;
78
- }
79
- //# sourceMappingURL=DocumentChunkDataset.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"DocumentChunkDataset.d.ts","sourceRoot":"","sources":["../../src/document-chunk/DocumentChunkDataset.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAC7D,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,KAAK,EACV,aAAa,EAEb,oBAAoB,EACpB,mBAAmB,EACpB,MAAM,uBAAuB,CAAC;AAE/B;;;;;;GAMG;AACH,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,OAAO,CAAuB;gBAE1B,OAAO,EAAE,oBAAoB;IAIzC;;OAEG;IACH,UAAU,IAAI,oBAAoB;IAIlC;;OAEG;IACG,GAAG,CAAC,KAAK,EAAE,mBAAmB,GAAG,OAAO,CAAC,aAAa,CAAC;IAI7D;;OAEG;IACG,OAAO,CAAC,MAAM,EAAE,mBAAmB,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IAItE;;OAEG;IACG,GAAG,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,GAAG,SAAS,CAAC;IAK/D;;OAEG;IACG,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAK7C;;OAEG;IACG,gBAAgB,CACpB,KAAK,EAAE,UAAU,EACjB,OAAO,CAAC,EAAE,mBAAmB,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GACrD,OAAO,CAAC,KAAK,CAAC,aAAa,GAAG;QAAE,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAIpD;;OAEG;IACG,YAAY,CAChB,KAAK,EAAE,UAAU,EACjB,OAAO,EAAE,mBAAmB,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GAAG;QACtD,SAAS,EAAE,MAAM,CAAC;QAClB,YAAY,CAAC,EAAE,MAAM,CAAC;KACvB,GACA,OAAO,CAAC,KAAK,CAAC,aAAa,GAAG;QAAE,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAOpD;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,EAAE,GAAG,SAAS,CAAC;IAIpD;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,MAAM,CAAC;IAI7B;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAI5B;;OAEG;IACH,OAAO,IAAI,IAAI;IAIf;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAIpC;;OAEG;IACH,mBAAmB,IAAI,MAAM;CAG9B"}
@@ -1,29 +0,0 @@
1
- /**
2
- * @license
3
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
4
- * SPDX-License-Identifier: Apache-2.0
5
- */
6
- import type { DocumentChunkDataset } from "./DocumentChunkDataset";
7
- /**
8
- * Service token for the document chunk dataset registry
9
- * Maps dataset IDs to DocumentChunkDataset instances
10
- */
11
- export declare const DOCUMENT_CHUNK_DATASET: import("@workglow/util").ServiceToken<Map<string, DocumentChunkDataset>>;
12
- /**
13
- * Gets the global document chunk dataset registry
14
- * @returns Map of document chunk dataset ID to instance
15
- */
16
- export declare function getGlobalDocumentChunkDataset(): Map<string, DocumentChunkDataset>;
17
- /**
18
- * Registers a document chunk dataset globally by ID
19
- * @param id The unique identifier for this dataset
20
- * @param dataset The dataset instance to register
21
- */
22
- export declare function registerDocumentChunkDataset(id: string, dataset: DocumentChunkDataset): void;
23
- /**
24
- * Gets a document chunk dataset by ID from the global registry
25
- * @param id The dataset identifier
26
- * @returns The dataset instance or undefined if not found
27
- */
28
- export declare function getDocumentChunkDataset(id: string): DocumentChunkDataset | undefined;
29
- //# sourceMappingURL=DocumentChunkDatasetRegistry.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"DocumentChunkDatasetRegistry.d.ts","sourceRoot":"","sources":["../../src/document-chunk/DocumentChunkDatasetRegistry.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAQH,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAEnE;;;GAGG;AACH,eAAO,MAAM,sBAAsB,0EAC8C,CAAC;AAWlF;;;GAGG;AACH,wBAAgB,6BAA6B,IAAI,GAAG,CAAC,MAAM,EAAE,oBAAoB,CAAC,CAEjF;AAED;;;;GAIG;AACH,wBAAgB,4BAA4B,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,oBAAoB,GAAG,IAAI,CAG5F;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,EAAE,EAAE,MAAM,GAAG,oBAAoB,GAAG,SAAS,CAEpF"}
@@ -1,56 +0,0 @@
1
- /**
2
- * @license
3
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
4
- * SPDX-License-Identifier: Apache-2.0
5
- */
6
- import { IVectorStorage } from "@workglow/storage";
7
- import { type TypedArray } from "@workglow/util";
8
- /**
9
- * Default schema for document chunk storage with vector embeddings
10
- */
11
- export declare const DocumentChunkSchema: {
12
- readonly type: "object";
13
- readonly properties: {
14
- readonly chunk_id: {
15
- readonly type: "string";
16
- readonly "x-auto-generated": true;
17
- };
18
- readonly doc_id: {
19
- readonly type: "string";
20
- };
21
- readonly vector: {
22
- readonly type: "array";
23
- readonly format: "TypedArray";
24
- readonly title: "Typed Array";
25
- readonly description: "A typed array (Float32Array, Int8Array, etc.)";
26
- };
27
- readonly metadata: {
28
- readonly type: "object";
29
- readonly format: "metadata";
30
- readonly additionalProperties: true;
31
- };
32
- };
33
- readonly required: readonly ["chunk_id", "doc_id", "vector", "metadata"];
34
- readonly additionalProperties: false;
35
- };
36
- export type DocumentChunkSchema = typeof DocumentChunkSchema;
37
- export declare const DocumentChunkPrimaryKey: readonly ["chunk_id"];
38
- export type DocumentChunkPrimaryKey = typeof DocumentChunkPrimaryKey;
39
- export interface DocumentChunk<Metadata extends Record<string, unknown> = Record<string, unknown>, Vector extends TypedArray = TypedArray> {
40
- chunk_id: string;
41
- doc_id: string;
42
- vector: Vector;
43
- metadata: Metadata;
44
- }
45
- /**
46
- * Type for inserting document chunks - chunk_id is optional (auto-generated)
47
- */
48
- export type InsertDocumentChunk<Metadata extends Record<string, unknown> = Record<string, unknown>, Vector extends TypedArray = TypedArray> = Omit<DocumentChunk<Metadata, Vector>, "chunk_id"> & Partial<Pick<DocumentChunk<Metadata, Vector>, "chunk_id">>;
49
- /**
50
- * Type for the primary key of document chunks
51
- */
52
- export type DocumentChunkKey = {
53
- chunk_id: string;
54
- };
55
- export type DocumentChunkStorage = IVectorStorage<Record<string, unknown>, typeof DocumentChunkSchema, DocumentChunk, DocumentChunkPrimaryKey>;
56
- //# sourceMappingURL=DocumentChunkSchema.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"DocumentChunkSchema.d.ts","sourceRoot":"","sources":["../../src/document-chunk/DocumentChunkSchema.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AACnD,OAAO,EAA+C,KAAK,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE9F;;GAEG;AACH,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;CAUS,CAAC;AAC1C,MAAM,MAAM,mBAAmB,GAAG,OAAO,mBAAmB,CAAC;AAE7D,eAAO,MAAM,uBAAuB,uBAAwB,CAAC;AAC7D,MAAM,MAAM,uBAAuB,GAAG,OAAO,uBAAuB,CAAC;AAErE,MAAM,WAAW,aAAa,CAC5B,QAAQ,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAClE,MAAM,SAAS,UAAU,GAAG,UAAU;IAEtC,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,QAAQ,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,MAAM,mBAAmB,CAC7B,QAAQ,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAClE,MAAM,SAAS,UAAU,GAAG,UAAU,IACpC,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,EAAE,UAAU,CAAC,GACnD,OAAO,CAAC,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,MAAM,gBAAgB,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC;AAEpD,MAAM,MAAM,oBAAoB,GAAG,cAAc,CAC/C,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EACvB,OAAO,mBAAmB,EAC1B,aAAa,EACb,uBAAuB,CACxB,CAAC"}
@@ -1,362 +0,0 @@
1
- # Document Chunk Dataset
2
-
3
- Document-specific schema and utilities for storing document chunk embeddings. Uses the general-purpose vector storage from `@workglow/storage` with a predefined schema for document chunks in RAG (Retrieval-Augmented Generation) pipelines.
4
-
5
- ## Features
6
-
7
- - **Predefined Schema**: `DocumentChunkSchema` with fields for chunk_id, doc_id, vector, and metadata
8
- - **Registry Pattern**: Register and retrieve chunk storage instances globally
9
- - **Type Safety**: Full TypeScript type definitions for document chunks
10
- - **Storage Agnostic**: Works with any vector storage backend (InMemory, SQLite, PostgreSQL)
11
-
12
- ## Installation
13
-
14
- ```bash
15
- bun install @workglow/dataset @workglow/storage
16
- ```
17
-
18
- ## Usage
19
-
20
- ### Basic Usage with InMemoryVectorStorage
21
-
22
- ```typescript
23
- import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
24
- import { InMemoryVectorStorage } from "@workglow/storage";
25
-
26
- // Create storage using the DocumentChunkSchema
27
- const repo = new InMemoryVectorStorage(
28
- DocumentChunkSchema,
29
- DocumentChunkPrimaryKey,
30
- [], // indexes (optional)
31
- 384 // vector dimensions
32
- );
33
- await repo.setupDatabase();
34
-
35
- // Store a document chunk with its embedding
36
- await repo.put({
37
- chunk_id: "chunk-001",
38
- doc_id: "doc-001",
39
- vector: new Float32Array([0.1, 0.2, 0.3 /* ... 384 dims */]),
40
- metadata: { text: "Hello world", source: "example.txt" },
41
- });
42
-
43
- // Search for similar chunks
44
- const results = await repo.similaritySearch(new Float32Array([0.15, 0.25, 0.35 /* ... */]), {
45
- topK: 5,
46
- scoreThreshold: 0.7,
47
- });
48
- ```
49
-
50
- ### Quantized Vectors (Reduced Storage)
51
-
52
- ```typescript
53
- import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
54
- import { InMemoryVectorStorage } from "@workglow/storage";
55
-
56
- // Use Int8Array for 4x smaller storage (8-bit scalar quantization)
57
- const repo = new InMemoryVectorStorage(
58
- DocumentChunkSchema,
59
- DocumentChunkPrimaryKey,
60
- [],
61
- 384,
62
- Int8Array // Specify vector type
63
- );
64
- await repo.setupDatabase();
65
-
66
- // Store quantized vectors
67
- await repo.put({
68
- chunk_id: "chunk-001",
69
- doc_id: "doc-001",
70
- vector: new Int8Array([127, -128, 64 /* ... */]),
71
- metadata: { category: "ai" },
72
- });
73
-
74
- // Search with quantized query
75
- const results = await repo.similaritySearch(new Int8Array([100, -50, 75 /* ... */]), { topK: 5 });
76
- ```
77
-
78
- ### SQLite Storage (Local Persistence)
79
-
80
- ```typescript
81
- import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
82
- import { SqliteVectorStorage } from "@workglow/storage";
83
-
84
- const repo = new SqliteVectorStorage(
85
- "./vectors.db", // database path
86
- "chunks", // table name
87
- DocumentChunkSchema,
88
- DocumentChunkPrimaryKey,
89
- [], // indexes
90
- 768 // vector dimension
91
- );
92
- await repo.setupDatabase();
93
-
94
- // Bulk insert using inherited tabular methods
95
- await repo.putMany([
96
- { chunk_id: "1", doc_id: "doc1", vector: new Float32Array([...]), metadata: { text: "..." } },
97
- { chunk_id: "2", doc_id: "doc1", vector: new Float32Array([...]), metadata: { text: "..." } },
98
- ]);
99
- ```
100
-
101
- ### PostgreSQL with pgvector
102
-
103
- ```typescript
104
- import { Pool } from "pg";
105
- import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
106
- import { PostgresVectorStorage } from "@workglow/storage";
107
-
108
- const pool = new Pool({ connectionString: "postgresql://..." });
109
- const repo = new PostgresVectorStorage(
110
- pool,
111
- "chunks",
112
- DocumentChunkSchema,
113
- DocumentChunkPrimaryKey,
114
- [],
115
- 384 // vector dimension
116
- );
117
- await repo.setupDatabase();
118
-
119
- // Native pgvector similarity search with filter
120
- const results = await repo.similaritySearch(queryVector, {
121
- topK: 10,
122
- filter: { category: "ai" },
123
- scoreThreshold: 0.5,
124
- });
125
-
126
- // Hybrid search (vector + full-text)
127
- const hybridResults = await repo.hybridSearch(queryVector, {
128
- textQuery: "machine learning",
129
- topK: 10,
130
- vectorWeight: 0.7,
131
- filter: { category: "ai" },
132
- });
133
- ```
134
-
135
- ## Schema Definition
136
-
137
- ### DocumentChunkSchema
138
-
139
- The predefined schema for document chunks:
140
-
141
- ```typescript
142
- import { TypedArraySchema } from "@workglow/util";
143
-
144
- export const DocumentChunkSchema = {
145
- type: "object",
146
- properties: {
147
- chunk_id: { type: "string", "x-auto-generated": true },
148
- doc_id: { type: "string" },
149
- vector: TypedArraySchema(), // Automatically detected as vector column
150
- metadata: {
151
- type: "object",
152
- format: "metadata", // Marked for filtering support
153
- additionalProperties: true,
154
- },
155
- },
156
- additionalProperties: false,
157
- } as const;
158
-
159
- export const DocumentChunkPrimaryKey = ["chunk_id"] as const;
160
- ```
161
-
162
- ### DocumentChunk Type
163
-
164
- TypeScript interface for document chunks:
165
-
166
- ```typescript
167
- interface DocumentChunk<
168
- Metadata extends Record<string, unknown> = Record<string, unknown>,
169
- Vector extends TypedArray = TypedArray,
170
- > {
171
- chunk_id: string; // Unique identifier for the chunk
172
- doc_id: string; // Parent document identifier
173
- vector: Vector; // Embedding vector
174
- metadata: Metadata; // Custom metadata (text content, entities, etc.)
175
- }
176
- ```
177
-
178
- ## API Reference
179
-
180
- ### IChunkVectorStorage Interface
181
-
182
- Extends `ITabularStorage` with vector-specific methods:
183
-
184
- ```typescript
185
- interface IChunkVectorStorage<Schema, PrimaryKeyNames, Entity> extends ITabularStorage<
186
- Schema,
187
- PrimaryKeyNames,
188
- Entity
189
- > {
190
- // Get the vector dimension
191
- getVectorDimensions(): number;
192
-
193
- // Vector similarity search
194
- similaritySearch(
195
- query: TypedArray,
196
- options?: VectorSearchOptions
197
- ): Promise<(Entity & { score: number })[]>;
198
-
199
- // Hybrid search (optional - not all implementations support it)
200
- hybridSearch?(
201
- query: TypedArray,
202
- options: HybridSearchOptions
203
- ): Promise<(Entity & { score: number })[]>;
204
- }
205
- ```
206
-
207
- ### Inherited Tabular Methods
208
-
209
- From `ITabularStorage`:
210
-
211
- ```typescript
212
- // Setup
213
- setupDatabase(): Promise<void>;
214
-
215
- // CRUD Operations
216
- put(entity: Entity): Promise<void>;
217
- putMany(entities: Entity[]): Promise<void>;
218
- get(key: PrimaryKey): Promise<Entity | undefined>;
219
- getAll(): Promise<Entity[] | undefined>;
220
- delete(key: PrimaryKey): Promise<void>;
221
- deleteMany(keys: PrimaryKey[]): Promise<void>;
222
-
223
- // Utility
224
- size(): Promise<number>;
225
- clear(): Promise<void>;
226
- destroy(): void;
227
- ```
228
-
229
- ### Search Options
230
-
231
- ```typescript
232
- interface VectorSearchOptions<Metadata = Record<string, unknown>> {
233
- readonly topK?: number; // Number of results (default: 10)
234
- readonly filter?: Partial<Metadata>; // Filter by metadata fields
235
- readonly scoreThreshold?: number; // Minimum score 0-1 (default: 0)
236
- }
237
-
238
- interface HybridSearchOptions<Metadata> extends VectorSearchOptions<Metadata> {
239
- readonly textQuery: string; // Full-text query keywords
240
- readonly vectorWeight?: number; // Vector weight 0-1 (default: 0.7)
241
- }
242
- ```
243
-
244
- ## Global Registry
245
-
246
- Register and retrieve chunk vector storage instances globally:
247
-
248
- ```typescript
249
- import {
250
- DocumentChunkSchema,
251
- DocumentChunkPrimaryKey,
252
- registerChunkVectorRepository,
253
- getDocumentChunkDataset,
254
- getGlobalDocumentChunkDataset,
255
- } from "@workglow/dataset";
256
- import { InMemoryVectorStorage } from "@workglow/storage";
257
-
258
- // Create and register a storage instance
259
- const repo = new InMemoryVectorStorage(DocumentChunkSchema, DocumentChunkPrimaryKey, [], 384);
260
- await repo.setupDatabase();
261
-
262
- registerChunkVectorRepository("my-chunks", repo);
263
-
264
- // Retrieve by ID
265
- const retrievedRepo = getDocumentChunkDataset("my-chunks");
266
-
267
- // Get all registered storage instances
268
- const allRepos = getGlobalDocumentChunkDataset();
269
- ```
270
-
271
- ## Quantization Benefits
272
-
273
- Quantized vectors reduce storage and can improve performance:
274
-
275
- | Vector Type | Bytes/Dim | Storage vs Float32 | Use Case |
276
- | ------------ | --------- | ------------------ | ------------------------------------ |
277
- | Float32Array | 4 | 100% (baseline) | Standard embeddings |
278
- | Float64Array | 8 | 200% | High precision needed |
279
- | Float16Array | 2 | 50% | Great precision/size tradeoff |
280
- | Int16Array | 2 | 50% | Good precision/size tradeoff |
281
- | Int8Array    | 1         | 25%                | 8-bit quantization, max compression  |
282
- | Uint8Array | 1 | 25% | Quantized embeddings [0-255] |
283
-
284
- **Example:** A 768-dimensional embedding:
285
-
286
- - Float32: 3,072 bytes
287
- - Int8: 768 bytes (75% reduction!)
288
-
289
- ## Performance Considerations
290
-
291
- ### InMemory
292
-
293
- - **Best for:** Testing, small datasets (<10K vectors), development
294
- - **Pros:** Fastest, no dependencies, supports all vector types
295
- - **Cons:** No persistence, memory limited
296
-
297
- ### SQLite
298
-
299
- - **Best for:** Local apps, medium datasets (<100K vectors)
300
- - **Pros:** Persistent, single file, no server
301
- - **Cons:** No native vector indexing (linear scan), slower for large datasets
302
-
303
- ### PostgreSQL + pgvector
304
-
305
- - **Best for:** Production, large datasets (>100K vectors)
306
- - **Pros:** Native HNSW/IVFFlat indexing, efficient similarity search, scalable
307
- - **Cons:** Requires PostgreSQL server and pgvector extension
308
- - **Setup:** `CREATE EXTENSION vector;`
309
-
310
- ## Integration with DocumentDataset
311
-
312
- Document chunk storage works alongside `DocumentDataset` for hierarchical document management:
313
-
314
- ```typescript
315
- import {
316
- DocumentDataset,
317
- DocumentStorageSchema,
318
- DocumentChunkSchema,
319
- DocumentChunkPrimaryKey,
320
- } from "@workglow/dataset";
321
- import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
322
-
323
- // Initialize storage backends
324
- const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]);
325
- await tabularStorage.setupDatabase();
326
-
327
- const vectorStorage = new InMemoryVectorStorage(
328
- DocumentChunkSchema,
329
- DocumentChunkPrimaryKey,
330
- [],
331
- 384
332
- );
333
- await vectorStorage.setupDatabase();
334
-
335
- // Create document dataset with both storages
336
- const docDataset = new DocumentDataset(tabularStorage, vectorStorage);
337
-
338
- // Store document structure in tabular, chunks in vector
339
- await docDataset.upsert(document);
340
-
341
- // Search chunks by vector similarity
342
- const results = await docDataset.search(queryVector, { topK: 5 });
343
- ```
344
-
345
- ### Chunk Metadata for Hierarchical Documents
346
-
347
- When using hierarchical chunking, chunk metadata typically includes:
348
-
349
- ```typescript
350
- metadata: {
351
- text: string; // Chunk text content
352
- leafNodeId?: string; // Reference to document tree node
353
- depth?: number; // Hierarchy depth
354
- nodePath?: string[]; // Node IDs from root to leaf
355
- summary?: string; // Summary of the chunk content
356
- entities?: Entity[]; // Named entities extracted from the chunk
357
- }
358
- ```
359
-
360
- ## License
361
-
362
- Apache 2.0