npm - @workglow/dataset - Versions diffs - 0.0.109 → 0.0.113 - Mend

@workglow/dataset 0.0.109 → 0.0.113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/README.md +504 -968
package/dist/browser.js +376 -490
package/dist/browser.js.map +13 -13
package/dist/bun.js +376 -490
package/dist/bun.js.map +13 -13
package/dist/chunk/ChunkSchema.d.ts +206 -0
package/dist/chunk/ChunkSchema.d.ts.map +1 -0
package/dist/chunk/ChunkVectorStorageSchema.d.ts +64 -0
package/dist/chunk/ChunkVectorStorageSchema.d.ts.map +1 -0
package/dist/common.d.ts +5 -5
package/dist/common.d.ts.map +1 -1
package/dist/document/Document.d.ts +7 -6
package/dist/document/Document.d.ts.map +1 -1
package/dist/document/DocumentSchema.d.ts +0 -465
package/dist/document/DocumentSchema.d.ts.map +1 -1
package/dist/knowledge-base/KnowledgeBase.d.ts +122 -0
package/dist/knowledge-base/KnowledgeBase.d.ts.map +1 -0
package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts +24 -0
package/dist/knowledge-base/KnowledgeBaseRegistry.d.ts.map +1 -0
package/dist/knowledge-base/createKnowledgeBase.d.ts +28 -0
package/dist/knowledge-base/createKnowledgeBase.d.ts.map +1 -0
package/dist/node.js +376 -490
package/dist/node.js.map +13 -13
package/dist/util/DatasetSchema.d.ts +9 -49
package/dist/util/DatasetSchema.d.ts.map +1 -1
package/package.json +7 -5
package/dist/document/DocumentDataset.d.ts +0 -79
package/dist/document/DocumentDataset.d.ts.map +0 -1
package/dist/document/DocumentDatasetRegistry.d.ts +0 -29
package/dist/document/DocumentDatasetRegistry.d.ts.map +0 -1
package/dist/document-chunk/DocumentChunkDataset.d.ts +0 -79
package/dist/document-chunk/DocumentChunkDataset.d.ts.map +0 -1
package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts +0 -29
package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map +0 -1
package/dist/document-chunk/DocumentChunkSchema.d.ts +0 -56
package/dist/document-chunk/DocumentChunkSchema.d.ts.map +0 -1
package/src/document-chunk/README.md +0 -362

package/dist/document/DocumentDataset.d.ts DELETED Viewed

@@ -1,79 +0,0 @@
-/**
- * @license
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
- * SPDX-License-Identifier: Apache-2.0
- */
-import type { VectorSearchOptions } from "@workglow/storage";
-import type { TypedArray } from "@workglow/util";
-import type { DocumentChunk, DocumentChunkStorage } from "../document-chunk/DocumentChunkSchema";
-import { Document } from "./Document";
-import { ChunkNode, DocumentNode } from "./DocumentSchema";
-import { DocumentTabularStorage } from "./DocumentStorageSchema";
-/**
- * Document dataset that uses TabularStorage for document persistence and VectorStorage for chunk persistence and similarity search.
- * This is a unified implementation that composes storage backends rather than using
- * inheritance/interface patterns.
- */
-export declare class DocumentDataset {
-    private tabularStorage;
-    private vectorStorage?;
-    /**
-     * Creates a new DocumentDataset instance.
-     *
-     * @param tabularStorage - Pre-initialized tabular storage for document persistence
-     * @param vectorStorage - Pre-initialized vector storage for chunk similarity search
-     *
-     * @example
-     * ```typescript
-     * const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]);
-     * await tabularStorage.setupDatabase();
-     *
-     * const vectorStorage = new InMemoryVectorStorage();
-     * await vectorStorage.setupDatabase();
-     *
-     * const docDataset = new DocumentDataset(tabularStorage, vectorStorage);
-     * ```
-     */
-    constructor(tabularStorage: DocumentTabularStorage, vectorStorage?: DocumentChunkStorage);
-    /**
-     * Upsert a document
-     * @returns The document with the generated doc_id if it was auto-generated
-     */
-    upsert(document: Document): Promise<Document>;
-    /**
-     * Get a document by ID
-     */
-    get(doc_id: string): Promise<Document | undefined>;
-    /**
-     * Delete a document
-     */
-    delete(doc_id: string): Promise<void>;
-    /**
-     * Get a specific node by ID
-     */
-    getNode(doc_id: string, nodeId: string): Promise<DocumentNode | undefined>;
-    /**
-     * Get ancestors of a node (from root to node)
-     */
-    getAncestors(doc_id: string, nodeId: string): Promise<DocumentNode[]>;
-    /**
-     * Get chunks for a document
-     */
-    getChunks(doc_id: string): Promise<ChunkNode[]>;
-    /**
-     * Find chunks that contain a specific nodeId in their path
-     */
-    findChunksByNodeId(doc_id: string, nodeId: string): Promise<ChunkNode[]>;
-    /**
-     * List all document IDs
-     */
-    list(): Promise<string[]>;
-    /**
-     * Search for similar vectors using the vector storage
-     * @param query - Query vector to search for
-     * @param options - Search options (topK, filter, scoreThreshold)
-     * @returns Array of search results sorted by similarity
-     */
-    search(query: TypedArray, options?: VectorSearchOptions<Record<string, unknown>>): Promise<Array<DocumentChunk<Record<string, unknown>, TypedArray>>>;
-}
-//# sourceMappingURL=DocumentDataset.d.ts.map

package/dist/document/DocumentDataset.d.ts.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"DocumentDataset.d.ts","sourceRoot":"","sources":["../../src/document/DocumentDataset.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAC7D,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,KAAK,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,uCAAuC,CAAC;AACjG,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAC3D,OAAO,EAEL,sBAAsB,EAEvB,MAAM,yBAAyB,CAAC;AAEjC;;;;GAIG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,cAAc,CAAyB;IAC/C,OAAO,CAAC,aAAa,CAAC,CAAuB;IAE7C;;;;;;;;;;;;;;;;OAgBG;gBACS,cAAc,EAAE,sBAAsB,EAAE,aAAa,CAAC,EAAE,oBAAoB;IAKxF;;;OAGG;IACG,MAAM,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAgBnD;;OAEG;IACG,GAAG,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,GAAG,SAAS,CAAC;IAQxD;;OAEG;IACG,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAI3C;;OAEG;IACG,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,GAAG,SAAS,CAAC;IAuBhF;;OAEG;IACG,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAmD3E;;OAEG;IACG,SAAS,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAQrD;;OAEG;IACG,kBAAkB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAa9E;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IAQ/B;;;;;OAKG;IACG,MAAM,CACV,KAAK,EAAE,UAAU,EACjB,OAAO,CAAC,EAAE,mBAAmB,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GACrD,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;CAGtE"}

package/dist/document/DocumentDatasetRegistry.d.ts DELETED Viewed

@@ -1,29 +0,0 @@
-/**
- * @license
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
- * SPDX-License-Identifier: Apache-2.0
- */
-import type { DocumentDataset } from "./DocumentDataset";
-/**
- * Service token for the document dataset registry
- * Maps dataset IDs to DocumentDataset instances
- */
-export declare const DOCUMENT_DATASETS: import("@workglow/util").ServiceToken<Map<string, DocumentDataset>>;
-/**
- * Gets the global document dataset registry
- * @returns Map of document dataset ID to instance
- */
-export declare function getGlobalDocumentDatasets(): Map<string, DocumentDataset>;
-/**
- * Registers a document dataset globally by ID
- * @param id The unique identifier for this dataset
- * @param dataset The dataset instance to register
- */
-export declare function registerDocumentDataset(id: string, dataset: DocumentDataset): void;
-/**
- * Gets a document dataset by ID from the global registry
- * @param id The dataset identifier
- * @returns The dataset instance or undefined if not found
- */
-export declare function getDocumentDataset(id: string): DocumentDataset | undefined;
-//# sourceMappingURL=DocumentDatasetRegistry.d.ts.map

package/dist/document/DocumentDatasetRegistry.d.ts.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"DocumentDatasetRegistry.d.ts","sourceRoot":"","sources":["../../src/document/DocumentDatasetRegistry.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAQH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEzD;;;GAGG;AACH,eAAO,MAAM,iBAAiB,qEACyC,CAAC;AAWxE;;;GAGG;AACH,wBAAgB,yBAAyB,IAAI,GAAG,CAAC,MAAM,EAAE,eAAe,CAAC,CAExE;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,IAAI,CAGlF;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,EAAE,MAAM,GAAG,eAAe,GAAG,SAAS,CAE1E"}

package/dist/document-chunk/DocumentChunkDataset.d.ts DELETED Viewed

@@ -1,79 +0,0 @@
-/**
- * @license
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
- * SPDX-License-Identifier: Apache-2.0
- */
-import type { VectorSearchOptions } from "@workglow/storage";
-import type { TypedArray } from "@workglow/util";
-import type { DocumentChunk, DocumentChunkStorage, InsertDocumentChunk } from "./DocumentChunkSchema";
-/**
- * Document Chunk Dataset
- *
- * A dataset-specific wrapper around vector storage for document chunks.
- * This provides a domain-specific API for working with document chunk embeddings
- * in RAG pipelines.
- */
-export declare class DocumentChunkDataset {
-    private storage;
-    constructor(storage: DocumentChunkStorage);
-    /**
-     * Get the underlying storage instance
-     */
-    getStorage(): DocumentChunkStorage;
-    /**
-     * Store a document chunk
-     */
-    put(chunk: InsertDocumentChunk): Promise<DocumentChunk>;
-    /**
-     * Store multiple document chunks
-     */
-    putBulk(chunks: InsertDocumentChunk[]): Promise<DocumentChunk[]>;
-    /**
-     * Get a document chunk by ID
-     */
-    get(chunk_id: string): Promise<DocumentChunk | undefined>;
-    /**
-     * Delete a document chunk
-     */
-    delete(chunk_id: string): Promise<void>;
-    /**
-     * Search for similar chunks using vector similarity
-     */
-    similaritySearch(query: TypedArray, options?: VectorSearchOptions<Record<string, unknown>>): Promise<Array<DocumentChunk & {
-        score: number;
-    }>>;
-    /**
-     * Hybrid search (vector + full-text)
-     */
-    hybridSearch(query: TypedArray, options: VectorSearchOptions<Record<string, unknown>> & {
-        textQuery: string;
-        vectorWeight?: number;
-    }): Promise<Array<DocumentChunk & {
-        score: number;
-    }>>;
-    /**
-     * Get all chunks
-     */
-    getAll(): Promise<DocumentChunk[] | undefined>;
-    /**
-     * Get the count of stored chunks
-     */
-    size(): Promise<number>;
-    /**
-     * Clear all chunks
-     */
-    clear(): Promise<void>;
-    /**
-     * Destroy the storage
-     */
-    destroy(): void;
-    /**
-     * Setup the database/storage
-     */
-    setupDatabase(): Promise<void>;
-    /**
-     * Get the vector dimensions
-     */
-    getVectorDimensions(): number;
-}
-//# sourceMappingURL=DocumentChunkDataset.d.ts.map

package/dist/document-chunk/DocumentChunkDataset.d.ts.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"DocumentChunkDataset.d.ts","sourceRoot":"","sources":["../../src/document-chunk/DocumentChunkDataset.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAC7D,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,KAAK,EACV,aAAa,EAEb,oBAAoB,EACpB,mBAAmB,EACpB,MAAM,uBAAuB,CAAC;AAE/B;;;;;;GAMG;AACH,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,OAAO,CAAuB;gBAE1B,OAAO,EAAE,oBAAoB;IAIzC;;OAEG;IACH,UAAU,IAAI,oBAAoB;IAIlC;;OAEG;IACG,GAAG,CAAC,KAAK,EAAE,mBAAmB,GAAG,OAAO,CAAC,aAAa,CAAC;IAI7D;;OAEG;IACG,OAAO,CAAC,MAAM,EAAE,mBAAmB,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IAItE;;OAEG;IACG,GAAG,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,GAAG,SAAS,CAAC;IAK/D;;OAEG;IACG,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAK7C;;OAEG;IACG,gBAAgB,CACpB,KAAK,EAAE,UAAU,EACjB,OAAO,CAAC,EAAE,mBAAmB,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GACrD,OAAO,CAAC,KAAK,CAAC,aAAa,GAAG;QAAE,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAIpD;;OAEG;IACG,YAAY,CAChB,KAAK,EAAE,UAAU,EACjB,OAAO,EAAE,mBAAmB,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GAAG;QACtD,SAAS,EAAE,MAAM,CAAC;QAClB,YAAY,CAAC,EAAE,MAAM,CAAC;KACvB,GACA,OAAO,CAAC,KAAK,CAAC,aAAa,GAAG;QAAE,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAOpD;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,EAAE,GAAG,SAAS,CAAC;IAIpD;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,MAAM,CAAC;IAI7B;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAI5B;;OAEG;IACH,OAAO,IAAI,IAAI;IAIf;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAIpC;;OAEG;IACH,mBAAmB,IAAI,MAAM;CAG9B"}

package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts DELETED Viewed

@@ -1,29 +0,0 @@
-/**
- * @license
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
- * SPDX-License-Identifier: Apache-2.0
- */
-import type { DocumentChunkDataset } from "./DocumentChunkDataset";
-/**
- * Service token for the document chunk dataset registry
- * Maps dataset IDs to DocumentChunkDataset instances
- */
-export declare const DOCUMENT_CHUNK_DATASET: import("@workglow/util").ServiceToken<Map<string, DocumentChunkDataset>>;
-/**
- * Gets the global document chunk dataset registry
- * @returns Map of document chunk dataset ID to instance
- */
-export declare function getGlobalDocumentChunkDataset(): Map<string, DocumentChunkDataset>;
-/**
- * Registers a document chunk dataset globally by ID
- * @param id The unique identifier for this dataset
- * @param dataset The dataset instance to register
- */
-export declare function registerDocumentChunkDataset(id: string, dataset: DocumentChunkDataset): void;
-/**
- * Gets a document chunk dataset by ID from the global registry
- * @param id The dataset identifier
- * @returns The dataset instance or undefined if not found
- */
-export declare function getDocumentChunkDataset(id: string): DocumentChunkDataset | undefined;
-//# sourceMappingURL=DocumentChunkDatasetRegistry.d.ts.map

package/dist/document-chunk/DocumentChunkDatasetRegistry.d.ts.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"DocumentChunkDatasetRegistry.d.ts","sourceRoot":"","sources":["../../src/document-chunk/DocumentChunkDatasetRegistry.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAQH,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAEnE;;;GAGG;AACH,eAAO,MAAM,sBAAsB,0EAC8C,CAAC;AAWlF;;;GAGG;AACH,wBAAgB,6BAA6B,IAAI,GAAG,CAAC,MAAM,EAAE,oBAAoB,CAAC,CAEjF;AAED;;;;GAIG;AACH,wBAAgB,4BAA4B,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,oBAAoB,GAAG,IAAI,CAG5F;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,EAAE,EAAE,MAAM,GAAG,oBAAoB,GAAG,SAAS,CAEpF"}

package/dist/document-chunk/DocumentChunkSchema.d.ts DELETED Viewed

@@ -1,56 +0,0 @@
-/**
- * @license
- * Copyright 2025 Steven Roussey <sroussey@gmail.com>
- * SPDX-License-Identifier: Apache-2.0
- */
-import { IVectorStorage } from "@workglow/storage";
-import { type TypedArray } from "@workglow/util";
-/**
- * Default schema for document chunk storage with vector embeddings
- */
-export declare const DocumentChunkSchema: {
-    readonly type: "object";
-    readonly properties: {
-        readonly chunk_id: {
-            readonly type: "string";
-            readonly "x-auto-generated": true;
-        };
-        readonly doc_id: {
-            readonly type: "string";
-        };
-        readonly vector: {
-            readonly type: "array";
-            readonly format: "TypedArray";
-            readonly title: "Typed Array";
-            readonly description: "A typed array (Float32Array, Int8Array, etc.)";
-        };
-        readonly metadata: {
-            readonly type: "object";
-            readonly format: "metadata";
-            readonly additionalProperties: true;
-        };
-    };
-    readonly required: readonly ["chunk_id", "doc_id", "vector", "metadata"];
-    readonly additionalProperties: false;
-};
-export type DocumentChunkSchema = typeof DocumentChunkSchema;
-export declare const DocumentChunkPrimaryKey: readonly ["chunk_id"];
-export type DocumentChunkPrimaryKey = typeof DocumentChunkPrimaryKey;
-export interface DocumentChunk<Metadata extends Record<string, unknown> = Record<string, unknown>, Vector extends TypedArray = TypedArray> {
-    chunk_id: string;
-    doc_id: string;
-    vector: Vector;
-    metadata: Metadata;
-}
-/**
- * Type for inserting document chunks - chunk_id is optional (auto-generated)
- */
-export type InsertDocumentChunk<Metadata extends Record<string, unknown> = Record<string, unknown>, Vector extends TypedArray = TypedArray> = Omit<DocumentChunk<Metadata, Vector>, "chunk_id"> & Partial<Pick<DocumentChunk<Metadata, Vector>, "chunk_id">>;
-/**
- * Type for the primary key of document chunks
- */
-export type DocumentChunkKey = {
-    chunk_id: string;
-};
-export type DocumentChunkStorage = IVectorStorage<Record<string, unknown>, typeof DocumentChunkSchema, DocumentChunk, DocumentChunkPrimaryKey>;
-//# sourceMappingURL=DocumentChunkSchema.d.ts.map

package/dist/document-chunk/DocumentChunkSchema.d.ts.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"file":"DocumentChunkSchema.d.ts","sourceRoot":"","sources":["../../src/document-chunk/DocumentChunkSchema.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AACnD,OAAO,EAA+C,KAAK,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE9F;;GAEG;AACH,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;CAUS,CAAC;AAC1C,MAAM,MAAM,mBAAmB,GAAG,OAAO,mBAAmB,CAAC;AAE7D,eAAO,MAAM,uBAAuB,uBAAwB,CAAC;AAC7D,MAAM,MAAM,uBAAuB,GAAG,OAAO,uBAAuB,CAAC;AAErE,MAAM,WAAW,aAAa,CAC5B,QAAQ,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAClE,MAAM,SAAS,UAAU,GAAG,UAAU;IAEtC,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,QAAQ,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,MAAM,mBAAmB,CAC7B,QAAQ,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAClE,MAAM,SAAS,UAAU,GAAG,UAAU,IACpC,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,EAAE,UAAU,CAAC,GACnD,OAAO,CAAC,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;AAE7D;;GAEG;AACH,MAAM,MAAM,gBAAgB,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC;AAEpD,MAAM,MAAM,oBAAoB,GAAG,cAAc,CAC/C,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EACvB,OAAO,mBAAmB,EAC1B,aAAa,EACb,uBAAuB,CACxB,CAAC"}

package/src/document-chunk/README.md DELETED Viewed

@@ -1,362 +0,0 @@
-# Document Chunk Dataset
-Document-specific schema and utilities for storing document chunk embeddings. Uses the general-purpose vector storage from `@workglow/storage` with a predefined schema for document chunks in RAG (Retrieval-Augmented Generation) pipelines.
-## Features
-- **Predefined Schema**: `DocumentChunkSchema` with fields for chunk_id, doc_id, vector, and metadata
-- **Registry Pattern**: Register and retrieve chunk storage instances globally
-- **Type Safety**: Full TypeScript type definitions for document chunks
-- **Storage Agnostic**: Works with any vector storage backend (InMemory, SQLite, PostgreSQL)
-## Installation
-```bash
-bun install @workglow/dataset @workglow/storage
-```
-## Usage
-### Basic Usage with InMemoryVectorStorage
-```typescript
-import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
-import { InMemoryVectorStorage } from "@workglow/storage";
-// Create storage using the DocumentChunkSchema
-const repo = new InMemoryVectorStorage(
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-  [], // indexes (optional)
-  384 // vector dimensions
-);
-await repo.setupDatabase();
-// Store a document chunk with its embedding
-await repo.put({
-  chunk_id: "chunk-001",
-  doc_id: "doc-001",
-  vector: new Float32Array([0.1, 0.2, 0.3 /* ... 384 dims */]),
-  metadata: { text: "Hello world", source: "example.txt" },
-});
-// Search for similar chunks
-const results = await repo.similaritySearch(new Float32Array([0.15, 0.25, 0.35 /* ... */]), {
-  topK: 5,
-  scoreThreshold: 0.7,
-});
-```
-### Quantized Vectors (Reduced Storage)
-```typescript
-import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
-import { InMemoryVectorStorage } from "@workglow/storage";
-// Use Int8Array for 4x smaller storage (binary quantization)
-const repo = new InMemoryVectorStorage(
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-  [],
-  384,
-  Int8Array // Specify vector type
-);
-await repo.setupDatabase();
-// Store quantized vectors
-await repo.put({
-  chunk_id: "chunk-001",
-  doc_id: "doc-001",
-  vector: new Int8Array([127, -128, 64 /* ... */]),
-  metadata: { category: "ai" },
-});
-// Search with quantized query
-const results = await repo.similaritySearch(new Int8Array([100, -50, 75 /* ... */]), { topK: 5 });
-```
-### SQLite Storage (Local Persistence)
-```typescript
-import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
-import { SqliteVectorStorage } from "@workglow/storage";
-const repo = new SqliteVectorStorage(
-  "./vectors.db",          // database path
-  "chunks",                // table name
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-  [],                      // indexes
-  768                      // vector dimension
-);
-await repo.setupDatabase();
-// Bulk insert using inherited tabular methods
-await repo.putMany([
-  { chunk_id: "1", doc_id: "doc1", vector: new Float32Array([...]), metadata: { text: "..." } },
-  { chunk_id: "2", doc_id: "doc1", vector: new Float32Array([...]), metadata: { text: "..." } },
-]);
-```
-### PostgreSQL with pgvector
-```typescript
-import { Pool } from "pg";
-import { DocumentChunkSchema, DocumentChunkPrimaryKey } from "@workglow/dataset";
-import { PostgresVectorStorage } from "@workglow/storage";
-const pool = new Pool({ connectionString: "postgresql://..." });
-const repo = new PostgresVectorStorage(
-  pool,
-  "chunks",
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-  [],
-  384 // vector dimension
-);
-await repo.setupDatabase();
-// Native pgvector similarity search with filter
-const results = await repo.similaritySearch(queryVector, {
-  topK: 10,
-  filter: { category: "ai" },
-  scoreThreshold: 0.5,
-});
-// Hybrid search (vector + full-text)
-const hybridResults = await repo.hybridSearch(queryVector, {
-  textQuery: "machine learning",
-  topK: 10,
-  vectorWeight: 0.7,
-  filter: { category: "ai" },
-});
-```
-## Schema Definition
-### DocumentChunkSchema
-The predefined schema for document chunks:
-```typescript
-import { TypedArraySchema } from "@workglow/util";
-export const DocumentChunkSchema = {
-  type: "object",
-  properties: {
-    chunk_id: { type: "string" },
-    doc_id: { type: "string" },
-    vector: TypedArraySchema(), // Automatically detected as vector column
-    metadata: {
-      type: "object",
-      format: "metadata", // Marked for filtering support
-      additionalProperties: true,
-    },
-  },
-  additionalProperties: false,
-} as const;
-export const DocumentChunkPrimaryKey = ["chunk_id"] as const;
-```
-### DocumentChunk Type
-TypeScript interface for document chunks:
-```typescript
-interface DocumentChunk<
-  Metadata extends Record<string, unknown> = Record<string, unknown>,
-  Vector extends TypedArray = Float32Array,
-> {
-  chunk_id: string; // Unique identifier for the chunk
-  doc_id: string; // Parent document identifier
-  vector: Vector; // Embedding vector
-  metadata: Metadata; // Custom metadata (text content, entities, etc.)
-}
-```
-## API Reference
-### IChunkVectorStorage Interface
-Extends `ITabularStorage` with vector-specific methods:
-```typescript
-interface IChunkVectorStorage<Schema, PrimaryKeyNames, Entity> extends ITabularStorage<
-  Schema,
-  PrimaryKeyNames,
-  Entity
-> {
-  // Get the vector dimension
-  getVectorDimensions(): number;
-  // Vector similarity search
-  similaritySearch(
-    query: TypedArray,
-    options?: VectorSearchOptions
-  ): Promise<(Entity & { score: number })[]>;
-  // Hybrid search (optional - not all implementations support it)
-  hybridSearch?(
-    query: TypedArray,
-    options: HybridSearchOptions
-  ): Promise<(Entity & { score: number })[]>;
-}
-```
-### Inherited Tabular Methods
-From `ITabularStorage`:
-```typescript
-// Setup
-setupDatabase(): Promise<void>;
-// CRUD Operations
-put(entity: Entity): Promise<void>;
-putMany(entities: Entity[]): Promise<void>;
-get(key: PrimaryKey): Promise<Entity | undefined>;
-getAll(): Promise<Entity[] | undefined>;
-delete(key: PrimaryKey): Promise<void>;
-deleteMany(keys: PrimaryKey[]): Promise<void>;
-// Utility
-size(): Promise<number>;
-clear(): Promise<void>;
-destroy(): void;
-```
-### Search Options
-```typescript
-interface VectorSearchOptions<Metadata = Record<string, unknown>> {
-  readonly topK?: number; // Number of results (default: 10)
-  readonly filter?: Partial<Metadata>; // Filter by metadata fields
-  readonly scoreThreshold?: number; // Minimum score 0-1 (default: 0)
-}
-interface HybridSearchOptions<Metadata> extends VectorSearchOptions<Metadata> {
-  readonly textQuery: string; // Full-text query keywords
-  readonly vectorWeight?: number; // Vector weight 0-1 (default: 0.7)
-}
-```
-## Global Registry
-Register and retrieve chunk vector storage instances globally:
-```typescript
-import {
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-  registerChunkVectorRepository,
-  getDocumentChunkDataset,
-  getGlobalDocumentChunkDataset,
-} from "@workglow/dataset";
-import { InMemoryVectorStorage } from "@workglow/storage";
-// Create and register a storage instance
-const repo = new InMemoryVectorStorage(DocumentChunkSchema, DocumentChunkPrimaryKey, [], 384);
-await repo.setupDatabase();
-registerChunkVectorRepository("my-chunks", repo);
-// Retrieve by ID
-const retrievedRepo = getDocumentChunkDataset("my-chunks");
-// Get all registered storage instances
-const allRepos = getGlobalDocumentChunkDataset();
-```
-## Quantization Benefits
-Quantized vectors reduce storage and can improve performance:
-| Vector Type  | Bytes/Dim | Storage vs Float32 | Use Case                             |
-| ------------ | --------- | ------------------ | ------------------------------------ |
-| Float32Array | 4         | 100% (baseline)    | Standard embeddings                  |
-| Float64Array | 8         | 200%               | High precision needed                |
-| Float16Array | 2         | 50%                | Great precision/size tradeoff        |
-| Int16Array   | 2         | 50%                | Good precision/size tradeoff         |
-| Int8Array    | 1         | 25%                | Binary quantization, max compression |
-| Uint8Array   | 1         | 25%                | Quantized embeddings [0-255]         |
-**Example:** A 768-dimensional embedding:
-- Float32: 3,072 bytes
-- Int8: 768 bytes (75% reduction!)
-## Performance Considerations
-### InMemory
-- **Best for:** Testing, small datasets (<10K vectors), development
-- **Pros:** Fastest, no dependencies, supports all vector types
-- **Cons:** No persistence, memory limited
-### SQLite
-- **Best for:** Local apps, medium datasets (<100K vectors)
-- **Pros:** Persistent, single file, no server
-- **Cons:** No native vector indexing (linear scan), slower for large datasets
-### PostgreSQL + pgvector
-- **Best for:** Production, large datasets (>100K vectors)
-- **Pros:** Native HNSW/IVFFlat indexing, efficient similarity search, scalable
-- **Cons:** Requires PostgreSQL server and pgvector extension
-- **Setup:** `CREATE EXTENSION vector;`
-## Integration with DocumentDataset
-Document chunk storage works alongside `DocumentDataset` for hierarchical document management:
-```typescript
-import {
-  DocumentDataset,
-  DocumentStorageSchema,
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-} from "@workglow/dataset";
-import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
-// Initialize storage backends
-const tabularStorage = new InMemoryTabularStorage(DocumentStorageSchema, ["doc_id"]);
-await tabularStorage.setupDatabase();
-const vectorStorage = new InMemoryVectorStorage(
-  DocumentChunkSchema,
-  DocumentChunkPrimaryKey,
-  [],
-  384
-);
-await vectorStorage.setupDatabase();
-// Create document dataset with both storages
-const docDataset = new DocumentDataset(tabularStorage, vectorStorage);
-// Store document structure in tabular, chunks in vector
-await docDataset.upsert(document);
-// Search chunks by vector similarity
-const results = await docDataset.search(queryVector, { topK: 5 });
-```
-### Chunk Metadata for Hierarchical Documents
-When using hierarchical chunking, chunk metadata typically includes:
-```typescript
-metadata: {
-  text: string;           // Chunk text content
-  leafNodeId?: string;    // Reference to document tree node
-  depth?: number;         // Hierarchy depth
-  nodePath?: string[];    // Node IDs from root to leaf
-  summary?: string;       // Summary of the chunk content
-  entities?: Entity[];    // Named entities extracted from the chunk
-}
-```
-## License
-Apache 2.0