gllm-datastore-binary 0.5.45 (cp311-cp311-macosx_13_0_arm64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-datastore-binary might be problematic.
- gllm_datastore/__init__.pyi +0 -0
- gllm_datastore/cache/__init__.pyi +4 -0
- gllm_datastore/cache/base.pyi +84 -0
- gllm_datastore/cache/cache.pyi +137 -0
- gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
- gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
- gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
- gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
- gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
- gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
- gllm_datastore/cache/utils.pyi +34 -0
- gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
- gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
- gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
- gllm_datastore/constants.pyi +66 -0
- gllm_datastore/core/__init__.pyi +7 -0
- gllm_datastore/core/capabilities/__init__.pyi +5 -0
- gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
- gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
- gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
- gllm_datastore/core/filters/__init__.pyi +4 -0
- gllm_datastore/core/filters/filter.pyi +340 -0
- gllm_datastore/core/filters/schema.pyi +149 -0
- gllm_datastore/data_store/__init__.pyi +7 -0
- gllm_datastore/data_store/base.pyi +138 -0
- gllm_datastore/data_store/chroma/__init__.pyi +4 -0
- gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
- gllm_datastore/data_store/chroma/data_store.pyi +202 -0
- gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
- gllm_datastore/data_store/chroma/query.pyi +266 -0
- gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
- gllm_datastore/data_store/chroma/vector.pyi +197 -0
- gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
- gllm_datastore/data_store/elasticsearch/data_store.pyi +119 -0
- gllm_datastore/data_store/elasticsearch/fulltext.pyi +237 -0
- gllm_datastore/data_store/elasticsearch/query.pyi +114 -0
- gllm_datastore/data_store/elasticsearch/vector.pyi +179 -0
- gllm_datastore/data_store/exceptions.pyi +35 -0
- gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
- gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
- gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
- gllm_datastore/data_store/in_memory/query.pyi +175 -0
- gllm_datastore/data_store/in_memory/vector.pyi +174 -0
- gllm_datastore/data_store/redis/__init__.pyi +5 -0
- gllm_datastore/data_store/redis/data_store.pyi +154 -0
- gllm_datastore/data_store/redis/fulltext.pyi +128 -0
- gllm_datastore/data_store/redis/query.pyi +428 -0
- gllm_datastore/data_store/redis/query_translator.pyi +37 -0
- gllm_datastore/data_store/redis/vector.pyi +131 -0
- gllm_datastore/encryptor/__init__.pyi +4 -0
- gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
- gllm_datastore/encryptor/encryptor.pyi +52 -0
- gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
- gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
- gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
- gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
- gllm_datastore/graph_data_store/__init__.pyi +6 -0
- gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
- gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
- gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
- gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
- gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
- gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
- gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
- gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
- gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
- gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
- gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
- gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
- gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
- gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
- gllm_datastore/sql_data_store/__init__.pyi +4 -0
- gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
- gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
- gllm_datastore/sql_data_store/constants.pyi +6 -0
- gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
- gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
- gllm_datastore/sql_data_store/types.pyi +31 -0
- gllm_datastore/utils/__init__.pyi +6 -0
- gllm_datastore/utils/converter.pyi +51 -0
- gllm_datastore/utils/dict.pyi +21 -0
- gllm_datastore/utils/ttl.pyi +25 -0
- gllm_datastore/utils/types.pyi +32 -0
- gllm_datastore/vector_data_store/__init__.pyi +6 -0
- gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
- gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
- gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
- gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
- gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
- gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
- gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
- gllm_datastore.build/.gitignore +1 -0
- gllm_datastore.cpython-311-darwin.so +0 -0
- gllm_datastore.pyi +156 -0
- gllm_datastore_binary-0.5.45.dist-info/METADATA +178 -0
- gllm_datastore_binary-0.5.45.dist-info/RECORD +108 -0
- gllm_datastore_binary-0.5.45.dist-info/WHEEL +5 -0
- gllm_datastore_binary-0.5.45.dist-info/top_level.txt +1 -0
gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi

@@ -0,0 +1,216 @@
+import pandas as pd
+from _typeshed import Incomplete
+from concurrent.futures import Future as Future
+from gllm_datastore.encryptor.encryptor import BaseEncryptor as BaseEncryptor
+from gllm_datastore.sql_data_store.adapter.sqlalchemy_adapter import SQLAlchemyAdapter as SQLAlchemyAdapter
+from gllm_datastore.sql_data_store.constants import CREATE_ERROR_MSG as CREATE_ERROR_MSG, DELETE_ERROR_MSG as DELETE_ERROR_MSG, QUERY_ERROR_MSG as QUERY_ERROR_MSG, READ_ERROR_MSG as READ_ERROR_MSG, UNEXPECTED_ERROR_MSG as UNEXPECTED_ERROR_MSG, UPDATE_ERROR_MSG as UPDATE_ERROR_MSG
+from gllm_datastore.sql_data_store.sql_data_store import BaseSQLDataStore as BaseSQLDataStore
+from gllm_datastore.sql_data_store.types import QueryFilter as QueryFilter, QueryOptions as QueryOptions
+from sqlalchemy import Engine
+from sqlalchemy.orm import DeclarativeBase
+from typing import Any
+
+DEFAULT_POOL_SIZE: int
+DEFAULT_MAX_OVERFLOW: int
+DEFAULT_MAX_WORKERS: int
+DEFAULT_BATCH_SIZE: int
+
+class SQLAlchemySQLDataStore(BaseSQLDataStore):
+    """Data store for interacting with SQLAlchemy.
+
+    This class provides methods to interact with a SQL database using SQLAlchemy.
+
+    Attributes:
+        db (Session): The SQLAlchemy session object.
+        engine (Engine): The SQLAlchemy engine object.
+        logger (Logger): The logger object.
+        encryptor (BaseEncryptor | None): The encryptor object to use for encryption.
+        encrypted_table_fields (list[str]): The table.column fields to encrypt.
+    """
+    db: Incomplete
+    engine: Incomplete
+    logger: Incomplete
+    encryptor: Incomplete
+    encrypted_table_fields: Incomplete
+    def __init__(self, engine_or_url: Engine | str, pool_size: int = ..., max_overflow: int = ..., autoflush: bool = True, encryptor: BaseEncryptor | None = None, encrypted_table_fields: list[str] | None = None, **kwargs: Any) -> None:
+        '''Initialize SQLAlchemySQLDataStore class.
+
+        Args:
+            engine_or_url (Engine | str): SQLAlchemy engine object or database URL.
+            pool_size (int, optional): The size of the database connections to be maintained. Defaults to 10.
+            max_overflow (int, optional): The maximum overflow size of the pool. Defaults to 10.
+                This parameter is ignored for SQLite.
+            autoflush (bool, optional): If True, all changes to the database are flushed immediately. Defaults to True.
+            encryptor (BaseEncryptor | None, optional): The encryptor object to use for encryption.
+                Should comply with the BaseEncryptor interface. Defaults to None.
+            encrypted_table_fields (list[str] | None, optional): The table.column fields to encrypt.
+                Format: ["table_name.column_name", "messages.content", "users.email"].
+                Defaults to None, in which case no fields will be encrypted.
+            **kwargs (Any): Additional keyword arguments to support the initialization of the SQLAlchemy adapter.
+
+        Raises:
+            ValueError: If the database adapter is not initialized.
+        '''
+    async def query(self, query: str, params: dict[str, Any] | None = None) -> pd.DataFrame:
+        '''Executes raw SQL queries.
+
+        Preferred for complex queries, when working with legacy schemas without ORM models,
+        or when using an LLM to generate your SQL queries.
+        Use this method when you need advanced SQL operations not supported by read().
+
+        For raw queries, we can\'t determine table context automatically. Therefore, no decryption is performed.
+        Users should handle decryption manually if needed for raw queries.
+
+        Args:
+            query (str): The query string with optional :param style parameters.
+            params (dict[str, Any] | None, optional): Parameters to bind to the query. Defaults to None.
+
+        Returns:
+            pd.DataFrame: The result of the query.
+
+        Note:
+            Using string parameters directly in queries is unsafe and vulnerable to SQL injection.
+            Therefore, please avoid patterns like the following, as they\'re unsafe:
+            ```
+            name = "O\'Connor"
+            query = f"SELECT * FROM users WHERE last_name = \'{name}\'"
+            ```
+            or
+            ```
+            query = "SELECT * FROM users WHERE last_name = \'" + name + "\'"
+            ```
+            Instead, please use parameterized queries with :param style notation as follows:
+            ```
+            query = "SELECT * FROM users WHERE last_name = :last_name"
+            params = {"last_name": "O\'Connor"}
+            ```
+
+        Raises:
+            RuntimeError: If the query fails.
+            RuntimeError: If an unexpected error occurs.
+        '''
+    def create(self, model: DeclarativeBase | list[DeclarativeBase]) -> None:
+        '''Inserts data into the database using SQLAlchemy ORM.
+
+        This method provides a structured way to insert data using ORM models.
+
+        Args:
+            model (DeclarativeBase | list[DeclarativeBase]): An instance or list of instances of SQLAlchemy
+                model to be inserted.
+
+        Example:
+            To insert a row into a table:
+            ```
+            data_store.create(MyModel(column1="value1", column2="value2"))
+            ```
+
+            To insert multiple rows:
+            ```
+            data_store.create([
+                MyModel(column1="value1", column2="value2"),
+                MyModel(column1="value3", column2="value4")
+            ])
+            ```
+
+        Raises:
+            RuntimeError: If the insertion fails.
+            RuntimeError: If an unexpected error occurs.
+        '''
+    def read(self, model_class: type[DeclarativeBase], filters: QueryFilter | None = None, options: QueryOptions | None = None) -> pd.DataFrame:
+        '''Reads data from the database using SQLAlchemy ORM with a structured, type-safe interface.
+
+        This method provides a high-level interface for querying data using ORM models. It supports
+        filtering, column selection, ordering, and limiting results through a type-safe interface.
+
+        Args:
+            model_class (Type[DeclarativeBase]): The SQLAlchemy model class to query.
+            filters (QueryFilter | None, optional): Optional query filters containing column-value pairs
+                to filter the results. Defaults to None.
+            options (QueryOptions | None, optional): Optional query configuration including:
+                - columns: Specific columns to select
+                - order_by: Column to sort by
+                - order_desc: Sort order (ascending/descending)
+                - limit: Maximum number of results
+                Defaults to None.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing the query results.
+
+        Example:
+            ```python
+            data_store.read(
+                Message,
+                filters=QueryFilter(conditions={"conversation_id": "123"}),
+                options=QueryOptions(
+                    columns=["role", "content"],
+                    order_by="created_at",
+                    order_desc=True,
+                    limit=10
+                )
+            )
+            ```
+
+        Raises:
+            RuntimeError: If the read operation fails.
+            RuntimeError: If an unexpected error occurs.
+        '''
+    def update(self, model_class: type[DeclarativeBase], update_values: dict[str, Any], filters: QueryFilter | None = None, **kwargs: Any) -> None:
+        '''Updates data in the database using SQLAlchemy ORM.
+
+        This method provides a structured way to update data using ORM models.
+
+        Args:
+            model_class (Type[DeclarativeBase]): The SQLAlchemy model class to update.
+            update_values (dict[str, Any]): Values to update.
+            filters (QueryFilter | None, optional): Filters to apply to the query. Defaults to None.
+            **kwargs (Any): Additional keyword arguments to support the update method.
+
+        Example:
+            To update a row in a table:
+            ```
+            data_store.update(
+                MyModel,
+                update_values={"column1": "new_value"},
+                filters=QueryFilter(conditions={"id": 1}),
+            )
+            ```
+
+        Note:
+            Encrypted fields cannot be used in update conditions due to non-deterministic encryption.
+            Use non-encrypted fields (like \'id\') for update conditions.
+
+        Raises:
+            ValueError: If encrypted fields are used in update conditions.
+            RuntimeError: If the update operation fails.
+            RuntimeError: If an unexpected error occurs.
+        '''
+    def delete(self, model_class: type[DeclarativeBase], filters: QueryFilter | None = None, allow_delete_all: bool = False, **kwargs: Any) -> None:
+        '''Deletes data from the database using SQLAlchemy ORM.
+
+        This method provides a structured way to delete data using ORM models.
+
+        Args:
+            model_class (Type[DeclarativeBase]): The SQLAlchemy model class to delete.
+            filters (QueryFilter | None, optional): Filters to apply to the query. Defaults to None.
+            allow_delete_all (bool, optional): If True, allows deletion of all records. Defaults to False.
+            **kwargs (Any): Additional keyword arguments to support the delete method.
+
+        Example:
+            To delete a row from a table:
+            ```
+            data_store.delete(
+                MyModel,
+                filters=QueryFilter(conditions={"id": 1})
+            )
+            ```
+
+        Note:
+            Encrypted fields cannot be used in delete conditions due to non-deterministic encryption.
+            Use non-encrypted fields (like \'id\') for deletion conditions.
+
+        Raises:
+            ValueError: If no filters are provided (to prevent accidental deletion of all records).
+            ValueError: If encrypted fields are used in delete conditions.
+            RuntimeError: If the delete operation fails.
+            RuntimeError: If an unexpected error occurs.
+        '''
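Taken together, these stubs describe a small CRUD surface plus an async raw-query escape hatch. A minimal usage sketch, assuming the documented behavior: the `User` model, database URL, and column names are hypothetical, and creating tables via the exposed `engine` attribute is an assumption, since the stub types it as `Incomplete`.

```python
import asyncio

from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

from gllm_datastore.sql_data_store.sqlalchemy_sql_data_store import SQLAlchemySQLDataStore
from gllm_datastore.sql_data_store.types import QueryFilter, QueryOptions


class Base(DeclarativeBase):
    pass


class User(Base):  # hypothetical model, not part of the package
    __tablename__ = "users"
    id: Mapped[int] = mapped_column(primary_key=True)
    last_name: Mapped[str] = mapped_column()


async def main() -> None:
    store = SQLAlchemySQLDataStore("sqlite:///example.db")
    Base.metadata.create_all(store.engine)  # assumes `engine` is a usable SQLAlchemy Engine

    # Structured CRUD through the ORM-based interface.
    store.create(User(id=1, last_name="O'Connor"))
    frame = store.read(
        User,
        filters=QueryFilter(conditions={"last_name": "O'Connor"}),
        options=QueryOptions(columns=["id", "last_name"], order_by="id", order_desc=False, limit=10),
    )
    print(frame)
    store.update(User, update_values={"last_name": "Connor"}, filters=QueryFilter(conditions={"id": 1}))
    store.delete(User, filters=QueryFilter(conditions={"id": 1}))

    # Raw SQL goes through the async query() method; always bind values with :param notation.
    frame = await store.query(
        "SELECT * FROM users WHERE last_name = :last_name",
        params={"last_name": "Connor"},
    )
    print(frame)


asyncio.run(main())
```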
gllm_datastore/sql_data_store/types.pyi

@@ -0,0 +1,31 @@
+from pydantic import BaseModel
+from typing import Any, Sequence
+
+class QueryFilter(BaseModel):
+    '''Model for query filters.
+
+    Attributes:
+        conditions (dict[str, Any]): The conditions for filtering the query.
+
+    Example:
+        QueryFilter(conditions={"column1": "value1", "column2": "value2"})
+    '''
+    conditions: dict[str, Any]
+
+class QueryOptions(BaseModel):
+    '''Model for query options.
+
+    Attributes:
+        columns (Sequence[str] | None): The columns to include in the query result. Defaults to None.
+        fields (Sequence[str] | None): The fields to include in the query result. Defaults to None.
+        order_by (str | None): The column to order the query result by. Defaults to None.
+        order_desc (bool): Whether to order the query result in descending order. Defaults to False.
+        limit (int | None): The maximum number of rows to return. Defaults to None.
+
+    Example:
+        QueryOptions(fields=["field1", "field2"], order_by="column1", order_desc=True, limit=10)
+    '''
+    columns: Sequence[str] | None
+    order_by: str | None
+    order_desc: bool
+    limit: int | None
gllm_datastore/utils/__init__.pyi

@@ -0,0 +1,6 @@
+from gllm_datastore.utils.converter import from_langchain as from_langchain
+from gllm_datastore.utils.dict import flatten_dict as flatten_dict
+from gllm_datastore.utils.ttl import convert_ttl_to_seconds as convert_ttl_to_seconds
+from gllm_datastore.utils.types import QueryFilter as QueryFilter, QueryOptions as QueryOptions
+
+__all__ = ['from_langchain', 'convert_ttl_to_seconds', 'flatten_dict', 'QueryFilter', 'QueryOptions']
gllm_datastore/utils/converter.pyi

@@ -0,0 +1,51 @@
+from gllm_core.schema import Chunk
+from gllm_datastore.constants import SIMILARITY_SCORE as SIMILARITY_SCORE
+from langchain_core.documents import Document
+
+def from_langchain(doc: Document, score: float | None = None) -> Chunk:
+    """Create a standardized Chunk from a LangChain Document.
+
+    Args:
+        doc (Document): The document to create a Chunk from.
+        score (float | None, optional): The score to assign to the Chunk. Defaults to None, in which case it will
+            attempt to get the score from the `score` metadata.
+
+    Returns:
+        Chunk: The standardized Chunk object.
+    """
+def to_langchain(chunk: Chunk) -> Document:
+    """Create a LangChain Document from a standardized Chunk.
+
+    Args:
+        chunk (Chunk): The standardized Chunk to create a Document from.
+
+    Returns:
+        Document: The LangChain Document object.
+    """
+def l2_distance_to_similarity_score(distance: float) -> float:
+    """Convert distance to similarity.
+
+    Args:
+        distance (float): The distance value to convert. Ranges in [0, inf].
+
+    Returns:
+        float: The converted similarity value.
+    """
+def cosine_distance_to_similarity_score(distance: float) -> float:
+    """Convert cosine distance to similarity.
+
+    Args:
+        distance (float): The cosine distance value to convert. Ranges in [0, 2].
+
+    Returns:
+        float: The converted similarity value. Ranges in [0, 1].
+    """
+def similarity_score_to_cosine_distance(similarity: float) -> float:
+    """Convert similarity to cosine distance.
+
+    Args:
+        similarity (float): The similarity value to convert. Ranges in [0, 1].
+
+    Returns:
+        float: The converted cosine distance value. Ranges in [0, 2].
+    """
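These converters are the package's LangChain bridge. A short sketch of how they compose; the round trip and the inverse relationship between the two cosine helpers are assumptions suggested by the documented ranges, not guarantees from the stubs:

```python
from langchain_core.documents import Document

from gllm_datastore.utils.converter import (
    cosine_distance_to_similarity_score,
    from_langchain,
    similarity_score_to_cosine_distance,
    to_langchain,
)

# Document -> Chunk -> Document; the score is read from the `score`
# metadata key when not passed explicitly.
doc = Document(page_content="hello world", metadata={"source": "a.txt", "score": 0.9})
chunk = from_langchain(doc)
doc_again = to_langchain(chunk)

# Cosine distance in [0, 2] maps to similarity in [0, 1]; the assert
# assumes the two helpers are mutual inverses over those ranges.
similarity = cosine_distance_to_similarity_score(0.4)
assert 0.0 <= similarity <= 1.0
assert abs(similarity_score_to_cosine_distance(similarity) - 0.4) < 1e-6
```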
gllm_datastore/utils/dict.pyi

@@ -0,0 +1,21 @@
+from typing import Any
+
+def flatten_dict(nested_dict: dict[str, Any], parent_key: str = '', sep: str = '.') -> dict[str, Any]:
+    '''Flatten a nested dictionary into a single level dictionary.
+
+    Args:
+        nested_dict (dict[str, Any]): The nested dictionary to flatten.
+        parent_key (str, optional): The parent key to prepend to the keys in the flattened dictionary.
+            Defaults to empty string.
+        sep (str, optional): The separator to use between the parent key and the child key. Defaults to ".".
+
+    Returns:
+        dict[str, Any]: The flattened dictionary.
+
+    Examples:
+        ```python
+        nested = {"a": {"b": 1, "c": 2}, "d": 3}
+        flattened = flatten_dict(nested)
+        # Result: {"a.b": 1, "a.c": 2, "d": 3}
+        ```
+    '''
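The stub omits the function body; a reference implementation consistent with the documented behavior and example (a sketch, not the package's actual code):

```python
from typing import Any


def flatten_dict(nested_dict: dict[str, Any], parent_key: str = "", sep: str = ".") -> dict[str, Any]:
    """Recursively flatten nested dicts, joining keys with `sep`."""
    flat: dict[str, Any] = {}
    for key, value in nested_dict.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, new_key, sep))
        else:
            flat[new_key] = value
    return flat


assert flatten_dict({"a": {"b": 1, "c": 2}, "d": 3}) == {"a.b": 1, "a.c": 2, "d": 3}
```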
gllm_datastore/utils/ttl.pyi

@@ -0,0 +1,25 @@
+from _typeshed import Incomplete
+
+TIME_UNIT_TO_SECOND_MAPPING: Incomplete
+
+def convert_ttl_to_seconds(ttl: str | int) -> int:
+    '''Convert TTL (time-to-live) string with time units to seconds.
+
+    Supported units: s (seconds), m (minutes), h (hours), d (days), w (weeks), y (years).
+
+    Examples:
+        "2m" -> 120 (2 minutes in seconds)
+        "1h" -> 3600 (1 hour in seconds)
+        "1y" -> 31536000 (1 year in seconds)
+        300 -> 300 (numeric input returned as is)
+
+    Args:
+        ttl (str | int): Time to live value with optional unit suffix (e.g., "2m", "1h", "1y")
+            or numeric value in seconds.
+
+    Returns:
+        int: TTL converted to seconds.
+
+    Raises:
+        ValueError: If the input format is invalid.
+    '''
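TIME_UNIT_TO_SECOND_MAPPING is typed as Incomplete, so its contents are not visible in the stub. A sketch consistent with the documented units and examples (mapping values inferred from the examples, not copied from the package):

```python
TIME_UNIT_TO_SECOND_MAPPING = {
    "s": 1,
    "m": 60,
    "h": 3_600,
    "d": 86_400,
    "w": 604_800,
    "y": 31_536_000,  # matches the documented "1y" -> 31536000
}


def convert_ttl_to_seconds(ttl: str | int) -> int:
    """Convert a TTL like "2m" or "1h" (or a plain int) to seconds."""
    if isinstance(ttl, int):
        return ttl  # numeric input returned as is
    text = ttl.strip().lower()
    if text.isdigit():  # bare numeric strings treated as seconds (an assumption)
        return int(text)
    value, unit = text[:-1], text[-1]
    if unit not in TIME_UNIT_TO_SECOND_MAPPING or not value.isdigit():
        raise ValueError(f"Invalid TTL format: {ttl!r}")
    return int(value) * TIME_UNIT_TO_SECOND_MAPPING[unit]


assert convert_ttl_to_seconds("2m") == 120
assert convert_ttl_to_seconds(300) == 300
```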
gllm_datastore/utils/types.pyi

@@ -0,0 +1,32 @@
+from pydantic import BaseModel
+from typing import Any, Sequence
+
+class QueryFilter(BaseModel):
+    '''Model for query filters.
+
+    Attributes:
+        conditions (dict[str, Any]): The conditions for filtering the query.
+
+    Example:
+        QueryFilter(conditions={"column1": "value1", "column2": "value2"})
+    '''
+    conditions: dict[str, Any]
+
+class QueryOptions(BaseModel):
+    '''Model for query options.
+
+    Attributes:
+        columns (Sequence[str] | None): The columns to include in the query result. Defaults to None.
+        fields (Sequence[str] | None): The fields to include in the query result. Defaults to None.
+        order_by (str | None): The column to order the query result by. Defaults to None.
+        order_desc (bool): Whether to order the query result in descending order. Defaults to False.
+        limit (int | None): The maximum number of rows to return. Defaults to None.
+
+    Example:
+        QueryOptions(fields=["field1", "field2"], order_by="column1", order_desc=True, limit=10)
+    '''
+    columns: Sequence[str] | None
+    fields: Sequence[str] | None
+    order_by: str | None
+    order_desc: bool
+    limit: int | None
gllm_datastore/vector_data_store/__init__.pyi

@@ -0,0 +1,6 @@
+from gllm_datastore.vector_data_store.chroma_vector_data_store import ChromaVectorDataStore as ChromaVectorDataStore
+from gllm_datastore.vector_data_store.elasticsearch_vector_data_store import ElasticsearchVectorDataStore as ElasticsearchVectorDataStore
+from gllm_datastore.vector_data_store.in_memory_vector_data_store import InMemoryVectorDataStore as InMemoryVectorDataStore
+from gllm_datastore.vector_data_store.redis_vector_data_store import RedisVectorDataStore as RedisVectorDataStore
+
+__all__ = ['ChromaVectorDataStore', 'ElasticsearchVectorDataStore', 'InMemoryVectorDataStore', 'RedisVectorDataStore']
gllm_datastore/vector_data_store/chroma_vector_data_store.pyi

@@ -0,0 +1,259 @@
+from _typeshed import Incomplete
+from chromadb.types import Where, WhereDocument
+from datetime import datetime
+from enum import Enum
+from gllm_core.schema.chunk import Chunk
+from gllm_datastore.constants import DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
+from gllm_datastore.utils.converter import from_langchain as from_langchain, l2_distance_to_similarity_score as l2_distance_to_similarity_score, to_langchain as to_langchain
+from gllm_datastore.vector_data_store.mixin.cache_compatible_mixin import CacheCompatibleMixin as CacheCompatibleMixin
+from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore as BaseVectorDataStore
+from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
+from langchain_core.embeddings import Embeddings
+from typing import Any
+
+DEFAULT_NUM_CANDIDATES: int
+
+class ChromaClientType(str, Enum):
+    """Enum for different types of ChromaDB clients.
+
+    Attributes:
+        MEMORY (str): Client type for an in-memory data store.
+        PERSISTENT (str): Client type for a persistent data store.
+        HTTP (str): Client type for a client-server architecture.
+    """
+    MEMORY: str
+    PERSISTENT: str
+    HTTP: str
+
+class ChromaVectorDataStore(BaseVectorDataStore, CacheCompatibleMixin):
+    """Datastore for interacting with ChromaDB.
+
+    This class provides methods to interact with ChromaDB for vector storage and retrieval
+    using the langchain-chroma integration.
+
+    Attributes:
+        vector_store (Chroma): The langchain Chroma vector store instance.
+        collection_name (str): The name of the ChromaDB collection to use.
+        num_candidates (int): The maximum number of candidates to consider during search.
+        embedding (BaseEMInvoker | Embeddings | None): The embedding model to perform vectorization.
+    """
+    vector_store: Incomplete
+    collection_name: Incomplete
+    num_candidates: Incomplete
+    def __init__(self, collection_name: str, embedding: BaseEMInvoker | Embeddings | None = None, client_type: ChromaClientType = ..., persist_directory: str | None = None, host: str | None = None, port: int | None = None, headers: dict | None = None, num_candidates: int = ..., **kwargs: Any) -> None:
+        """Initialize the ChromaDB vector data store with langchain-chroma.
+
+        Args:
+            collection_name (str): Name of the collection to use in ChromaDB.
+            embedding (BaseEMInvoker | Embeddings | None, optional): The embedding model to perform vectorization.
+                Defaults to None.
+            client_type (ChromaClientType, optional): Type of ChromaDB client to use.
+                Defaults to ChromaClientType.MEMORY.
+            persist_directory (str | None, optional): Directory to persist vector store data.
+                Required for PERSISTENT client type. Defaults to None.
+            host (str | None, optional): Host address for ChromaDB server.
+                Required for HTTP client type. Defaults to None.
+            port (int | None, optional): Port for ChromaDB server.
+                Required for HTTP client type. Defaults to None.
+            headers (dict | None, optional): Headers for ChromaDB server.
+                Used for HTTP client type. Defaults to None.
+            num_candidates (int, optional): Maximum number of candidates to consider during search.
+                Defaults to DEFAULT_NUM_CANDIDATES.
+            **kwargs: Additional parameters for Chroma initialization.
+
+        Note:
+            num_candidates caps the number of results considered during search. Indexes with more
+            documents need a higher value for all documents to be considered during search. This is due
+            to a bug in Chroma's search algorithm, as discussed in
+            https://github.com/langchain-ai/langchain/issues/1946.
+        """
+    async def get_size(self) -> int:
+        """Returns the total number of vectors in the index.
+
+        If the index is not initialized, returns 0.
+
+        Returns:
+            int: The total number of vectors.
+        """
+    async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, dict[str, str]] | None = None) -> list[Chunk]:
+        '''Query the vector data store for similar chunks with similarity scores.
+
+        Args:
+            query (str): The query string to find similar chunks for.
+            top_k (int, optional): Maximum number of results to return. Defaults to DEFAULT_TOP_K.
+            retrieval_params (dict[str, Any] | None, optional): Additional parameters for retrieval.
+                - filter (Where, optional): A Where type dict used to filter the retrieval by the metadata keys.
+                  E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`.
+                - where_document (WhereDocument, optional): A WhereDocument type dict used to filter the retrieval by
+                  the document content. E.g. `{"$contains": "hello"}`.
+                Defaults to None.
+
+        Returns:
+            list[Chunk]: A list of Chunk objects matching the query, with similarity scores.
+        '''
+    async def query_by_id(self, id: str | list[str]) -> list[Chunk]:
+        """Retrieve chunks by their IDs.
+
+        Args:
+            id (str | list[str]): A single ID or a list of IDs to retrieve.
+
+        Returns:
+            list[Chunk]: A list of retrieved Chunk objects.
+        """
+    async def add_chunks(self, chunks: Chunk | list[Chunk], **kwargs) -> list[str]:
+        """Add chunks to the vector data store.
+
+        Args:
+            chunks (Chunk | list[Chunk]): A single chunk or list of chunks to add.
+            **kwargs: Additional keyword arguments for the add operation.
+
+        Returns:
+            list[str]: List of IDs of the added chunks.
+        """
+    async def delete_chunks(self, where: Where | None = None, where_document: WhereDocument | None = None, **kwargs: Any) -> None:
+        '''Delete chunks from the vector data store.
+
+        Args:
+            where (Where | None, optional): A Where type dict used to filter the deletion by metadata.
+                E.g. `{"source": "mydoc"}`. Defaults to None.
+            where_document (WhereDocument | None, optional): A WhereDocument type dict used to filter the deletion by
+                the document content. E.g. `{"$contains": "hello"}`. Defaults to None.
+            **kwargs: Additional keyword arguments for the delete operation.
+
+        Note:
+            If no filter criteria are provided, all chunks in the collection will be deleted. Please use with caution.
+        '''
+    async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
+        """Delete chunks from the vector data store by IDs.
+
+        Args:
+            ids (str | list[str]): A single ID or a list of IDs to delete.
+            **kwargs: Additional keyword arguments.
+
+        Note:
+            If no IDs are provided, no chunks will be deleted.
+        """
+    async def exact_match(self, key: str, metadata: dict[str, Any] | None = None) -> Any | None:
+        '''Find chunks that exactly match the given key.
+
+        This method searches for documents with the exact original_key in metadata.
+
+        Args:
+            key (str): The key to match.
+            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                For example, `{"key": "value"}`. Defaults to None.
+
+        Returns:
+            Any: The value stored with the exact key match, or None if no match is found.
+        '''
+    async def fuzzy_match(self, key: str, max_distance: int = 2, metadata: dict[str, Any] | None = None) -> Any | None:
+        '''Find chunks that approximately match the given key using fuzzy matching.
+
+        Args:
+            key (str): The key to match.
+            max_distance (int): Maximum allowed Levenshtein distance for fuzzy matching.
+                Higher values are more lenient. Defaults to 2.
+            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                For example, `{"key": "value"}`. Defaults to None.
+
+        Returns:
+            Any: The value with the closest fuzzy match to the key, or None if no match meets the threshold.
+        '''
+    async def semantic_match(self, key: str, min_similarity: float = 0.2, metadata: dict[str, Any] | None = None) -> Any | None:
+        '''Find chunks that semantically match the given key using vector similarity.
+
+        Args:
+            key (str): The key to match.
+            min_similarity (float): Minimum similarity score for semantic matching
+                (higher values are more strict). Ranges from 0 to 1. Defaults to 0.2.
+            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                For example, `{"key": "value"}`. Defaults to None.
+
+        Returns:
+            Any: The semantically closest value, or None if no match meets the min_similarity.
+        '''
+    async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
+        """Delete expired entries (for TTL eviction).
+
+        Args:
+            now (datetime): The current datetime for comparison.
+            max_size (int): The maximum number of entries to delete. Defaults to 10000.
+
+        Raises:
+            NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+        """
+    async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
+        """Delete least frequently used entries (for LFU eviction).
+
+        Args:
+            num_entries (int): Number of entries to delete.
+
+        Raises:
+            NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+        """
+    async def delete_least_recently_used_entries(self, num_entries: int) -> None:
+        """Delete least recently used entries (for LRU eviction).
+
+        Args:
+            num_entries (int): Number of entries to delete.
+
+        Raises:
+            NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+        """
+    async def delete_entries_by_key(self, key: str, metadata: dict[str, Any] | None = None) -> None:
+        '''Delete entries by key.
+
+        Args:
+            key (str): The key to delete entries for.
+            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                For example, `{"key": "value"}`. Defaults to None.
+
+        Raises:
+            NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+        '''
+    async def clear(self) -> None:
+        """Clear all entries in the storage.
+
+        Raises:
+            NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+        """
+    async def query_by_field(self, retrieval_params: dict[str, Any], limit: int | None = None, **kwargs) -> list[Chunk]:
+        """Retrieve documents that match specific metadata constraints.
+
+        This method filters and returns stored chunks based on metadata values
+        rather than vector similarity. It is particularly useful for structured lookups,
+        such as retrieving all chunks from a certain source, tagged with a specific label,
+        or authored by a particular user.
+
+        Unlike semantic search methods, `query_by_field` operates purely on metadata fields
+        associated with each document, allowing precise filtering based on key-value pairs.
+
+        Args:
+            retrieval_params (dict[str, Any]): A dictionary defining filter criteria. Common keys include:
+                - `filter` (dict): A dictionary of metadata field conditions.
+                - `where_document` (dict, optional): Conditions based on document content.
+            limit (int | None, optional): The maximum number of results to return. If None, all matching
+                documents will be returned.
+            **kwargs: Additional arguments to support datastore-specific behavior or filtering logic.
+
+        Returns:
+            list[Chunk]: A list of `Chunk` objects that satisfy the metadata criteria.
+
+        Raises:
+            NotImplementedError: If not implemented in the subclass.
+        """
+    async def query_by_vector(self, vector: list[float], top_k: int = ..., min_similarity: float = 0.8, retrieval_params: dict | None = None) -> list[Chunk]:
+        """Search for documents that are similar to a given vector.
+
+        Args:
+            vector (list[float]): The query embedding vector to compare against stored vectors.
+            top_k (int, optional): The number of top results to return. Defaults to DEFAULT_TOP_K.
+            min_similarity (float): Minimum similarity score for vector similarity. Defaults to 0.8.
+            retrieval_params (dict | None, optional): Filter parameters to narrow the search:
+                - filter (Where): Metadata-based filter.
+                - where_document (WhereDocument): Content-based filter.
+                Defaults to None.
+
+        Returns:
+            list[Chunk]: A list of Chunk objects with similarity scores based on the input vector.
+        """
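To tie the Chroma stubs together, a minimal end-to-end sketch. The Chunk constructor fields (content, metadata) are an assumption about gllm_core's schema, and running without an embedding model relies on whatever default langchain-chroma applies; treat this as orientation, not a verified example:

```python
import asyncio

from gllm_core.schema.chunk import Chunk

from gllm_datastore.vector_data_store import ChromaVectorDataStore


async def main() -> None:
    # Defaults to the in-memory client; pass embedding=... in real use.
    store = ChromaVectorDataStore(collection_name="demo")

    ids = await store.add_chunks([
        Chunk(content="The sky is blue.", metadata={"source": "notes.md", "color": "blue"}),
        Chunk(content="Roses are red.", metadata={"source": "notes.md", "color": "red"}),
    ])
    print(await store.get_size())

    # Metadata-filtered similarity search using Chroma's Where syntax.
    results = await store.query(
        "What color is the sky?",
        top_k=1,
        retrieval_params={"filter": {"color": "blue"}},
    )
    print([chunk.content for chunk in results])

    await store.delete_chunks_by_ids(ids)


asyncio.run(main())
```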