gllm-datastore-binary 0.5.50__cp312-cp312-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. gllm_datastore/__init__.pyi +0 -0
  2. gllm_datastore/cache/__init__.pyi +4 -0
  3. gllm_datastore/cache/base.pyi +84 -0
  4. gllm_datastore/cache/cache.pyi +137 -0
  5. gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
  6. gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
  7. gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
  8. gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
  9. gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
  10. gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
  11. gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
  12. gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
  13. gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
  14. gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
  15. gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
  16. gllm_datastore/cache/utils.pyi +34 -0
  17. gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
  18. gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
  19. gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
  20. gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
  21. gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
  22. gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
  23. gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
  24. gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
  25. gllm_datastore/constants.pyi +66 -0
  26. gllm_datastore/core/__init__.pyi +7 -0
  27. gllm_datastore/core/capabilities/__init__.pyi +7 -0
  28. gllm_datastore/core/capabilities/encryption_capability.pyi +21 -0
  29. gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
  30. gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
  31. gllm_datastore/core/capabilities/hybrid_capability.pyi +184 -0
  32. gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
  33. gllm_datastore/core/filters/__init__.pyi +4 -0
  34. gllm_datastore/core/filters/filter.pyi +340 -0
  35. gllm_datastore/core/filters/schema.pyi +149 -0
  36. gllm_datastore/data_store/__init__.pyi +8 -0
  37. gllm_datastore/data_store/_elastic_core/__init__.pyi +0 -0
  38. gllm_datastore/data_store/_elastic_core/client_factory.pyi +66 -0
  39. gllm_datastore/data_store/_elastic_core/constants.pyi +27 -0
  40. gllm_datastore/data_store/_elastic_core/elastic_like_core.pyi +115 -0
  41. gllm_datastore/data_store/_elastic_core/index_manager.pyi +37 -0
  42. gllm_datastore/data_store/_elastic_core/query_translator.pyi +89 -0
  43. gllm_datastore/data_store/base.pyi +176 -0
  44. gllm_datastore/data_store/chroma/__init__.pyi +4 -0
  45. gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
  46. gllm_datastore/data_store/chroma/data_store.pyi +201 -0
  47. gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
  48. gllm_datastore/data_store/chroma/query.pyi +266 -0
  49. gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
  50. gllm_datastore/data_store/chroma/vector.pyi +197 -0
  51. gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
  52. gllm_datastore/data_store/elasticsearch/data_store.pyi +147 -0
  53. gllm_datastore/data_store/elasticsearch/fulltext.pyi +238 -0
  54. gllm_datastore/data_store/elasticsearch/query.pyi +118 -0
  55. gllm_datastore/data_store/elasticsearch/query_translator.pyi +18 -0
  56. gllm_datastore/data_store/elasticsearch/vector.pyi +180 -0
  57. gllm_datastore/data_store/exceptions.pyi +35 -0
  58. gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
  59. gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
  60. gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
  61. gllm_datastore/data_store/in_memory/query.pyi +175 -0
  62. gllm_datastore/data_store/in_memory/vector.pyi +174 -0
  63. gllm_datastore/data_store/opensearch/__init__.pyi +5 -0
  64. gllm_datastore/data_store/opensearch/data_store.pyi +160 -0
  65. gllm_datastore/data_store/opensearch/fulltext.pyi +240 -0
  66. gllm_datastore/data_store/opensearch/query.pyi +89 -0
  67. gllm_datastore/data_store/opensearch/query_translator.pyi +18 -0
  68. gllm_datastore/data_store/opensearch/vector.pyi +211 -0
  69. gllm_datastore/data_store/redis/__init__.pyi +5 -0
  70. gllm_datastore/data_store/redis/data_store.pyi +153 -0
  71. gllm_datastore/data_store/redis/fulltext.pyi +128 -0
  72. gllm_datastore/data_store/redis/query.pyi +428 -0
  73. gllm_datastore/data_store/redis/query_translator.pyi +37 -0
  74. gllm_datastore/data_store/redis/vector.pyi +131 -0
  75. gllm_datastore/data_store/sql/__init__.pyi +4 -0
  76. gllm_datastore/data_store/sql/constants.pyi +5 -0
  77. gllm_datastore/data_store/sql/data_store.pyi +201 -0
  78. gllm_datastore/data_store/sql/fulltext.pyi +164 -0
  79. gllm_datastore/data_store/sql/query.pyi +81 -0
  80. gllm_datastore/data_store/sql/query_translator.pyi +51 -0
  81. gllm_datastore/data_store/sql/schema.pyi +16 -0
  82. gllm_datastore/encryptor/__init__.pyi +4 -0
  83. gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
  84. gllm_datastore/encryptor/capability/__init__.pyi +3 -0
  85. gllm_datastore/encryptor/capability/mixin.pyi +32 -0
  86. gllm_datastore/encryptor/encryptor.pyi +52 -0
  87. gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
  88. gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
  89. gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
  90. gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
  91. gllm_datastore/graph_data_store/__init__.pyi +6 -0
  92. gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
  93. gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
  94. gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
  95. gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
  96. gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
  97. gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
  98. gllm_datastore/graph_data_store/mixins/__init__.pyi +3 -0
  99. gllm_datastore/graph_data_store/mixins/agentic_graph_tools_mixin.pyi +175 -0
  100. gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
  101. gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
  102. gllm_datastore/graph_data_store/schema.pyi +27 -0
  103. gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
  104. gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
  105. gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
  106. gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
  107. gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
  108. gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
  109. gllm_datastore/signature/__init__.pyi +0 -0
  110. gllm_datastore/signature/webhook_signature.pyi +31 -0
  111. gllm_datastore/sql_data_store/__init__.pyi +4 -0
  112. gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
  113. gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
  114. gllm_datastore/sql_data_store/constants.pyi +6 -0
  115. gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
  116. gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
  117. gllm_datastore/sql_data_store/types.pyi +31 -0
  118. gllm_datastore/utils/__init__.pyi +6 -0
  119. gllm_datastore/utils/converter.pyi +51 -0
  120. gllm_datastore/utils/dict.pyi +21 -0
  121. gllm_datastore/utils/ttl.pyi +25 -0
  122. gllm_datastore/utils/types.pyi +32 -0
  123. gllm_datastore/vector_data_store/__init__.pyi +6 -0
  124. gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
  125. gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
  126. gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
  127. gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
  128. gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
  129. gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
  130. gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
  131. gllm_datastore.build/.gitignore +1 -0
  132. gllm_datastore.cpython-312-darwin.so +0 -0
  133. gllm_datastore.pyi +178 -0
  134. gllm_datastore_binary-0.5.50.dist-info/METADATA +185 -0
  135. gllm_datastore_binary-0.5.50.dist-info/RECORD +137 -0
  136. gllm_datastore_binary-0.5.50.dist-info/WHEEL +5 -0
  137. gllm_datastore_binary-0.5.50.dist-info/top_level.txt +1 -0
@@ -0,0 +1,197 @@
1
+ from _typeshed import Incomplete
2
+ from chromadb import ClientAPI
3
+ from gllm_core.schema import Chunk
4
+ from gllm_datastore.constants import CHUNK_KEYS as CHUNK_KEYS, DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
5
+ from gllm_datastore.core.filters import FilterClause as FilterClause, QueryFilter as QueryFilter, QueryOptions as QueryOptions
6
+ from gllm_datastore.data_store.chroma._chroma_import import safe_import_chromadb as safe_import_chromadb
7
+ from gllm_datastore.data_store.chroma.query import ChromaCollectionKeys as ChromaCollectionKeys, DEFAULT_NUM_CANDIDATES as DEFAULT_NUM_CANDIDATES, build_chroma_delete_kwargs as build_chroma_delete_kwargs, build_chroma_get_kwargs as build_chroma_get_kwargs
8
+ from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator as ChromaQueryTranslator
9
+ from gllm_datastore.utils.converter import from_langchain as from_langchain, l2_distance_to_similarity_score as l2_distance_to_similarity_score, to_langchain as to_langchain
10
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
11
+ from gllm_inference.schema import Vector
12
+ from typing import Any
13
+
14
+ chromadb: Incomplete
15
+
16
+ class ChromaVectorCapability:
17
+ """ChromaDB implementation of VectorCapability protocol.
18
+
19
+ This class provides document CRUD operations and vector search using ChromaDB.
20
+
21
+ Attributes:
22
+ collection_name (str): The name of the ChromaDB collection.
23
+ collection (Collection): The ChromaDB collection instance.
24
+ vector_store (Chroma): The langchain Chroma vector store instance.
25
+ num_candidates (int): The maximum number of candidates to consider during search.
26
+ """
27
+ collection_name: Incomplete
28
+ client: Incomplete
29
+ collection: Incomplete
30
+ num_candidates: Incomplete
31
+ vector_store: Incomplete
32
+ def __init__(self, collection_name: str, em_invoker: BaseEMInvoker, client: ClientAPI, num_candidates: int = ...) -> None:
33
+ """Initialize the ChromaDB vector capability.
34
+
35
+ Args:
36
+ collection_name (str): The name of the ChromaDB collection.
37
+ em_invoker (BaseEMInvoker): The embedding model to perform vectorization.
38
+ client (ClientAPI): The ChromaDB client instance.
39
+ num_candidates (int, optional): Maximum number of candidates to consider during search.
40
+ Defaults to 50.
41
+ """
42
+ @property
43
+ def em_invoker(self) -> BaseEMInvoker:
44
+ """Returns the EM Invoker instance.
45
+
46
+ Returns:
47
+ BaseEMInvoker: The EM Invoker instance.
48
+ """
49
+ async def ensure_index(self) -> None:
50
+ """Ensure ChromaDB collection exists, creating it if necessary.
51
+
52
+ This method is idempotent - if the collection already exists, it will return
53
+ the existing collection. The collection is automatically created during initialization,
54
+ but this method can be called explicitly to ensure it exists.
55
+
56
+ Raises:
57
+ RuntimeError: If collection creation fails.
58
+ """
59
+ async def create(self, data: Chunk | list[Chunk], **kwargs: Any) -> None:
60
+ """Add chunks to the vector store with automatic embedding generation.
61
+
62
+ Args:
63
+ data (Chunk | list[Chunk]): Single chunk or list of chunks to add.
64
+ **kwargs: Backend-specific parameters.
65
+ """
66
+ async def create_from_vector(self, chunk_vectors: list[tuple[Chunk, Vector]], **kwargs: Any) -> None:
67
+ '''Add pre-computed embeddings directly.
68
+
69
+ Examples:
70
+ ```python
71
+ await datastore.vector.create_from_vector(chunk_vectors=[
72
+ (Chunk(content="text1", metadata={"source": "source1"}, id="id1"), [0.1, 0.2, 0.3]),
73
+ (Chunk(content="text2", metadata={"source": "source2"}, id="id2"), [0.4, 0.5, 0.6]),
74
+ ])
75
+ ```
76
+
77
+ Args:
78
+ chunk_vectors (list[tuple[Chunk, Vector]]): List of tuples containing chunks and their
79
+ corresponding vectors.
80
+ **kwargs: Datastore-specific parameters.
81
+ '''
82
+ async def retrieve(self, query: str, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> list[Chunk]:
83
+ '''Semantic search using text query converted to vector.
84
+
85
+ Examples:
86
+ ```python
87
+ from gllm_datastore.core.filters import filter as F
88
+
89
+ # Direct FilterClause usage
90
+ await datastore.vector.retrieve(
91
+ query="What is the capital of France?",
92
+ filters=F.eq("metadata.category", "tech")
93
+ )
94
+
95
+ # Multiple filters
96
+ filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
97
+ await datastore.vector.retrieve(query="What is the capital of France?", filters=filters)
98
+ ```
99
+ This will retrieve the top 10 chunks by similarity score from the vector store
100
+ that match the query and the filters. The chunks will be sorted by score in descending order.
101
+
102
+ Args:
103
+ query (str): Text query to embed and search for.
104
+ filters (FilterClause | QueryFilter | None, optional): Filters to apply to the search.
105
+ FilterClause objects are automatically converted to QueryFilter internally.
106
+ Defaults to None.
107
+ options (QueryOptions | None, optional): Options to apply to the search. Defaults to None.
108
+
109
+ Returns:
110
+ list[Chunk]: List of chunks ordered by relevance score.
111
+ '''
112
+ async def retrieve_by_vector(self, vector: Vector, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> list[Chunk]:
113
+ '''Direct vector similarity search.
114
+
115
+ Examples:
116
+ ```python
117
+ from gllm_datastore.core.filters import filter as F
118
+
119
+ # Direct FilterClause usage
120
+ await datastore.vector.retrieve_by_vector(
121
+ vector=[0.1, 0.2, 0.3],
122
+ filters=F.eq("metadata.category", "tech")
123
+ )
124
+
125
+ # Multiple filters
126
+ filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
127
+ await datastore.vector.retrieve_by_vector(vector=[0.1, 0.2, 0.3], filters=filters)
128
+ ```
129
+ This will retrieve the top 10 chunks by similarity score from the vector store
130
+ that match the vector and the filters. The chunks will be sorted by score in descending order.
131
+
132
+ Args:
133
+ vector (Vector): Query embedding vector.
134
+ filters (FilterClause | QueryFilter | None, optional): Filters to apply to the search.
135
+ FilterClause objects are automatically converted to QueryFilter internally.
136
+ Defaults to None.
137
+ options (QueryOptions | None, optional): Options to apply to the search. Defaults to None.
138
+
139
+ Returns:
140
+ list[Chunk]: List of chunks ordered by similarity score.
141
+ '''
142
+ async def update(self, update_values: dict[str, Any], filters: FilterClause | QueryFilter | None = None) -> None:
143
+ '''Update existing records in the datastore.
144
+
145
+ Examples:
146
+ ```python
147
+ from gllm_datastore.core.filters import filter as F
148
+
149
+ # Direct FilterClause usage
150
+ await datastore.vector.update(
151
+ update_values={"metadata": {"status": "published"}},
152
+ filters=F.eq("metadata.category", "tech"),
153
+ )
154
+
155
+ # Multiple filters
156
+ filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
157
+ await datastore.vector.update(
158
+ update_values={"metadata": {"status": "published"}},
159
+ filters=filters,
160
+ )
161
+ ```
162
+ This will update the metadata of the chunks that match the filters to "published".
163
+
164
+ Args:
165
+ update_values (dict[str, Any]): Values to update.
166
+ filters (FilterClause | QueryFilter | None, optional): Filters to select records to update.
167
+ FilterClause objects are automatically converted to QueryFilter internally.
168
+ Defaults to None, in which case no operation is performed (no-op).
169
+
170
+ Note:
171
+ ChromaDB doesn\'t support direct update operations. This method requires
172
+ filters to identify records and will update matching records.
173
+ '''
174
+ async def delete(self, filters: FilterClause | QueryFilter | None = None, **kwargs: Any) -> None:
175
+ '''Delete records from the datastore.
176
+
177
+ Examples:
178
+ ```python
179
+ from gllm_datastore.core.filters import filter as F
180
+
181
+ # Direct FilterClause usage
182
+ await datastore.vector.delete(filters=F.eq("metadata.category", "tech"))
183
+
184
+ # Multiple filters
185
+ filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
186
+ await datastore.vector.delete(filters=filters)
187
+ ```
188
+ This will delete all chunks from the vector store that match the filters.
189
+
190
+ Args:
191
+ filters (FilterClause | QueryFilter | None, optional): Filters to select records to delete.
192
+ FilterClause objects are automatically converted to QueryFilter internally.
193
+ Defaults to None, in which case no operation is performed (no-op).
194
+ **kwargs: Datastore-specific parameters.
195
+ '''
196
+ async def clear(self) -> None:
197
+ """Clear all records from the datastore."""
@@ -0,0 +1,5 @@
1
+ from gllm_datastore.data_store.elasticsearch.data_store import ElasticsearchDataStore as ElasticsearchDataStore
2
+ from gllm_datastore.data_store.elasticsearch.fulltext import ElasticsearchFulltextCapability as ElasticsearchFulltextCapability
3
+ from gllm_datastore.data_store.elasticsearch.vector import ElasticsearchVectorCapability as ElasticsearchVectorCapability
4
+
5
+ __all__ = ['ElasticsearchDataStore', 'ElasticsearchFulltextCapability', 'ElasticsearchVectorCapability']
@@ -0,0 +1,147 @@
1
+ from _typeshed import Incomplete
2
+ from elasticsearch import AsyncElasticsearch
3
+ from gllm_datastore.constants import DEFAULT_REQUEST_TIMEOUT as DEFAULT_REQUEST_TIMEOUT
4
+ from gllm_datastore.core.filters.schema import FilterClause as FilterClause, QueryFilter as QueryFilter
5
+ from gllm_datastore.data_store._elastic_core.client_factory import EngineType as EngineType, create_client as create_client
6
+ from gllm_datastore.data_store.base import BaseDataStore as BaseDataStore, CapabilityType as CapabilityType
7
+ from gllm_datastore.data_store.elasticsearch.fulltext import ElasticsearchFulltextCapability as ElasticsearchFulltextCapability
8
+ from gllm_datastore.data_store.elasticsearch.query_translator import ElasticsearchQueryTranslator as ElasticsearchQueryTranslator
9
+ from gllm_datastore.data_store.elasticsearch.vector import ElasticsearchVectorCapability as ElasticsearchVectorCapability
10
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
11
+ from langchain_elasticsearch.vectorstores import AsyncRetrievalStrategy
12
+ from typing import Any
13
+
14
+ class ElasticsearchDataStore(BaseDataStore):
15
+ '''Elasticsearch data store with multiple capability support.
16
+
17
+ This is the explicit public API for Elasticsearch. Users know they\'re
18
+ using Elasticsearch, not a generic "elastic-like" datastore.
19
+
20
+ Attributes:
21
+ engine (str): Always "elasticsearch" for explicit identification.
22
+ This attribute ensures users know they\'re using Elasticsearch, not a generic
23
+ "elastic-like" datastore.
24
+ index_name (str): The name of the Elasticsearch index.
25
+ client (AsyncElasticsearch): AsyncElasticsearch client.
26
+ '''
27
+ engine: str
28
+ client: Incomplete
29
+ index_name: Incomplete
30
+ def __init__(self, index_name: str, client: AsyncElasticsearch | None = None, url: str | None = None, cloud_id: str | None = None, api_key: str | None = None, username: str | None = None, password: str | None = None, request_timeout: int = ...) -> None:
31
+ '''Initialize the Elasticsearch data store.
32
+
33
+ Args:
34
+ index_name (str): The name of the Elasticsearch index to use for operations.
35
+ This index name will be used for all queries and operations.
36
+ client (AsyncElasticsearch | None, optional): Pre-configured Elasticsearch client instance.
37
+ If provided, it will be used instead of creating a new client from url/cloud_id.
38
+ Must be an instance of AsyncElasticsearch. Defaults to None.
39
+ url (str | None, optional): The URL of the Elasticsearch server.
40
+ For example, "http://localhost:9200". Either url or cloud_id must be provided
41
+ if client is None. Defaults to None.
42
+ cloud_id (str | None, optional): The cloud ID of the Elasticsearch cluster.
43
+ Used for Elastic Cloud connections. Either url or cloud_id must be provided
44
+ if client is None. Defaults to None.
45
+ api_key (str | None, optional): The API key for authentication.
46
+ If provided, will be used for authentication. Mutually exclusive with username/password.
47
+ Defaults to None.
48
+ username (str | None, optional): The username for basic authentication.
49
+ Must be provided together with password. Mutually exclusive with api_key.
50
+ Defaults to None.
51
+ password (str | None, optional): The password for basic authentication.
52
+ Must be provided together with username. Mutually exclusive with api_key.
53
+ Defaults to None.
54
+ request_timeout (int, optional): The request timeout in seconds.
55
+ Defaults to DEFAULT_REQUEST_TIMEOUT.
56
+
57
+ Raises:
58
+ ValueError: If neither url nor cloud_id is provided when client is None.
59
+ TypeError: If client is provided but is not an instance of AsyncElasticsearch.
60
+ '''
61
+ @property
62
+ def supported_capabilities(self) -> list[str]:
63
+ """Return list of currently supported capabilities.
64
+
65
+ Returns:
66
+ list[str]: List of capability names that are supported.
67
+ """
68
+ @property
69
+ def fulltext(self) -> ElasticsearchFulltextCapability:
70
+ """Access fulltext capability if supported.
71
+
72
+ This method uses the logic of its parent class to return the fulltext capability handler.
73
+ This method overrides the parent class to return the ElasticsearchFulltextCapability handler for better
74
+ type hinting.
75
+
76
+ Returns:
77
+ ElasticsearchFulltextCapability: Fulltext capability handler.
78
+
79
+ Raises:
80
+ NotSupportedException: If fulltext capability is not supported.
81
+ """
82
+ @property
83
+ def vector(self) -> ElasticsearchVectorCapability:
84
+ """Access vector capability if supported.
85
+
86
+ This method uses the logic of its parent class to return the vector capability handler.
87
+ This method overrides the parent class to return the ElasticsearchVectorCapability handler for better
88
+ type hinting.
89
+
90
+ Returns:
91
+ ElasticsearchVectorCapability: Vector capability handler.
92
+
93
+ Raises:
94
+ NotSupportedException: If vector capability is not supported.
95
+ """
96
+ def with_fulltext(self, index_name: str | None = None, query_field: str = 'text') -> ElasticsearchDataStore:
97
+ '''Configure fulltext capability and return datastore instance.
98
+
99
+ This method uses the logic of its parent class to configure the fulltext capability.
100
+ This method overrides the parent class for better type hinting.
101
+
102
+ Args:
103
+ index_name (str | None, optional): The name of the Elasticsearch index to use for fulltext operations.
104
+ If None, uses the default index_name from the datastore instance.
105
+ Defaults to None.
106
+ query_field (str, optional): The field name to use for text content in queries.
107
+ This field will be used for BM25 and other text search operations.
108
+ Defaults to "text".
109
+
110
+ Returns:
111
+ ElasticsearchDataStore: Self for method chaining.
112
+ '''
113
+ def with_vector(self, em_invoker: BaseEMInvoker, index_name: str | None = None, query_field: str = 'text', vector_query_field: str = 'vector', retrieval_strategy: AsyncRetrievalStrategy | None = None, distance_strategy: str | None = None) -> ElasticsearchDataStore:
114
+ '''Configure vector capability and return datastore instance.
115
+
116
+ This method uses the logic of its parent class to configure the vector capability.
117
+ This method overrides the parent class for better type hinting.
118
+
119
+ Args:
120
+ em_invoker (BaseEMInvoker): The embedding model to perform vectorization.
121
+ index_name (str | None, optional): The name of the Elasticsearch index. Defaults to None,
122
+ in which case the default class attribute will be utilized.
123
+ query_field (str, optional): The field name for text queries. Defaults to "text".
124
+ vector_query_field (str, optional): The field name for vector queries. Defaults to "vector".
125
+ retrieval_strategy (AsyncRetrievalStrategy | None, optional): The retrieval strategy for retrieval.
126
+ Defaults to None, in which case DenseVectorStrategy() is used.
127
+ distance_strategy (str | None, optional): The distance strategy for retrieval. Defaults to None.
128
+
129
+ Returns:
130
+ Self: Self for method chaining.
131
+ '''
132
+ @classmethod
133
+ def translate_query_filter(cls, query_filter: FilterClause | QueryFilter | None) -> dict[str, Any] | None:
134
+ """Translate QueryFilter or FilterClause to Elasticsearch native filter syntax.
135
+
136
+ This method delegates to the ElasticsearchQueryTranslator and returns the result as a dictionary.
137
+
138
+ Args:
139
+ query_filter (FilterClause | QueryFilter | None): The filter to translate.
140
+ Can be a single FilterClause, a QueryFilter with multiple clauses and logical conditions,
141
+ or None for empty filters. FilterClause objects are automatically converted to QueryFilter.
142
+
143
+ Returns:
144
+ dict[str, Any] | None: The translated filter as an Elasticsearch DSL dictionary.
145
+ Returns None for empty filters or when query_filter is None.
146
+ The dictionary format matches Elasticsearch Query DSL syntax.
147
+ """
@@ -0,0 +1,238 @@
1
+ from _typeshed import Incomplete
2
+ from elasticsearch import AsyncElasticsearch
3
+ from enum import StrEnum
4
+ from gllm_core.schema import Chunk
5
+ from gllm_datastore.constants import METADATA_KEYS as METADATA_KEYS
6
+ from gllm_datastore.core.filters.schema import FilterClause as FilterClause, QueryFilter as QueryFilter, QueryOptions as QueryOptions
7
+ from gllm_datastore.data_store._elastic_core.elastic_like_core import ElasticLikeCore as ElasticLikeCore
8
+ from gllm_datastore.data_store._elastic_core.query_translator import convert_filter_clause as convert_filter_clause
9
+ from gllm_datastore.data_store.elasticsearch.query import apply_filter_query_to_search as apply_filter_query_to_search, apply_filters_and_options as apply_filters_and_options, create_search_with_filters as create_search_with_filters, delete_by_id as delete_by_id, delete_by_query as delete_by_query, safe_execute as safe_execute, translate_filter as translate_filter, update_by_query as update_by_query, validate_query_length as validate_query_length
10
+ from typing import Any, Literal, overload
11
+
12
+ class SupportedQueryMethods(StrEnum):
13
+ """Supported query methods for Elasticsearch fulltext capability."""
14
+ AUTOCOMPLETE: str
15
+ AUTOSUGGEST: str
16
+ BM25: str
17
+ BY_FIELD: str
18
+ SHINGLES: str
19
+
20
# Query methods that require a non-empty query string to be supplied.
# NOTE(review): typed as Incomplete by stubgen; presumably a collection of
# SupportedQueryMethods members — confirm against the implementation.
QUERY_REQUIRED_STRATEGIES: Incomplete
22
class ElasticsearchFulltextCapability:
    """Elasticsearch implementation of FulltextCapability protocol.

    This class provides document CRUD operations and flexible querying using Elasticsearch.

    Attributes:
        index_name (str): The name of the Elasticsearch index.
        client (AsyncElasticsearch): AsyncElasticsearch client.
        query_field (str): The field name to use for text content.
    """

    # Runtime types per the class docstring: str, AsyncElasticsearch, str.
    index_name: Incomplete
    client: Incomplete
    query_field: Incomplete

    def __init__(self, index_name: str, client: AsyncElasticsearch, query_field: str = 'text') -> None:
        '''Initialize the Elasticsearch fulltext capability.

        Args:
            index_name (str): The name of the Elasticsearch index.
            client (AsyncElasticsearch): The Elasticsearch client.
            query_field (str, optional): The field name to use for text content. Defaults to "text".
        '''
    async def get_size(self) -> int:
        """Returns the total number of documents in the index.

        Returns:
            int: The total number of documents.
        """
    async def create(self, data: Chunk | list[Chunk], **kwargs: Any) -> None:
        """Create new records in the datastore.

        Args:
            data (Chunk | list[Chunk]): Data to create (single item or collection).
            **kwargs: Backend-specific parameters forwarded to Elasticsearch bulk API.

        Raises:
            ValueError: If data structure is invalid.
        """
    # Overloads dispatch on `strategy` to the retrieve_* methods below.
    @overload
    async def retrieve(self, strategy: Literal[SupportedQueryMethods.BY_FIELD] | None = ..., query: str | None = None, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None, **kwargs: Any) -> list[Chunk]: ...
    @overload
    async def retrieve(self, strategy: Literal[SupportedQueryMethods.BM25], query: str, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None, k1: float | None = None, b: float | None = None, **kwargs: Any) -> list[Chunk]: ...
    @overload
    async def retrieve(self, strategy: Literal[SupportedQueryMethods.AUTOCOMPLETE], query: str, field: str, size: int = 20, fuzzy_tolerance: int = 1, min_prefix_length: int = 3, filter_query: dict[str, Any] | None = None, **kwargs: Any) -> list[str]: ...
    # NOTE(review): the AUTOSUGGEST and SHINGLES overloads declare
    # `filter_query: dict[str, Any] | None`, but the concrete retrieve_autosuggest /
    # retrieve_shingles methods take `filters: QueryFilter | None` — confirm the
    # dispatcher translates between the two, or align the parameter names.
    @overload
    async def retrieve(self, strategy: Literal[SupportedQueryMethods.AUTOSUGGEST], query: str, search_fields: list[str], autocomplete_field: str, size: int = 20, min_length: int = 3, filter_query: dict[str, Any] | None = None, **kwargs: Any) -> list[str]: ...
    @overload
    async def retrieve(self, strategy: Literal[SupportedQueryMethods.SHINGLES], query: str, field: str, size: int = 20, min_length: int = 3, max_length: int = 30, filter_query: dict[str, Any] | None = None, **kwargs: Any) -> list[str]: ...
    async def retrieve_by_field(self, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> list[Chunk]:
        """Retrieve records from the datastore based on metadata field filtering.

        This method filters and returns stored chunks based on metadata values
        rather than text content. It is particularly useful for structured lookups,
        such as retrieving all chunks from a certain source, tagged with a specific label,
        or authored by a particular user.

        Args:
            filters (FilterClause | QueryFilter | None, optional): Query filters to apply.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None.
            options (QueryOptions | None, optional): Query options (sorting, pagination, etc.).
                Defaults to None.

        Returns:
            list[Chunk]: The filtered results as Chunk objects.
        """
    async def retrieve_bm25(self, query: str, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None, k1: float | None = None, b: float | None = None) -> list[Chunk]:
        '''Queries the Elasticsearch data store using BM25 algorithm for keyword-based search.

        Args:
            query (str): The query string.
            filters (FilterClause | QueryFilter | None, optional): Optional metadata filter to apply to the search.
                FilterClause objects are automatically converted to QueryFilter internally.
                Use filter builder functions like `F.eq()`, `F.and_()`, etc. Defaults to None.
            options (QueryOptions | None, optional): Query options including fields, limit, order_by, etc.
                For example, `QueryOptions(fields=["title", "content"], limit=10, order_by="score", order_desc=True)`.
                If fields is None, defaults to ["text"]. For multiple fields, uses multi_match query. Defaults to None.
            k1 (float | None, optional): BM25 parameter controlling term frequency saturation.
                Higher values mean term frequency has more impact before diminishing returns.
                Typical values: 1.2-2.0. If None, uses Elasticsearch default (~1.2). Defaults to None.
            b (float | None, optional): BM25 parameter controlling document length normalization.
                0.0 = no length normalization, 1.0 = full normalization.
                Typical values: 0.75. If None, uses Elasticsearch default (~0.75). Defaults to None.

        Examples:
            ```python
            from gllm_datastore.core.filters import filter as F

            # Basic BM25 query on the \'text\' field
            results = await data_store.retrieve_bm25("machine learning")

            # BM25 query on specific fields with query options
            results = await data_store.retrieve_bm25(
                "natural language",
                options=QueryOptions(fields=["title", "abstract"], limit=5)
            )

            # BM25 query with direct FilterClause
            results = await data_store.retrieve_bm25(
                "deep learning",
                filters=F.eq("metadata.category", "AI")
            )

            # BM25 query with multiple filters
            results = await data_store.retrieve_bm25(
                "deep learning",
                filters=F.and_(F.eq("metadata.category", "AI"), F.eq("metadata.status", "published"))
            )

            # BM25 query with custom BM25 parameters for more aggressive term frequency weighting
            results = await data_store.retrieve_bm25(
                "artificial intelligence",
                k1=2.0,
                b=0.5
            )

            # BM25 query with fields, filters, and options
            results = await data_store.retrieve_bm25(
                "data science applications",
                filters=F.and_(
                    F.eq("metadata.author_id", "user123"),
                    F.in_("metadata.publication_year", [2022, 2023])
                ),
                options=QueryOptions(fields=["content", "tags"], limit=10, order_by="score", order_desc=True),
                k1=1.5,
                b=0.9
            )
            ```

        Returns:
            list[Chunk]: A list of Chunk objects representing the retrieved documents.
        '''
    async def retrieve_autocomplete(self, query: str, field: str, size: int = 20, fuzzy_tolerance: int = 1, min_prefix_length: int = 3, filter_query: dict[str, Any] | None = None) -> list[str]:
        """Provides suggestions based on a prefix query for a specific field.

        Args:
            query (str): The query string.
            field (str): The field name for autocomplete.
            size (int, optional): The number of suggestions to retrieve. Defaults to 20.
            fuzzy_tolerance (int, optional): The level of fuzziness for suggestions. Defaults to 1.
            min_prefix_length (int, optional): The minimum prefix length to trigger fuzzy matching. Defaults to 3.
            filter_query (dict[str, Any] | None, optional): The filter query. Defaults to None.

        Returns:
            list[str]: A list of suggestions.
        """
    async def retrieve_autosuggest(self, query: str, search_fields: list[str], autocomplete_field: str, size: int = 20, min_length: int = 3, filters: QueryFilter | None = None) -> list[str]:
        """Generates suggestions across multiple fields using a multi_match query to broaden the search criteria.

        Args:
            query (str): The query string.
            search_fields (list[str]): The fields to search for.
            autocomplete_field (str): The field name for autocomplete.
            size (int, optional): The number of suggestions to retrieve. Defaults to 20.
            min_length (int, optional): The minimum length of the query. Defaults to 3.
            filters (QueryFilter | None, optional): The filter query. Defaults to None.

        Returns:
            list[str]: A list of suggestions.
        """
    async def retrieve_shingles(self, query: str, field: str, size: int = 20, min_length: int = 3, max_length: int = 30, filters: QueryFilter | None = None) -> list[str]:
        """Searches using shingles for prefix and fuzzy matching.

        Args:
            query (str): The query string.
            field (str): The field name to run the shingle-based search against.
            size (int, optional): The number of suggestions to retrieve. Defaults to 20.
            min_length (int, optional): The minimum length of the query.
                Queries shorter than this limit will return an empty list. Defaults to 3.
            max_length (int, optional): The maximum length of the query.
                Queries exceeding this limit will return an empty list. Defaults to 30.
            filters (QueryFilter | None, optional): The filter query. Defaults to None.

        Returns:
            list[str]: A list of suggestions.
        """
    # NOTE(review): no FUZZY member exists in SupportedQueryMethods and no
    # `retrieve` overload covers this method — confirm whether fuzzy retrieval
    # is reachable through `retrieve` or only via direct calls.
    async def retrieve_fuzzy(self, query: str, max_distance: int = 2, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> list[Chunk]:
        """Find records that fuzzy match the query within distance threshold.

        Args:
            query (str): Text to fuzzy match against.
            max_distance (int): Maximum edit distance for matches. Defaults to 2.
            filters (FilterClause | QueryFilter | None, optional): Optional metadata filters to apply.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None.
            options (QueryOptions | None, optional): Query options (limit, sorting, etc.). Defaults to None.

        Returns:
            list[Chunk]: Matched chunks.
        """
    async def update(self, update_values: dict[str, Any], filters: FilterClause | QueryFilter | None = None) -> None:
        """Update existing records in the datastore.

        Args:
            update_values (dict[str, Any]): Values to update.
            filters (FilterClause | QueryFilter | None, optional): Filters to select records to update.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None.
        """
    async def delete(self, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> None:
        """Delete records from the data store using filters and optional options.

        Args:
            filters (FilterClause | QueryFilter | None, optional): Filters to select records for deletion.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None.
            options (QueryOptions | None, optional): Query options supporting limit and sorting
                for eviction-like operations. Defaults to None.
        """
    async def delete_by_id(self, id_: str | list[str]) -> None:
        """Deletes records from the data store based on IDs.

        Args:
            id_ (str | list[str]): ID or list of IDs to delete.
        """
    async def clear(self) -> None:
        """Clear all records from the datastore."""