gllm-datastore-binary 0.5.45 (cp311-cp311-macosx_13_0_arm64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-datastore-binary might be problematic.
- gllm_datastore/__init__.pyi +0 -0
- gllm_datastore/cache/__init__.pyi +4 -0
- gllm_datastore/cache/base.pyi +84 -0
- gllm_datastore/cache/cache.pyi +137 -0
- gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
- gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
- gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
- gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
- gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
- gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
- gllm_datastore/cache/utils.pyi +34 -0
- gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
- gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
- gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
- gllm_datastore/constants.pyi +66 -0
- gllm_datastore/core/__init__.pyi +7 -0
- gllm_datastore/core/capabilities/__init__.pyi +5 -0
- gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
- gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
- gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
- gllm_datastore/core/filters/__init__.pyi +4 -0
- gllm_datastore/core/filters/filter.pyi +340 -0
- gllm_datastore/core/filters/schema.pyi +149 -0
- gllm_datastore/data_store/__init__.pyi +7 -0
- gllm_datastore/data_store/base.pyi +138 -0
- gllm_datastore/data_store/chroma/__init__.pyi +4 -0
- gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
- gllm_datastore/data_store/chroma/data_store.pyi +202 -0
- gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
- gllm_datastore/data_store/chroma/query.pyi +266 -0
- gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
- gllm_datastore/data_store/chroma/vector.pyi +197 -0
- gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
- gllm_datastore/data_store/elasticsearch/data_store.pyi +119 -0
- gllm_datastore/data_store/elasticsearch/fulltext.pyi +237 -0
- gllm_datastore/data_store/elasticsearch/query.pyi +114 -0
- gllm_datastore/data_store/elasticsearch/vector.pyi +179 -0
- gllm_datastore/data_store/exceptions.pyi +35 -0
- gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
- gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
- gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
- gllm_datastore/data_store/in_memory/query.pyi +175 -0
- gllm_datastore/data_store/in_memory/vector.pyi +174 -0
- gllm_datastore/data_store/redis/__init__.pyi +5 -0
- gllm_datastore/data_store/redis/data_store.pyi +154 -0
- gllm_datastore/data_store/redis/fulltext.pyi +128 -0
- gllm_datastore/data_store/redis/query.pyi +428 -0
- gllm_datastore/data_store/redis/query_translator.pyi +37 -0
- gllm_datastore/data_store/redis/vector.pyi +131 -0
- gllm_datastore/encryptor/__init__.pyi +4 -0
- gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
- gllm_datastore/encryptor/encryptor.pyi +52 -0
- gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
- gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
- gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
- gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
- gllm_datastore/graph_data_store/__init__.pyi +6 -0
- gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
- gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
- gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
- gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
- gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
- gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
- gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
- gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
- gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
- gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
- gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
- gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
- gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
- gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
- gllm_datastore/sql_data_store/__init__.pyi +4 -0
- gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
- gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
- gllm_datastore/sql_data_store/constants.pyi +6 -0
- gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
- gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
- gllm_datastore/sql_data_store/types.pyi +31 -0
- gllm_datastore/utils/__init__.pyi +6 -0
- gllm_datastore/utils/converter.pyi +51 -0
- gllm_datastore/utils/dict.pyi +21 -0
- gllm_datastore/utils/ttl.pyi +25 -0
- gllm_datastore/utils/types.pyi +32 -0
- gllm_datastore/vector_data_store/__init__.pyi +6 -0
- gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
- gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
- gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
- gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
- gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
- gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
- gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
- gllm_datastore.build/.gitignore +1 -0
- gllm_datastore.cpython-311-darwin.so +0 -0
- gllm_datastore.pyi +156 -0
- gllm_datastore_binary-0.5.45.dist-info/METADATA +178 -0
- gllm_datastore_binary-0.5.45.dist-info/RECORD +108 -0
- gllm_datastore_binary-0.5.45.dist-info/WHEEL +5 -0
- gllm_datastore_binary-0.5.45.dist-info/top_level.txt +1 -0
gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi ADDED
@@ -0,0 +1,145 @@
from abc import ABC, abstractmethod
from datetime import datetime
from gllm_datastore.cache.cache import MatchingStrategy as MatchingStrategy
from gllm_datastore.cache.vector_cache.eviction_manager.eviction_manager import BaseEvictionManager as BaseEvictionManager
from gllm_datastore.cache.vector_cache.vector_cache import VectorCache as VectorCache
from gllm_datastore.constants import METADATA_KEYS as METADATA_KEYS
from typing import Any

class CacheCompatibleMixin(ABC):
    """Mixin that provides cache-specific matching operations for vector datastores.

    This mixin adds methods for exact, fuzzy, and semantic matching that are
    required by the VectorCache implementation, without forcing all vector datastores
    to implement these methods.
    """
    async def store_cache(self, key: str, value: Any, metadata: dict[str, Any] | None = None) -> None:
        """Public method to store cache data in the storage.

        Args:
            key (str): The key to store the cache data.
            value (Any): The cache data to store.
            metadata (dict[str, Any] | None, optional): Additional metadata to store with the cache data.
                Defaults to None.
        """
    @abstractmethod
    async def exact_match(self, key: str, **kwargs) -> Any | None:
        """Find chunks that exactly match the given key.

        This method should be implemented by subclasses.

        Args:
            key (str): The key to match.
            **kwargs (Any): Additional parameters for the matching operation.

        Returns:
            Any: Chunks that exactly match the key.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def fuzzy_match(self, key: str, max_distance: int = 2, **kwargs) -> Any | None:
        """Find chunks that approximately match the given key using fuzzy matching.

        This method should be implemented by subclasses.

        Args:
            key (str): The key to match.
            max_distance (int): Maximum distance for fuzzy matching. Lower values are more strict.
                This is the maximum Levenshtein distance allowed for a match. Defaults to 2.
            **kwargs (Any): Additional parameters for the matching operation.

        Returns:
            Any: Chunks that fuzzy match the key within the threshold.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def semantic_match(self, key: str, min_similarity: float = 0.8, metadata: dict[str, Any] | None = None, **kwargs) -> Any | None:
        """Find chunks that semantically match the given key using vector similarity.

        This method should be implemented by subclasses.

        Args:
            key (str): The key to match.
            min_similarity (float): Minimum similarity score for semantic matching
                (higher values are more strict). Ranges from 0 to 1. Defaults to 0.8.
            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
                Defaults to None.
            **kwargs (Any): Additional parameters for the matching operation.

        Returns:
            Any: Chunks that semantically match the key above the threshold.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
        """Delete expired entries (for TTL eviction).

        This method should be implemented by subclasses.

        Args:
            now (datetime): The current datetime for comparison.
            max_size (int): The maximum number of entries to delete. Defaults to 10000.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
        """Delete least frequently used entries (for LFU eviction).

        This method should be implemented by subclasses.

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def delete_least_recently_used_entries(self, num_entries: int) -> None:
        """Delete least recently used entries (for LRU eviction).

        This method should be implemented by subclasses.

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def delete_entries_by_key(self, key: str | list[str], metadata: dict[str, Any] | None = None) -> None:
        '''Delete entries by key.

        This method should be implemented by subclasses.

        Args:
            key (str | list[str]): The key or list of keys to delete entries for.
            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
                For example, `{"key": "value"}`. Defaults to None.

        Raises:
            NotImplementedError: If the method is not implemented.
        '''
    def as_cache(self, eviction_manager: BaseEvictionManager | None = None, matching_strategy: MatchingStrategy = 'exact', matching_config: dict[str, Any] | None = None, saving_config: dict[str, Any] | None = None) -> VectorCache:
        """Return a cache instance that can be used to store and retrieve data.

        Args:
            eviction_manager (BaseEvictionManager | None, optional): The eviction manager to use for cache eviction.
                Defaults to None. If None, no eviction will be performed.
            matching_strategy (MatchingStrategy, optional): The strategy to use for matching keys.
                Defaults to MatchingStrategy.EXACT.
            matching_config (dict[str, Any] | None, optional): Configuration parameters for matching strategies.
                Defaults to None, which means no specific configuration is provided.
            saving_config (dict[str, Any] | None, optional): Configuration parameters for saving strategies.
                Defaults to None, which means no specific configuration is provided.

        Returns:
            VectorCache: A cache instance that can be used to store and retrieve data.
        """
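
The mixin above only declares the hooks a vector store must provide before it can back a VectorCache. Below is a minimal illustrative sketch of what a subclass has to supply; the ToyCacheStore class, its dict-based storage, and its simplistic matching logic are invented for this example, and only the method signatures come from the stub above (the gllm-datastore-binary wheel is assumed to be installed).

# Illustrative sketch only: a toy dict-backed implementation of the
# CacheCompatibleMixin hooks. Class name and storage layout are hypothetical;
# the method signatures mirror the stub above.
from datetime import datetime
from typing import Any

from gllm_datastore.vector_data_store.mixin.cache_compatible_mixin import CacheCompatibleMixin


class ToyCacheStore(CacheCompatibleMixin):
    def __init__(self) -> None:
        # Maps key -> (value, metadata); a real store would hold vectors.
        self._entries: dict[str, tuple[Any, dict[str, Any]]] = {}

    async def store_cache(self, key: str, value: Any, metadata: dict[str, Any] | None = None) -> None:
        self._entries[key] = (value, metadata or {})

    async def exact_match(self, key: str, **kwargs) -> Any | None:
        entry = self._entries.get(key)
        return entry[0] if entry else None

    async def fuzzy_match(self, key: str, max_distance: int = 2, **kwargs) -> Any | None:
        # Toy heuristic: accept keys whose length differs by at most max_distance.
        # A real implementation would compute a Levenshtein distance.
        for stored_key, (value, _) in self._entries.items():
            if abs(len(stored_key) - len(key)) <= max_distance:
                return value
        return None

    async def semantic_match(self, key: str, min_similarity: float = 0.8,
                             metadata: dict[str, Any] | None = None, **kwargs) -> Any | None:
        # A real implementation would compare embeddings; this toy falls back to exact match.
        return await self.exact_match(key)

    async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
        pass  # No TTL bookkeeping in this toy store.

    async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
        pass  # No usage counters in this toy store.

    async def delete_least_recently_used_entries(self, num_entries: int) -> None:
        pass  # No recency bookkeeping in this toy store.

    async def delete_entries_by_key(self, key: str | list[str], metadata: dict[str, Any] | None = None) -> None:
        keys = [key] if isinstance(key, str) else key
        for k in keys:
            self._entries.pop(k, None)

Once these hooks exist, the non-abstract as_cache() shown above can wrap the store in a VectorCache using the chosen matching strategy and optional eviction manager.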
gllm_datastore/vector_data_store/redis_vector_data_store.pyi ADDED
@@ -0,0 +1,191 @@
from _typeshed import Incomplete
from datetime import datetime
from gllm_core.schema.chunk import Chunk
from gllm_datastore.constants import DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
from gllm_datastore.utils.converter import cosine_distance_to_similarity_score as cosine_distance_to_similarity_score, similarity_score_to_cosine_distance as similarity_score_to_cosine_distance
from gllm_datastore.vector_data_store.mixin.cache_compatible_mixin import CacheCompatibleMixin as CacheCompatibleMixin
from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore as BaseVectorDataStore
from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
from redis import Redis as Redis
from redisvl.query.filter import FilterExpression as FilterExpression
from typing import Any

FUZZY_MATCH_MAX_DISTANCE: int

class RedisVectorDataStore(BaseVectorDataStore, CacheCompatibleMixin):
    """Vector data store implementation that uses Redis with RedisVL for vector search.

    This class provides methods to interact with Redis for vector storage and retrieval
    using Redis Vector Search capabilities via RedisVL and langchain-redis.

    Attributes:
        redis_url (str): URL for Redis connection.
        index_name (str): Name for the vector index.
        search_index (SearchIndex): RedisVL SearchIndex instance.
        cache_store (SemanticCache): RedisVL SemanticCache instance.
        embedding (BaseEMInvoker | None): The embedding model to perform vectorization.
    """
    index_name: Incomplete
    url: Incomplete
    client: Incomplete
    filterable_fields: Incomplete
    cache_store: Incomplete
    def __init__(self, index_name: str, url: str | None = None, client: Redis | None = None, embedding: BaseEMInvoker | None = None, additional_filter_fields: list[dict[str, Any]] | None = None) -> None:
        '''Initialize Redis vector store using RedisVL and langchain-redis.

        Args:
            index_name (str): Name of the index to use.
            url (str | None, optional): URL for the Redis connection. Defaults to None.
            client (Redis | None, optional): An existing Redis client to use. Defaults to None.
            embedding (BaseEMInvoker | None, optional): Embedding function to use for vectorization.
                Defaults to None. If None, the default embedding model (redis/langcache-embed-v1) will be used.
            additional_filter_fields (list[dict[str, Any]] | None, optional): Additional filterable fields to add
                to the index. For example, to add `entry_id` as a filterable field, pass
                `[{"name": "entry_id", "type": "text"}]`. Defaults to None.

        Notes:
            Besides the `additional_filter_fields`, the class will automatically create default filterable fields:
            1. prompt: TEXT (default from redisvl).
            2. response: TEXT (default from redisvl).
            3. prompt_vector: VECTOR (default from redisvl).
            4. chunk_id: TEXT (default additional_filter_fields).

        Raises:
            TypeError: If `embedding` is not an instance of `BaseEMInvoker`.
        '''
    async def get_size(self) -> int:
        """Returns the total number of vectors in the index.

        If the index is not initialized, returns 0.

        Returns:
            int: The total number of vectors.
        """
    async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
        """Search for semantically similar documents and return them with similarity scores.

        Args:
            query (str): The query text to search for.
            top_k (int): Number of top results to return.
            retrieval_params (dict[str, Any] | None, optional): Additional parameters for the query such as:
                - filter: Redis filter expression to narrow results following RedisVL FilterExpression.

        Returns:
            list[Chunk]: List of chunks semantically similar to the query.
        """
    async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
        """Retrieve chunks by their IDs.

        Args:
            id_ (str | list[str]): A single ID or list of chunk IDs to retrieve.

        Returns:
            list[Chunk]: List of retrieved chunks.
        """
    async def add_chunks(self, chunks: Chunk | list[Chunk], **kwargs) -> list[str]:
        """Add chunks to the vector store.

        Args:
            chunks (Chunk | list[Chunk]): A single chunk or a list of chunks to add.
            **kwargs: Additional parameters for adding chunks.

        Returns:
            list[str]: List of IDs of the added chunks.
        """
    async def delete_chunks(self, query: str, **kwargs: Any) -> None:
        '''Delete chunks from the vector store by filter/query. Not supported for Redis backend.

        Args:
            query (str): The query to delete chunks by. For example, "user_*" would match keys
                like "user_1", "user_2", etc.
            **kwargs: Additional keyword arguments.
        '''
    async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
        """Delete chunks from the vector store by their IDs.

        Args:
            ids (str | list[str]): A single ID or a list of IDs to delete.
            **kwargs: Additional keyword arguments.
        """
    async def exact_match(self, key: str, metadata: dict[str, Any] | None = None) -> Any | None:
        '''Find chunks that exactly match the given prompt.

        Args:
            key (str): The prompt to match.
            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
                For example, `{"key": "value"}`. Defaults to None.

        Returns:
            Any: The value stored with the matching prompt, or None if no match is found.
        '''
    async def fuzzy_match(self, key: str, max_distance: int = 2, metadata: dict[str, Any] | None = None) -> Any | None:
        '''Find chunks that approximately match the given key using fuzzy matching.

        Args:
            key (str): The key to match.
            max_distance (int): Maximum allowed distance for fuzzy matching
                (higher values allow for more differences). Maximum is 3. Defaults to 2.
            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
                For example, `{"key": "value"}`. Defaults to None.

        Note:
            Maximum fuzzy distance is 3. This is a limitation of the Redis Vector Search and the Redis Search module.
            See [5] for more details.

        Returns:
            Any: The value with the closest fuzzy match, or None if no match is found.
        '''
    async def semantic_match(self, key: str, min_similarity: float = 0.8, metadata: dict[str, Any] | None = None) -> Any | None:
        '''Find chunks that semantically match the given key using vector similarity.

        This method compares the vector embedding of the search key with vector embeddings
        of stored keys to find semantically similar matches.

        Args:
            key (str): The key to match.
            min_similarity (float, optional): Minimum similarity score for semantic matching
                (higher values are more strict). Ranges from 0 to 1. Defaults to 0.8.
            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
                For example, `{"key": "value"}`. Defaults to None.

        Returns:
            Any: The semantically closest value, or None if no match meets the threshold.
        '''
    async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
        """Delete expired entries (for TTL eviction).

        Args:
            now (datetime): The current datetime for comparison.
            max_size (int): The maximum number of entries to delete. Defaults to 10000.

        Raises:
            NotImplementedError: Currently, app-level eviction is not supported for RedisVectorDataStore.
        """
    async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
        """Delete least frequently used entries (for LFU eviction).

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: Currently, app-level eviction is not supported for RedisVectorDataStore.
        """
    async def delete_least_recently_used_entries(self, num_entries: int) -> None:
        """Delete least recently used entries (for LRU eviction).

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: Currently, app-level eviction is not supported for RedisVectorDataStore.
        """
    async def delete_entries_by_key(self, key: str | list[str], metadata: dict[str, Any] | None = None) -> None:
        '''Delete entries by key.

        Args:
            key (str | list[str]): The key or list of keys to delete entries for.
            metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
                For example, `{"key": "value"}`. Defaults to None.
        '''
    async def clear(self) -> None:
        """Clear all entries in the storage."""
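
Based only on the signatures above, a hedged usage sketch of RedisVectorDataStore follows. The index name, URL, and prompts are placeholders, and a reachable Redis instance with the search module (plus the installed wheel) is assumed; with embedding=None the docstring states the default redis/langcache-embed-v1 model is used.

# Illustrative usage sketch; values below are placeholders, not a tested setup.
import asyncio

from gllm_datastore.vector_data_store.redis_vector_data_store import RedisVectorDataStore


async def main() -> None:
    # With embedding=None, the stub documents that the default embedding model is used.
    store = RedisVectorDataStore(index_name="demo_index", url="redis://localhost:6379")

    # Cache-style writes and lookups come from CacheCompatibleMixin.
    await store.store_cache("What is RAG?", "Retrieval-augmented generation.")
    print(await store.exact_match("What is RAG?"))
    print(await store.fuzzy_match("What is RAG", max_distance=2))
    print(await store.semantic_match("Explain retrieval-augmented generation", min_similarity=0.8))

    # Vector search over indexed chunks; retrieval_params may carry a RedisVL
    # FilterExpression under the "filter" key, per the query() docstring.
    chunks = await store.query("retrieval augmented generation", top_k=5)
    print(len(chunks))


if __name__ == "__main__":
    asyncio.run(main())

The cache-style methods (store_cache, exact_match, fuzzy_match, semantic_match) come from CacheCompatibleMixin, while query() performs the RedisVL vector search.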
gllm_datastore/vector_data_store/vector_data_store.pyi ADDED
@@ -0,0 +1,146 @@
from abc import ABC, abstractmethod
from gllm_core.schema.chunk import Chunk
from gllm_datastore.constants import DEFAULT_TOP_K as DEFAULT_TOP_K
from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
from langchain_core.embeddings import Embeddings
from typing import Any

class BaseVectorDataStore(ABC):
    """Abstract base class for vector data stores in the retrieval system.

    This class defines the interface for all vector data store implementations.
    Subclasses must implement the `query` and `query_by_id` methods.
    """
    @property
    def embedding(self) -> BaseEMInvoker | Embeddings | None:
        """Returns the embedding model associated with this data store.

        Returns:
            BaseEMInvoker | Embeddings | None: The embedding model.
        """
    async def get_size(self) -> int:
        """Returns the total number of vectors in the index.

        If the index is not initialized, returns 0.

        Returns:
            int: The total number of vectors.
        """
    @abstractmethod
    async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
        """Executes a query on the data store.

        This method must be implemented by subclasses.

        Args:
            query (str): The query string to execute.
            top_k (int, optional): The maximum number of results to return. Defaults to DEFAULT_TOP_K.
            retrieval_params (dict[str, Any] | None, optional): Additional parameters for the query.
                Defaults to None.

        Returns:
            list[Chunk]: A list of query results.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
        """Retrieves chunks by their IDs.

        This method must be implemented by subclasses.

        Args:
            id_ (str | list[str]): A single ID or a list of IDs to retrieve.

        Returns:
            list[Chunk]: A list of retrieved chunks.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def add_chunks(self, chunk: Chunk | list[Chunk], **kwargs) -> list[str]:
        """Adds a chunk or a list of chunks to the data store.

        This method must be implemented by subclasses.

        Args:
            chunk (Chunk | list[Chunk]): A single chunk or a list of chunks to index.
            **kwargs: Additional keyword arguments to pass to the method.

        Returns:
            list[str]: A list of unique identifiers (IDs) assigned to the added chunks.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def delete_chunks(self, **kwargs: Any) -> None:
        """Deletes chunks from the data store by filter or query.

        This method must be implemented by subclasses.

        Args:
            **kwargs: Additional keyword arguments specifying the filter or query for deletion.
                The exact parameters depend on the backend implementation.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    @abstractmethod
    async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
        """Deletes a chunk or a list of chunks from the data store by their IDs.

        This method must be implemented by subclasses.

        Args:
            ids (str | list[str]): A single ID or a list of IDs to delete.
            **kwargs: Additional keyword arguments.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
    async def clear(self) -> None:
        """Clear all entries in the storage.

        This method should be implemented by subclasses.
        """
    async def query_by_field(self, retrieval_params: dict[str, Any], limit: int | None = None, **kwargs) -> list[Chunk]:
        """Retrieve documents that match specific metadata constraints.

        This method filters and returns stored chunks based on metadata values
        rather than vector similarity. It is particularly useful for structured lookups,
        such as retrieving all chunks from a certain source, tagged with a specific label,
        or authored by a particular user.

        Unlike semantic search methods, `query_by_field` operates purely on metadata fields
        associated with each document, allowing precise filtering based on key-value pairs.

        Args:
            retrieval_params (dict[str, Any]): A dictionary defining filter criteria.
            limit (int | None, optional): The maximum number of results to return. If None, all matching
                documents will be returned.
            **kwargs: Additional arguments to support datastore-specific behavior or filtering logic.

        Returns:
            list[Chunk]: A list of `Chunk` objects that satisfy the metadata criteria.

        Raises:
            NotImplementedError: If not implemented in the subclass.
        """
    async def query_by_vector(self, vector: list[float], top_k: int = ..., min_similarity: float = 0.8, retrieval_params: dict | None = None) -> list[Chunk]:
        """Search for documents that are similar to a given vector.

        Args:
            vector (list[float]): The query embedding vector to compare against stored vectors.
            top_k (int, optional): The number of top results to return. Defaults to DEFAULT_TOP_K.
            min_similarity (float, optional): Minimum similarity score for vector similarity matching. Defaults to 0.8.
            retrieval_params (dict | None, optional): Filter parameters to narrow the search:
                - filter (Where): Metadata-based filter.
                - where_document (WhereDocument): Content-based filter.
                Defaults to None.

        Returns:
            list[Chunk]: A list of Chunk objects with similarity scores based on the input vector.
        """
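
To make the abstract interface above concrete, here is an illustrative sketch of a custom backend; the ListVectorDataStore name, its in-memory dict, and its UUID-based IDs are hypothetical, and a real implementation would rank query() results by vector similarity rather than insertion order.

# Illustrative sketch of a backend implementing the abstract methods above.
import uuid
from typing import Any

from gllm_core.schema.chunk import Chunk
from gllm_datastore.constants import DEFAULT_TOP_K
from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore


class ListVectorDataStore(BaseVectorDataStore):
    def __init__(self) -> None:
        self._chunks: dict[str, Chunk] = {}  # id -> chunk

    async def query(self, query: str, top_k: int = DEFAULT_TOP_K, retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
        # A real implementation would embed `query` and rank by similarity.
        return list(self._chunks.values())[:top_k]

    async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
        ids = [id_] if isinstance(id_, str) else id_
        return [self._chunks[i] for i in ids if i in self._chunks]

    async def add_chunks(self, chunk: Chunk | list[Chunk], **kwargs) -> list[str]:
        chunks = [chunk] if isinstance(chunk, Chunk) else chunk
        ids: list[str] = []
        for item in chunks:
            new_id = str(uuid.uuid4())  # IDs generated here; real backends may derive them from the chunk.
            self._chunks[new_id] = item
            ids.append(new_id)
        return ids

    async def delete_chunks(self, **kwargs: Any) -> None:
        self._chunks.clear()  # Toy behavior: no filter support, delete everything.

    async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
        for i in ([ids] if isinstance(ids, str) else ids):
            self._chunks.pop(i, None)

The optional methods (get_size, clear, query_by_field, query_by_vector) can additionally be overridden when the backend supports them, as their docstrings indicate.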
gllm_datastore.build/.gitignore ADDED
@@ -0,0 +1 @@
*

gllm_datastore.cpython-311-darwin.so ADDED
Binary file

gllm_datastore.pyi ADDED
@@ -0,0 +1,156 @@
# This file was generated by Nuitka

# Stubs included by default


__name__ = ...



# Modules used internally, to allow implicit dependencies to be seen:
import os
import abc
import enum
import typing
import asyncio
import functools
import json
import collections
import collections.OrderedDict
import datetime
import gllm_core
import gllm_core.schema
import gllm_core.schema.chunk
import gllm_core.utils
import gllm_datastore.core.filters.FilterClause
import gllm_datastore.core.filters.QueryFilter
import gllm_datastore.core.filters.QueryOptions
import gzip
import pickle
import shutil
import time
import gllm_core.utils.logger_manager
import posixpath
import gllm_datastore.cache.hybrid_cache.key_matcher.ExactKeyMatcher
import gllm_datastore.utils.convert_ttl_to_seconds
import gllm_core.utils.imports
import Levenshtein
import gllm_datastore.vector_data_store.ElasticsearchVectorDataStore
import redis
import hashlib
import gllm_datastore.core.filters.FilterCondition
import gllm_datastore.core.filters.FilterOperator
import __future__
import gllm_inference
import gllm_inference.schema
import pydantic
import gllm_datastore.data_store.chroma.ChromaDataStore
import gllm_datastore.data_store.elasticsearch.ElasticsearchDataStore
import gllm_datastore.data_store.in_memory.InMemoryDataStore
import gllm_datastore.data_store.redis.RedisDataStore
import gllm_inference.em_invoker
import gllm_inference.em_invoker.em_invoker
import gllm_datastore.core.capabilities.FulltextCapability
import gllm_datastore.core.capabilities.GraphCapability
import gllm_datastore.core.capabilities.VectorCapability
import gllm_datastore.cache.Cache
import gllm_datastore.cache.MatchingStrategy
import chromadb
import sys
import pysqlite3
import rapidfuzz
import rapidfuzz.distance
import logging
import re
import dataclasses
import gllm_inference.em_invoker.langchain
import langchain_core
import langchain_core.runnables
import langchain_chroma
import elasticsearch
import elasticsearch.dsl
import elasticsearch.dsl.query
import gllm_core.utils.retry
import gllm_datastore.utils.flatten_dict
import langchain_elasticsearch
import langchain_elasticsearch.vectorstores
import collections.abc
import gllm_core.utils.similarity
import redis.asyncio
import redis.asyncio.client
import redis.exceptions
import redis.commands
import redis.commands.search
import redis.commands.search.query
import redis.commands.search.field
import redis.commands.search.indexDefinition
import numpy
import redisvl
import redisvl.redis
import redisvl.redis.utils
import redisvl.index
import redisvl.query
import redisvl.schema
import base64
import cryptography
import cryptography.hazmat
import cryptography.hazmat.primitives
import cryptography.hazmat.primitives.ciphers
import cryptography.hazmat.primitives.ciphers.aead
import threading
import lightrag
import tempfile
import contextlib
import gllm_inference.lm_invoker
import gllm_inference.lm_invoker.lm_invoker
import lightrag.kg
import lightrag.kg.shared_storage
import llama_index
import llama_index.core
import llama_index.core.base
import llama_index.core.base.embeddings
import llama_index.core.base.embeddings.base
import llama_index.core.graph_stores
import llama_index.core.graph_stores.types
import llama_index.core.llms
import gllm_datastore.graph_data_store.utils.LlamaIndexEMInvokerAdapter
import gllm_datastore.graph_data_store.utils.LlamaIndexLMInvokerAdapter
import llama_index.graph_stores
import llama_index.graph_stores.neo4j
import nebula3
import nebula3.Config
import nebula3.data
import nebula3.data.DataObject
import nebula3.gclient
import nebula3.gclient.net
import textwrap
import neo4j
import neo4j.exceptions
import lightrag.base
import nest_asyncio
import llama_index.core.bridge
import llama_index.core.bridge.pydantic
import llama_index.core.constants
import llama_index.core.base.llms
import llama_index.core.base.llms.types
import sqlalchemy
import sqlalchemy.engine
import pandas
import concurrent
import concurrent.futures
import concurrent.futures.Future
import concurrent.futures.ThreadPoolExecutor
import sqlalchemy.exc
import sqlalchemy.orm
import uuid
import langchain_core.documents
import langchain_core.embeddings
import langchain_core.runnables.config
import chromadb.types
import inspect
import redisvl.extensions
import redisvl.extensions.cache
import redisvl.extensions.cache.llm
import redisvl.utils
import redisvl.utils.vectorize
import redisvl.query.filter