gllm-datastore-binary 0.5.45__cp311-cp311-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gllm-datastore-binary might be problematic. Click here for more details.

Files changed (108) hide show
  1. gllm_datastore/__init__.pyi +0 -0
  2. gllm_datastore/cache/__init__.pyi +4 -0
  3. gllm_datastore/cache/base.pyi +84 -0
  4. gllm_datastore/cache/cache.pyi +137 -0
  5. gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
  6. gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
  7. gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
  8. gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
  9. gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
  10. gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
  11. gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
  12. gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
  13. gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
  14. gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
  15. gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
  16. gllm_datastore/cache/utils.pyi +34 -0
  17. gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
  18. gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
  19. gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
  20. gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
  21. gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
  22. gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
  23. gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
  24. gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
  25. gllm_datastore/constants.pyi +66 -0
  26. gllm_datastore/core/__init__.pyi +7 -0
  27. gllm_datastore/core/capabilities/__init__.pyi +5 -0
  28. gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
  29. gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
  30. gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
  31. gllm_datastore/core/filters/__init__.pyi +4 -0
  32. gllm_datastore/core/filters/filter.pyi +340 -0
  33. gllm_datastore/core/filters/schema.pyi +149 -0
  34. gllm_datastore/data_store/__init__.pyi +7 -0
  35. gllm_datastore/data_store/base.pyi +138 -0
  36. gllm_datastore/data_store/chroma/__init__.pyi +4 -0
  37. gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
  38. gllm_datastore/data_store/chroma/data_store.pyi +202 -0
  39. gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
  40. gllm_datastore/data_store/chroma/query.pyi +266 -0
  41. gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
  42. gllm_datastore/data_store/chroma/vector.pyi +197 -0
  43. gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
  44. gllm_datastore/data_store/elasticsearch/data_store.pyi +119 -0
  45. gllm_datastore/data_store/elasticsearch/fulltext.pyi +237 -0
  46. gllm_datastore/data_store/elasticsearch/query.pyi +114 -0
  47. gllm_datastore/data_store/elasticsearch/vector.pyi +179 -0
  48. gllm_datastore/data_store/exceptions.pyi +35 -0
  49. gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
  50. gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
  51. gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
  52. gllm_datastore/data_store/in_memory/query.pyi +175 -0
  53. gllm_datastore/data_store/in_memory/vector.pyi +174 -0
  54. gllm_datastore/data_store/redis/__init__.pyi +5 -0
  55. gllm_datastore/data_store/redis/data_store.pyi +154 -0
  56. gllm_datastore/data_store/redis/fulltext.pyi +128 -0
  57. gllm_datastore/data_store/redis/query.pyi +428 -0
  58. gllm_datastore/data_store/redis/query_translator.pyi +37 -0
  59. gllm_datastore/data_store/redis/vector.pyi +131 -0
  60. gllm_datastore/encryptor/__init__.pyi +4 -0
  61. gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
  62. gllm_datastore/encryptor/encryptor.pyi +52 -0
  63. gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
  64. gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
  65. gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
  66. gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
  67. gllm_datastore/graph_data_store/__init__.pyi +6 -0
  68. gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
  69. gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
  70. gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
  71. gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
  72. gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
  73. gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
  74. gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
  75. gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
  76. gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
  77. gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
  78. gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
  79. gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
  80. gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
  81. gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
  82. gllm_datastore/sql_data_store/__init__.pyi +4 -0
  83. gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
  84. gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
  85. gllm_datastore/sql_data_store/constants.pyi +6 -0
  86. gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
  87. gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
  88. gllm_datastore/sql_data_store/types.pyi +31 -0
  89. gllm_datastore/utils/__init__.pyi +6 -0
  90. gllm_datastore/utils/converter.pyi +51 -0
  91. gllm_datastore/utils/dict.pyi +21 -0
  92. gllm_datastore/utils/ttl.pyi +25 -0
  93. gllm_datastore/utils/types.pyi +32 -0
  94. gllm_datastore/vector_data_store/__init__.pyi +6 -0
  95. gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
  96. gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
  97. gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
  98. gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
  99. gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
  100. gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
  101. gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
  102. gllm_datastore.build/.gitignore +1 -0
  103. gllm_datastore.cpython-311-darwin.so +0 -0
  104. gllm_datastore.pyi +156 -0
  105. gllm_datastore_binary-0.5.45.dist-info/METADATA +178 -0
  106. gllm_datastore_binary-0.5.45.dist-info/RECORD +108 -0
  107. gllm_datastore_binary-0.5.45.dist-info/WHEEL +5 -0
  108. gllm_datastore_binary-0.5.45.dist-info/top_level.txt +1 -0
@@ -0,0 +1,145 @@
1
+ from abc import ABC, abstractmethod
2
+ from datetime import datetime
3
+ from gllm_datastore.cache.cache import MatchingStrategy as MatchingStrategy
4
+ from gllm_datastore.cache.vector_cache.eviction_manager.eviction_manager import BaseEvictionManager as BaseEvictionManager
5
+ from gllm_datastore.cache.vector_cache.vector_cache import VectorCache as VectorCache
6
+ from gllm_datastore.constants import METADATA_KEYS as METADATA_KEYS
7
+ from typing import Any
8
+
9
class CacheCompatibleMixin(ABC):
    """Mixin providing the cache-specific matching operations used by VectorCache.

    It declares exact, fuzzy, and semantic matching hooks plus eviction helpers
    that the VectorCache implementation relies on, so that only vector
    datastores intended to back a cache need to implement them.
    """
    async def store_cache(self, key: str, value: Any, metadata: dict[str, Any] | None = None) -> None:
        """Store cache data in the underlying storage.

        Args:
            key (str): Key under which the cache data is stored.
            value (Any): The cache data to store.
            metadata (dict[str, Any] | None, optional): Additional metadata stored
                with the cache data. Defaults to None.
        """
    @abstractmethod
    async def exact_match(self, key: str, **kwargs) -> Any | None:
        """Find chunks whose key is an exact match.

        Subclasses must implement this method.

        Args:
            key (str): The key to match.
            **kwargs (Any): Additional backend-specific matching parameters.

        Returns:
            Any: Chunks that exactly match the key.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def fuzzy_match(self, key: str, max_distance: int = 2, **kwargs) -> Any | None:
        """Find chunks that approximately match the key using fuzzy matching.

        Subclasses must implement this method.

        Args:
            key (str): The key to match.
            max_distance (int): Maximum Levenshtein distance allowed for a match;
                lower values are stricter. Defaults to 2.
            **kwargs (Any): Additional backend-specific matching parameters.

        Returns:
            Any: Chunks that fuzzy-match the key within the threshold.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def semantic_match(self, key: str, min_similarity: float = 0.8, metadata: dict[str, Any] | None = None, **kwargs) -> Any | None:
        """Find chunks that semantically match the key using vector similarity.

        Subclasses must implement this method.

        Args:
            key (str): The key to match.
            min_similarity (float): Minimum similarity score for semantic
                matching, in the range [0, 1]; higher values are stricter.
                Defaults to 0.8.
            metadata (dict[str, Any] | None, optional): Metadata filter applied
                to the search. Defaults to None.
            **kwargs (Any): Additional backend-specific matching parameters.

        Returns:
            Any: Chunks that semantically match the key above the threshold.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
        """Delete expired entries (for TTL eviction).

        Subclasses must implement this method.

        Args:
            now (datetime): The current datetime used for the expiry comparison.
            max_size (int): Maximum number of entries to process. Defaults to 10000.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
        """Delete the least frequently used entries (for LFU eviction).

        Subclasses must implement this method.

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def delete_least_recently_used_entries(self, num_entries: int) -> None:
        """Delete the least recently used entries (for LRU eviction).

        Subclasses must implement this method.

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def delete_entries_by_key(self, key: str | list[str], metadata: dict[str, Any] | None = None) -> None:
        '''Delete entries by key.

        Subclasses must implement this method.

        Args:
            key (str | list[str]): The key, or list of keys, whose entries are
                deleted.
            metadata (dict[str, Any] | None, optional): Metadata filter applied
                to the search. For example, `{"key": "value"}`. Defaults to None.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        '''
    def as_cache(self, eviction_manager: BaseEvictionManager | None = None, matching_strategy: MatchingStrategy = 'exact', matching_config: dict[str, Any] | None = None, saving_config: dict[str, Any] | None = None) -> VectorCache:
        """Return a cache instance backed by this datastore.

        Args:
            eviction_manager (BaseEvictionManager | None, optional): The eviction
                manager used for cache eviction. Defaults to None, in which case
                no eviction is performed.
            matching_strategy (MatchingStrategy, optional): The strategy used to
                match keys. Defaults to MatchingStrategy.EXACT.
            matching_config (dict[str, Any] | None, optional): Configuration
                parameters for the matching strategy. Defaults to None, meaning
                no specific configuration is provided.
            saving_config (dict[str, Any] | None, optional): Configuration
                parameters for the saving strategy. Defaults to None, meaning no
                specific configuration is provided.

        Returns:
            VectorCache: A cache instance that can store and retrieve data.
        """
@@ -0,0 +1,191 @@
1
+ from _typeshed import Incomplete
2
+ from datetime import datetime
3
+ from gllm_core.schema.chunk import Chunk
4
+ from gllm_datastore.constants import DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
5
+ from gllm_datastore.utils.converter import cosine_distance_to_similarity_score as cosine_distance_to_similarity_score, similarity_score_to_cosine_distance as similarity_score_to_cosine_distance
6
+ from gllm_datastore.vector_data_store.mixin.cache_compatible_mixin import CacheCompatibleMixin as CacheCompatibleMixin
7
+ from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore as BaseVectorDataStore
8
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
9
+ from redis import Redis as Redis
10
+ from redisvl.query.filter import FilterExpression as FilterExpression
11
+ from typing import Any
12
+
13
# Upper bound on the Levenshtein distance accepted by fuzzy_match (Redis Search
# supports at most 3).
FUZZY_MATCH_MAX_DISTANCE: int

class RedisVectorDataStore(BaseVectorDataStore, CacheCompatibleMixin):
    """Vector data store backed by Redis, using RedisVL for vector search.

    Provides vector storage and retrieval on top of Redis Vector Search via
    RedisVL and langchain-redis.

    Attributes:
        redis_url (str): URL for the Redis connection.
        index_name (str): Name of the vector index.
        search_index (SearchIndex): RedisVL SearchIndex instance.
        cache_store (SemanticCache): RedisVL SemanticCache instance.
        embedding (BaseEMInvoker | None): The embedding model used for
            vectorization.
    """
    index_name: Incomplete
    url: Incomplete
    client: Incomplete
    filterable_fields: Incomplete
    cache_store: Incomplete
    def __init__(self, index_name: str, url: str | None = None, client: Redis | None = None, embedding: BaseEMInvoker | None = None, additional_filter_fields: list[dict[str, Any]] | None = None) -> None:
        '''Initialize the Redis vector store using RedisVL and langchain-redis.

        Args:
            index_name (str): Name of the index to use.
            url (str): URL for the Redis connection.
            client (Redis | None, optional): Redis client to use.
            embedding (BaseEMInvoker | None, optional): Embedding function used
                for vectorization. Defaults to None, in which case the default
                embedding model (redis/langcache-embed-v1) is used.
            additional_filter_fields (list[dict[str, Any]] | None, optional):
                Extra filterable fields added to the index. For example, to make
                `entry_id` filterable, pass `[{"name": "entry_id", "type": "text"}]`.
                Defaults to None.

        Notes:
            Besides `additional_filter_fields`, the following default filterable
            fields are always created:
            1. prompt: TEXT (default from redisvl).
            2. response: TEXT (default from redisvl).
            3. prompt_vector: VECTOR (default from redisvl).
            4. chunk_id: TEXT (default additional_filter_fields).

        Raises:
            TypeError: If `embedding` is not an instance of `BaseEMInvoker`.
        '''
    async def get_size(self) -> int:
        """Return the total number of vectors in the index.

        Returns 0 when the index is not initialized.

        Returns:
            int: The total number of vectors.
        """
    async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
        """Search for semantically similar documents, including similarity scores.

        Args:
            query (str): The query text to search for.
            top_k (int): Number of top results to return.
            retrieval_params (dict[str, Any] | None, optional): Additional query
                parameters, such as:
                - filter: Redis filter expression (RedisVL FilterExpression) to
                  narrow results.

        Returns:
            list[Chunk]: Chunks semantically similar to the query.
        """
    async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
        """Retrieve chunks by their IDs.

        Args:
            id_ (str | list[str]): A single chunk ID or a list of chunk IDs.

        Returns:
            list[Chunk]: The retrieved chunks.
        """
    async def add_chunks(self, chunks: Chunk | list[Chunk], **kwargs) -> list[str]:
        """Add chunks to the vector store.

        Args:
            chunks (Chunk | list[Chunk]): A single chunk or a list of chunks.
            **kwargs: Additional parameters for adding chunks.

        Returns:
            list[str]: IDs of the added chunks.
        """
    async def delete_chunks(self, query: str, **kwargs: Any) -> None:
        '''Delete chunks by filter/query. Not supported for the Redis backend.

        Args:
            query (str): The query to delete chunks by. For example, "user_*"
                matches keys like "user_1", "user_2", etc.
            **kwargs: Additional keyword arguments.
        '''
    async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
        """Delete chunks from the vector store by their IDs.

        Args:
            ids (str | list[str]): A single ID or a list of IDs to delete.
            **kwargs: Additional keyword arguments.
        """
    async def exact_match(self, key: str, metadata: dict[str, Any] | None = None) -> Any | None:
        '''Find chunks whose prompt exactly matches the given key.

        Args:
            key (str): The prompt to match.
            metadata (dict[str, Any] | None, optional): Metadata filter applied
                to the search. For example, `{"key": "value"}`. Defaults to None.

        Returns:
            Any: The value stored with the matching prompt, or None if no match
                is found.
        '''
    async def fuzzy_match(self, key: str, max_distance: int = 2, metadata: dict[str, Any] | None = None) -> Any | None:
        '''Find chunks that approximately match the key using fuzzy matching.

        Args:
            key (str): The key to match.
            max_distance (int): Maximum allowed distance for fuzzy matching
                (higher values allow more differences). Maximum is 3. Defaults
                to 2.
            metadata (dict[str, Any] | None, optional): Metadata filter applied
                to the search. For example, `{"key": "value"}`. Defaults to None.

        Note:
            The maximum fuzzy distance is 3 — a limitation of Redis Vector
            Search / the Redis Search module; see the Redis Search query-syntax
            documentation on fuzzy matching for details.

        Returns:
            Any: The value with the closest fuzzy match, or None if no match is
                found.
        '''
    async def semantic_match(self, key: str, min_similarity: float = 0.8, metadata: dict[str, Any] | None = None) -> Any | None:
        '''Find chunks that semantically match the key using vector similarity.

        Compares the vector embedding of the search key with the embeddings of
        stored keys to find semantically similar matches.

        Args:
            key (str): The key to match.
            min_similarity (float, optional): Minimum similarity score for
                semantic matching, in [0, 1]; higher values are stricter.
                Defaults to 0.8.
            metadata (dict[str, Any] | None, optional): Metadata filter applied
                to the search. For example, `{"key": "value"}`. Defaults to None.

        Returns:
            Any: The semantically closest value, or None if no match meets the
                threshold.
        '''
    async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
        """Delete expired entries (for TTL eviction).

        Args:
            now (datetime): The current datetime used for the expiry comparison.
            max_size (int): Maximum number of entries to process. Defaults to 10000.

        Raises:
            NotImplementedError: App-level eviction is currently not supported
                for RedisVectorDataStore.
        """
    async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
        """Delete the least frequently used entries (for LFU eviction).

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: App-level eviction is currently not supported
                for RedisVectorDataStore.
        """
    async def delete_least_recently_used_entries(self, num_entries: int) -> None:
        """Delete the least recently used entries (for LRU eviction).

        Args:
            num_entries (int): Number of entries to delete.

        Raises:
            NotImplementedError: App-level eviction is currently not supported
                for RedisVectorDataStore.
        """
    async def delete_entries_by_key(self, key: str | list[str], metadata: dict[str, Any] | None = None) -> None:
        '''Delete entries by key.

        Args:
            key (str | list[str]): The key, or list of keys, whose entries are
                deleted.
            metadata (dict[str, Any] | None, optional): Metadata filter applied
                to the search. For example, `{"key": "value"}`. Defaults to None.
        '''
    async def clear(self) -> None:
        """Clear all entries in the storage."""
@@ -0,0 +1,146 @@
1
+ from abc import ABC, abstractmethod
2
+ from gllm_core.schema.chunk import Chunk
3
+ from gllm_datastore.constants import DEFAULT_TOP_K as DEFAULT_TOP_K
4
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
5
+ from langchain_core.embeddings import Embeddings
6
+ from typing import Any
7
+
8
class BaseVectorDataStore(ABC):
    """Abstract base class for vector data stores in the retrieval system.

    Defines the interface every vector data store implementation must follow.
    Subclasses must implement `query`, `query_by_id`, `add_chunks`,
    `delete_chunks`, and `delete_chunks_by_ids`.
    """
    @property
    def embedding(self) -> BaseEMInvoker | Embeddings | None:
        """Return the embedding model associated with this data store.

        Returns:
            BaseEMInvoker | Embeddings | None: The embedding model.
        """
    async def get_size(self) -> int:
        """Return the total number of vectors in the index.

        Returns 0 when the index is not initialized.

        Returns:
            int: The total number of vectors.
        """
    @abstractmethod
    async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
        """Execute a query against the data store.

        Subclasses must implement this method.

        Args:
            query (str): The query string to execute.
            top_k (int, optional): Maximum number of results to return.
                Defaults to DEFAULT_TOP_K.
            retrieval_params (dict[str, Any] | None, optional): Additional
                query parameters. Defaults to None.

        Returns:
            list[Chunk]: The query results.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
        """Retrieve chunks by their IDs.

        Subclasses must implement this method.

        Args:
            id_ (str | list[str]): A single ID or a list of IDs to retrieve.

        Returns:
            list[Chunk]: The retrieved chunks.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def add_chunks(self, chunk: Chunk | list[Chunk], **kwargs) -> list[str]:
        """Add a chunk or a list of chunks to the data store.

        Subclasses must implement this method.

        Args:
            chunk (Chunk | list[Chunk]): A single chunk or a list of chunks to
                index.
            **kwargs: Additional keyword arguments passed to the backend.

        Returns:
            list[str]: Unique identifiers (IDs) assigned to the added chunks.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def delete_chunks(self, **kwargs: Any) -> None:
        """Delete chunks from the data store by filter or query.

        Subclasses must implement this method.

        Args:
            **kwargs: Keyword arguments specifying the filter or query for
                deletion; the exact parameters depend on the backend.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    @abstractmethod
    async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
        """Delete a chunk or a list of chunks by their IDs.

        Subclasses must implement this method.

        Args:
            ids (str | list[str]): A single ID or a list of IDs to delete.
            **kwargs: Additional keyword arguments.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
    async def clear(self) -> None:
        """Clear all entries in the storage.

        Subclasses should implement this method.
        """
    async def query_by_field(self, retrieval_params: dict[str, Any], limit: int | None = None, **kwargs) -> list[Chunk]:
        """Retrieve documents matching specific metadata constraints.

        Filters stored chunks by metadata values rather than vector similarity.
        Useful for structured lookups such as all chunks from a given source,
        tagged with a label, or authored by a particular user.

        Unlike semantic search, `query_by_field` operates purely on the
        metadata fields attached to each document, allowing precise key-value
        filtering.

        Args:
            retrieval_params (dict[str, Any]): Dictionary of filter criteria.
            limit (int | None, optional): Maximum number of results to return.
                If None, all matching documents are returned.
            **kwargs: Additional arguments for datastore-specific behavior or
                filtering logic.

        Returns:
            list[Chunk]: Chunks that satisfy the metadata criteria.

        Raises:
            NotImplementedError: If not implemented in the subclass.
        """
    async def query_by_vector(self, vector: list[float], top_k: int = ..., min_similarity: float = 0.8, retrieval_params: dict | None = None) -> list[Chunk]:
        """Search for documents similar to a given embedding vector.

        Args:
            vector (list[float]): Query embedding vector compared against
                stored vectors.
            top_k (int, optional): Number of top results to return. Defaults to
                DEFAULT_TOP_K.
            min_similarity (float): Minimum similarity score for a match.
            retrieval_params (dict | None, optional): Filter parameters to
                narrow the search:
                - filter (Where): Metadata-based filter.
                - where_document (WhereDocument): Content-based filter.
                Defaults to None.

        Returns:
            list[Chunk]: Chunks with similarity scores based on the input
                vector.
        """
@@ -0,0 +1 @@
1
+ *
Binary file
gllm_datastore.pyi ADDED
@@ -0,0 +1,156 @@
1
+ # This file was generated by Nuitka
2
+
3
+ # Stubs included by default
4
+
5
+
6
+ __name__ = ...
7
+
8
+
9
+
10
+ # Modules used internally, to allow implicit dependencies to be seen:
11
+ import os
12
+ import abc
13
+ import enum
14
+ import typing
15
+ import asyncio
16
+ import functools
17
+ import json
18
+ import collections
19
+ import collections.OrderedDict
20
+ import datetime
21
+ import gllm_core
22
+ import gllm_core.schema
23
+ import gllm_core.schema.chunk
24
+ import gllm_core.utils
25
+ import gllm_datastore.core.filters.FilterClause
26
+ import gllm_datastore.core.filters.QueryFilter
27
+ import gllm_datastore.core.filters.QueryOptions
28
+ import gzip
29
+ import pickle
30
+ import shutil
31
+ import time
32
+ import gllm_core.utils.logger_manager
33
+ import posixpath
34
+ import gllm_datastore.cache.hybrid_cache.key_matcher.ExactKeyMatcher
35
+ import gllm_datastore.utils.convert_ttl_to_seconds
36
+ import gllm_core.utils.imports
37
+ import Levenshtein
38
+ import gllm_datastore.vector_data_store.ElasticsearchVectorDataStore
39
+ import redis
40
+ import hashlib
41
+ import gllm_datastore.core.filters.FilterCondition
42
+ import gllm_datastore.core.filters.FilterOperator
43
+ import __future__
44
+ import gllm_inference
45
+ import gllm_inference.schema
46
+ import pydantic
47
+ import gllm_datastore.data_store.chroma.ChromaDataStore
48
+ import gllm_datastore.data_store.elasticsearch.ElasticsearchDataStore
49
+ import gllm_datastore.data_store.in_memory.InMemoryDataStore
50
+ import gllm_datastore.data_store.redis.RedisDataStore
51
+ import gllm_inference.em_invoker
52
+ import gllm_inference.em_invoker.em_invoker
53
+ import gllm_datastore.core.capabilities.FulltextCapability
54
+ import gllm_datastore.core.capabilities.GraphCapability
55
+ import gllm_datastore.core.capabilities.VectorCapability
56
+ import gllm_datastore.cache.Cache
57
+ import gllm_datastore.cache.MatchingStrategy
58
+ import chromadb
59
+ import sys
60
+ import pysqlite3
61
+ import rapidfuzz
62
+ import rapidfuzz.distance
63
+ import logging
64
+ import re
65
+ import dataclasses
66
+ import gllm_inference.em_invoker.langchain
67
+ import langchain_core
68
+ import langchain_core.runnables
69
+ import langchain_chroma
70
+ import elasticsearch
71
+ import elasticsearch.dsl
72
+ import elasticsearch.dsl.query
73
+ import gllm_core.utils.retry
74
+ import gllm_datastore.utils.flatten_dict
75
+ import langchain_elasticsearch
76
+ import langchain_elasticsearch.vectorstores
77
+ import collections.abc
78
+ import gllm_core.utils.similarity
79
+ import redis.asyncio
80
+ import redis.asyncio.client
81
+ import redis.exceptions
82
+ import redis.commands
83
+ import redis.commands.search
84
+ import redis.commands.search.query
85
+ import redis.commands.search.field
86
+ import redis.commands.search.indexDefinition
87
+ import numpy
88
+ import redisvl
89
+ import redisvl.redis
90
+ import redisvl.redis.utils
91
+ import redisvl.index
92
+ import redisvl.query
93
+ import redisvl.schema
94
+ import base64
95
+ import cryptography
96
+ import cryptography.hazmat
97
+ import cryptography.hazmat.primitives
98
+ import cryptography.hazmat.primitives.ciphers
99
+ import cryptography.hazmat.primitives.ciphers.aead
100
+ import threading
101
+ import lightrag
102
+ import tempfile
103
+ import contextlib
104
+ import gllm_inference.lm_invoker
105
+ import gllm_inference.lm_invoker.lm_invoker
106
+ import lightrag.kg
107
+ import lightrag.kg.shared_storage
108
+ import llama_index
109
+ import llama_index.core
110
+ import llama_index.core.base
111
+ import llama_index.core.base.embeddings
112
+ import llama_index.core.base.embeddings.base
113
+ import llama_index.core.graph_stores
114
+ import llama_index.core.graph_stores.types
115
+ import llama_index.core.llms
116
+ import gllm_datastore.graph_data_store.utils.LlamaIndexEMInvokerAdapter
117
+ import gllm_datastore.graph_data_store.utils.LlamaIndexLMInvokerAdapter
118
+ import llama_index.graph_stores
119
+ import llama_index.graph_stores.neo4j
120
+ import nebula3
121
+ import nebula3.Config
122
+ import nebula3.data
123
+ import nebula3.data.DataObject
124
+ import nebula3.gclient
125
+ import nebula3.gclient.net
126
+ import textwrap
127
+ import neo4j
128
+ import neo4j.exceptions
129
+ import lightrag.base
130
+ import nest_asyncio
131
+ import llama_index.core.bridge
132
+ import llama_index.core.bridge.pydantic
133
+ import llama_index.core.constants
134
+ import llama_index.core.base.llms
135
+ import llama_index.core.base.llms.types
136
+ import sqlalchemy
137
+ import sqlalchemy.engine
138
+ import pandas
139
+ import concurrent
140
+ import concurrent.futures
141
+ import concurrent.futures.Future
142
+ import concurrent.futures.ThreadPoolExecutor
143
+ import sqlalchemy.exc
144
+ import sqlalchemy.orm
145
+ import uuid
146
+ import langchain_core.documents
147
+ import langchain_core.embeddings
148
+ import langchain_core.runnables.config
149
+ import chromadb.types
150
+ import inspect
151
+ import redisvl.extensions
152
+ import redisvl.extensions.cache
153
+ import redisvl.extensions.cache.llm
154
+ import redisvl.utils
155
+ import redisvl.utils.vectorize
156
+ import redisvl.query.filter