gllm-datastore-binary 0.5.50__cp312-cp312-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. gllm_datastore/__init__.pyi +0 -0
  2. gllm_datastore/cache/__init__.pyi +4 -0
  3. gllm_datastore/cache/base.pyi +84 -0
  4. gllm_datastore/cache/cache.pyi +137 -0
  5. gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
  6. gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
  7. gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
  8. gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
  9. gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
  10. gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
  11. gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
  12. gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
  13. gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
  14. gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
  15. gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
  16. gllm_datastore/cache/utils.pyi +34 -0
  17. gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
  18. gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
  19. gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
  20. gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
  21. gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
  22. gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
  23. gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
  24. gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
  25. gllm_datastore/constants.pyi +66 -0
  26. gllm_datastore/core/__init__.pyi +7 -0
  27. gllm_datastore/core/capabilities/__init__.pyi +7 -0
  28. gllm_datastore/core/capabilities/encryption_capability.pyi +21 -0
  29. gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
  30. gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
  31. gllm_datastore/core/capabilities/hybrid_capability.pyi +184 -0
  32. gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
  33. gllm_datastore/core/filters/__init__.pyi +4 -0
  34. gllm_datastore/core/filters/filter.pyi +340 -0
  35. gllm_datastore/core/filters/schema.pyi +149 -0
  36. gllm_datastore/data_store/__init__.pyi +8 -0
  37. gllm_datastore/data_store/_elastic_core/__init__.pyi +0 -0
  38. gllm_datastore/data_store/_elastic_core/client_factory.pyi +66 -0
  39. gllm_datastore/data_store/_elastic_core/constants.pyi +27 -0
  40. gllm_datastore/data_store/_elastic_core/elastic_like_core.pyi +115 -0
  41. gllm_datastore/data_store/_elastic_core/index_manager.pyi +37 -0
  42. gllm_datastore/data_store/_elastic_core/query_translator.pyi +89 -0
  43. gllm_datastore/data_store/base.pyi +176 -0
  44. gllm_datastore/data_store/chroma/__init__.pyi +4 -0
  45. gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
  46. gllm_datastore/data_store/chroma/data_store.pyi +201 -0
  47. gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
  48. gllm_datastore/data_store/chroma/query.pyi +266 -0
  49. gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
  50. gllm_datastore/data_store/chroma/vector.pyi +197 -0
  51. gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
  52. gllm_datastore/data_store/elasticsearch/data_store.pyi +147 -0
  53. gllm_datastore/data_store/elasticsearch/fulltext.pyi +238 -0
  54. gllm_datastore/data_store/elasticsearch/query.pyi +118 -0
  55. gllm_datastore/data_store/elasticsearch/query_translator.pyi +18 -0
  56. gllm_datastore/data_store/elasticsearch/vector.pyi +180 -0
  57. gllm_datastore/data_store/exceptions.pyi +35 -0
  58. gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
  59. gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
  60. gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
  61. gllm_datastore/data_store/in_memory/query.pyi +175 -0
  62. gllm_datastore/data_store/in_memory/vector.pyi +174 -0
  63. gllm_datastore/data_store/opensearch/__init__.pyi +5 -0
  64. gllm_datastore/data_store/opensearch/data_store.pyi +160 -0
  65. gllm_datastore/data_store/opensearch/fulltext.pyi +240 -0
  66. gllm_datastore/data_store/opensearch/query.pyi +89 -0
  67. gllm_datastore/data_store/opensearch/query_translator.pyi +18 -0
  68. gllm_datastore/data_store/opensearch/vector.pyi +211 -0
  69. gllm_datastore/data_store/redis/__init__.pyi +5 -0
  70. gllm_datastore/data_store/redis/data_store.pyi +153 -0
  71. gllm_datastore/data_store/redis/fulltext.pyi +128 -0
  72. gllm_datastore/data_store/redis/query.pyi +428 -0
  73. gllm_datastore/data_store/redis/query_translator.pyi +37 -0
  74. gllm_datastore/data_store/redis/vector.pyi +131 -0
  75. gllm_datastore/data_store/sql/__init__.pyi +4 -0
  76. gllm_datastore/data_store/sql/constants.pyi +5 -0
  77. gllm_datastore/data_store/sql/data_store.pyi +201 -0
  78. gllm_datastore/data_store/sql/fulltext.pyi +164 -0
  79. gllm_datastore/data_store/sql/query.pyi +81 -0
  80. gllm_datastore/data_store/sql/query_translator.pyi +51 -0
  81. gllm_datastore/data_store/sql/schema.pyi +16 -0
  82. gllm_datastore/encryptor/__init__.pyi +4 -0
  83. gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
  84. gllm_datastore/encryptor/capability/__init__.pyi +3 -0
  85. gllm_datastore/encryptor/capability/mixin.pyi +32 -0
  86. gllm_datastore/encryptor/encryptor.pyi +52 -0
  87. gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
  88. gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
  89. gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
  90. gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
  91. gllm_datastore/graph_data_store/__init__.pyi +6 -0
  92. gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
  93. gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
  94. gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
  95. gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
  96. gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
  97. gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
  98. gllm_datastore/graph_data_store/mixins/__init__.pyi +3 -0
  99. gllm_datastore/graph_data_store/mixins/agentic_graph_tools_mixin.pyi +175 -0
  100. gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
  101. gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
  102. gllm_datastore/graph_data_store/schema.pyi +27 -0
  103. gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
  104. gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
  105. gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
  106. gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
  107. gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
  108. gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
  109. gllm_datastore/signature/__init__.pyi +0 -0
  110. gllm_datastore/signature/webhook_signature.pyi +31 -0
  111. gllm_datastore/sql_data_store/__init__.pyi +4 -0
  112. gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
  113. gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
  114. gllm_datastore/sql_data_store/constants.pyi +6 -0
  115. gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
  116. gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
  117. gllm_datastore/sql_data_store/types.pyi +31 -0
  118. gllm_datastore/utils/__init__.pyi +6 -0
  119. gllm_datastore/utils/converter.pyi +51 -0
  120. gllm_datastore/utils/dict.pyi +21 -0
  121. gllm_datastore/utils/ttl.pyi +25 -0
  122. gllm_datastore/utils/types.pyi +32 -0
  123. gllm_datastore/vector_data_store/__init__.pyi +6 -0
  124. gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
  125. gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
  126. gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
  127. gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
  128. gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
  129. gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
  130. gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
  131. gllm_datastore.build/.gitignore +1 -0
  132. gllm_datastore.cpython-312-darwin.so +0 -0
  133. gllm_datastore.pyi +178 -0
  134. gllm_datastore_binary-0.5.50.dist-info/METADATA +185 -0
  135. gllm_datastore_binary-0.5.50.dist-info/RECORD +137 -0
  136. gllm_datastore_binary-0.5.50.dist-info/WHEEL +5 -0
  137. gllm_datastore_binary-0.5.50.dist-info/top_level.txt +1 -0
gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi
@@ -0,0 +1,357 @@
+ from _typeshed import Incomplete
+ from datetime import datetime
+ from gllm_core.schema import Chunk
+ from gllm_datastore.constants import DEFAULT_REQUEST_TIMEOUT as DEFAULT_REQUEST_TIMEOUT, DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
+ from gllm_datastore.utils.converter import from_langchain as from_langchain, to_langchain as to_langchain
+ from gllm_datastore.vector_data_store.mixin.cache_compatible_mixin import CacheCompatibleMixin as CacheCompatibleMixin
+ from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore as BaseVectorDataStore
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
+ from langchain_core.embeddings import Embeddings
+ from typing import Any
+
+ DEFAULT_FETCH_K: int
+
+ class ElasticsearchVectorDataStore(BaseVectorDataStore, CacheCompatibleMixin):
+     """DataStore for interacting with Elasticsearch.
+
+     This class provides methods for executing queries and retrieving documents
+     from Elasticsearch. It relies on LangChain's ElasticsearchStore for
+     vector operations and the underlying Elasticsearch client management.
+
+     Attributes:
+         vector_store (ElasticsearchStore): The ElasticsearchStore instance for vector operations.
+         sync_vector_store (ElasticsearchStore): The ElasticsearchStore instance for sync operations.
+         index_name (str): The name of the Elasticsearch index.
+         embedding (BaseEMInvoker | Embeddings | None): The embedding model to perform vectorization.
+         logger (Logger): The logger object.
+     """
+     index_name: Incomplete
+     vector_store: Incomplete
+     logger: Incomplete
+     def __init__(self, index_name: str, embedding: BaseEMInvoker | Embeddings | None = None, connection: Any | None = None, url: str | None = None, cloud_id: str | None = None, user: str | None = None, api_key: str | None = None, password: str | None = None, vector_query_field: str = 'vector', query_field: str = 'text', distance_strategy: str | None = None, strategy: Any | None = None, request_timeout: int = ...) -> None:
+         '''Initializes an instance of the ElasticsearchVectorDataStore class.
+
+         Args:
+             index_name (str): The name of the Elasticsearch index.
+             embedding (BaseEMInvoker | Embeddings | None, optional): The embedding model to perform vectorization.
+                 Defaults to None.
+             connection (Any | None, optional): The Elasticsearch connection object. Defaults to None.
+             url (str | None, optional): The URL of the Elasticsearch server. Defaults to None.
+             cloud_id (str | None, optional): The cloud ID of the Elasticsearch cluster. Defaults to None.
+             user (str | None, optional): The username for authentication. Defaults to None.
+             api_key (str | None, optional): The API key for authentication. Defaults to None.
+             password (str | None, optional): The password for authentication. Defaults to None.
+             vector_query_field (str, optional): The field name for vector queries. Defaults to "vector".
+             query_field (str, optional): The field name for text queries. Defaults to "text".
+             distance_strategy (str | None, optional): The distance strategy for retrieval. Defaults to None.
+             strategy (Any | None, optional): The retrieval strategy. Defaults to None, in which case
+                 DenseVectorStrategy() is used.
+             request_timeout (int, optional): The request timeout. Defaults to DEFAULT_REQUEST_TIMEOUT.
+
+         Raises:
+             TypeError: If `embedding` is not an instance of `BaseEMInvoker` or `Embeddings`.
+         '''
+     async def get_size(self) -> int:
+         """Returns the total number of vectors in the index.
+
+         If the index is not initialized, returns 0.
+
+         Returns:
+             int: The total number of vectors.
+         """
+     async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
+         """Queries the Elasticsearch data store and includes similarity scores.
+
+         Args:
+             query (str): The query string.
+             top_k (int, optional): The number of top results to retrieve. Defaults to DEFAULT_TOP_K.
+             retrieval_params (dict[str, Any] | None, optional): Additional retrieval parameters. Defaults to None.
+
+         Returns:
+             list[Chunk]: A list of Chunk objects representing the retrieved documents with
+                 similarity scores.
+         """
+     async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
+         """Queries the data store by ID and returns a list of Chunk objects.
+
+         Args:
+             id_ (str | list[str]): The ID or list of IDs of the documents to query.
+
+         Returns:
+             list[Chunk]: A list of Chunk objects representing the queried documents.
+
+         Note:
+             This method is not implemented yet, because ElasticsearchStore does not
+             implement the get_by_ids method yet.
+         """
+     async def bm25_query(self, query: str, top_k: int = ..., search_fields: list[str] | None = None, filter: dict[str, Any] | None = None, metadata: dict[str, Any] | None = None, k1: float | None = None, b: float | None = None) -> list[Chunk]:
+         '''Queries the Elasticsearch data store using the BM25 algorithm for keyword-based search.
+
+         Args:
+             query (str): The query string.
+             top_k (int, optional): The number of top results to retrieve. Defaults to DEFAULT_TOP_K.
+             search_fields (list[str] | None, optional): The fields to search in. If None, defaults to ["text"].
+                 For multiple fields, uses a multi_match query. Defaults to None.
+             filter (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"category": "AI", "source": ["doc1", "doc2"]}`. Defaults to None.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 DEPRECATED: Use the `filter` parameter instead. Will be removed in a future version.
+                 For example, `{"category": "AI", "source": ["doc1", "doc2"]}`. Defaults to None.
+             k1 (float | None, optional): BM25 parameter controlling term frequency saturation.
+                 Higher values mean term frequency has more impact before diminishing returns.
+                 Typical values: 1.2-2.0. If None, uses the Elasticsearch default (~1.2). Defaults to None.
+             b (float | None, optional): BM25 parameter controlling document length normalization.
+                 0.0 = no length normalization, 1.0 = full normalization.
+                 Typical value: 0.75. If None, uses the Elasticsearch default (~0.75). Defaults to None.
+
+         Example:
+             ```python
+             # Basic BM25 query on the 'text' field
+             results = await data_store.bm25_query("machine learning")
+
+             # BM25 query on specific fields with a custom top_k
+             results = await data_store.bm25_query(
+                 "natural language",
+                 top_k=5,
+                 search_fields=["title", "abstract"]
+             )
+
+             # BM25 query with a filter
+             results = await data_store.bm25_query(
+                 "deep learning",
+                 filter={"category": "AI", "status": "published"}
+             )
+
+             # BM25 query with metadata filtering (deprecated)
+             results = await data_store.bm25_query(
+                 "deep learning",
+                 metadata={"category": "AI", "status": "published"}
+             )
+
+             # BM25 query with custom BM25 parameters for more aggressive term frequency weighting
+             results = await data_store.bm25_query(
+                 "artificial intelligence",
+                 k1=2.0,
+                 b=0.5
+             )
+
+             # BM25 query with both search fields and BM25 tuning
+             results = await data_store.bm25_query(
+                 "data science applications",
+                 search_fields=["content", "tags"],
+                 filter={"author_id": "user123", "publication_year": [2022, 2023]},
+                 k1=1.5,
+                 b=0.9
+             )
+             ```
+
+         Returns:
+             list[Chunk]: A list of Chunk objects representing the retrieved documents.
+         '''
+     async def autocomplete(self, query: str, field: str, size: int = 20, fuzzy_tolerance: int = 1, min_prefix_length: int = 3, filter_query: dict[str, Any] | None = None) -> list[str]:
+         """Provides suggestions based on a prefix query for a specific field.
+
+         Args:
+             query (str): The query string.
+             field (str): The field name for autocomplete.
+             size (int, optional): The number of suggestions to retrieve. Defaults to 20.
+             fuzzy_tolerance (int, optional): The level of fuzziness for suggestions. Defaults to 1.
+             min_prefix_length (int, optional): The minimum prefix length to trigger fuzzy matching. Defaults to 3.
+             filter_query (dict[str, Any] | None, optional): The filter query. Defaults to None.
+
+         Returns:
+             list[str]: A list of suggestions.
+         """
+     async def autosuggest(self, query: str, search_fields: list[str], autocomplete_field: str, size: int = 20, min_length: int = 3, filter_query: dict[str, Any] | None = None) -> list[str]:
+         """Generates suggestions across multiple fields using a multi_match query to broaden the search criteria.
+
+         Args:
+             query (str): The query string.
+             search_fields (list[str]): The fields to search in.
+             autocomplete_field (str): The field name for autocomplete.
+             size (int, optional): The number of suggestions to retrieve. Defaults to 20.
+             min_length (int, optional): The minimum length of the query. Defaults to 3.
+             filter_query (dict[str, Any] | None, optional): The filter query. Defaults to None.
+
+         Returns:
+             list[str]: A list of suggestions.
+         """
+     async def shingles(self, query: str, field: str, size: int = 20, min_length: int = 3, max_length: int = 30, filter_query: dict[str, Any] | None = None) -> list[str]:
+         """Searches using shingles for prefix and fuzzy matching.
+
+         Args:
+             query (str): The query string.
+             field (str): The field name for autocomplete.
+             size (int, optional): The number of suggestions to retrieve. Defaults to 20.
+             min_length (int, optional): The minimum length of the query.
+                 Queries shorter than this limit will return an empty list. Defaults to 3.
+             max_length (int, optional): The maximum length of the query.
+                 Queries exceeding this limit will return an empty list. Defaults to 30.
+             filter_query (dict[str, Any] | None, optional): The filter query. Defaults to None.
+
+         Returns:
+             list[str]: A list of suggestions.
+         """
+     async def add_chunks(self, chunk: Chunk | list[Chunk], **kwargs: Any) -> list[str]:
+         """Adds a chunk or a list of chunks to the data store.
+
+         Args:
+             chunk (Chunk | list[Chunk]): The chunk or list of chunks to add.
+             kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             list[str]: A list of unique identifiers (IDs) assigned to the added chunks.
+         """
+     async def add_embeddings(self, text_embeddings: list[tuple[str, list[float]]], metadatas: list[dict] | None = None, ids: list[str] | None = None, **kwargs) -> list[str]:
+         """Adds text embeddings to the data store.
+
+         Args:
+             text_embeddings (list[tuple[str, list[float]]]): Pairs of string and embedding to add to the store.
+             metadatas (list[dict], optional): Optional list of metadatas associated with the texts. Defaults to None.
+             ids (list[str], optional): Optional list of unique IDs. Defaults to None.
+             kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             list[str]: A list of unique identifiers (IDs) assigned to the added embeddings.
+         """
+     async def delete_chunks(self, query: dict[str, Any], **kwargs: Any) -> None:
+         """Deletes chunks from the data store based on a query.
+
+         Args:
+             query (dict[str, Any]): Query to match documents for deletion.
+             kwargs (Any): Additional keyword arguments.
+
+         Returns:
+             None
+         """
+     async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
+         """Deletes chunks from the data store based on IDs.
+
+         Args:
+             ids (str | list[str]): A single ID or a list of IDs to delete.
+             kwargs (Any): Additional keyword arguments.
+         """
+     async def exact_match(self, key: str, metadata: dict[str, Any] | None = None) -> Any | None:
+         '''Find chunks that exactly match the given key.
+
+         Args:
+             key (str): The key to match.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Returns:
+             Any: The value stored with the exact key, or None if no match is found.
+         '''
+     async def fuzzy_match(self, key: str, max_distance: int = 2, metadata: dict[str, Any] | None = None) -> Any | None:
+         '''Find chunks that approximately match the given key using fuzzy matching.
+
+         Args:
+             key (str): The key to match.
+             max_distance (int): The maximum distance for fuzzy matching, ranging from 0 to 2.
+                 Higher values are more lenient. Defaults to 2.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Returns:
+             Any: The value with the closest fuzzy match, or None if no match meets the threshold.
+         '''
+     async def semantic_match(self, key: str, min_similarity: float = 0.8, metadata: dict[str, Any] | None = None) -> Any | None:
+         '''Find chunks that semantically match the given key using vector similarity.
+
+         Args:
+             key (str): The key to match.
+             min_similarity (float): Minimum similarity score for semantic matching
+                 (higher values are more strict). Ranges from 0 to 1. Defaults to 0.8.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Returns:
+             Any: The semantically closest value, or None if no match meets the min_similarity threshold.
+         '''
+     async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
+         """Delete expired entries (for TTL eviction).
+
+         Args:
+             now (datetime): The current datetime for comparison.
+             max_size (int): The maximum number of entries to delete. Defaults to 10000.
+
+         Returns:
+             None
+         """
+     async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
+         """Delete least frequently used entries (for LFU eviction).
+
+         Args:
+             num_entries (int): Number of entries to delete.
+
+         Returns:
+             None
+         """
+     async def delete_least_recently_used_entries(self, num_entries: int) -> None:
+         """Delete least recently used entries (for LRU eviction).
+
+         Args:
+             num_entries (int): Number of entries to delete.
+
+         Returns:
+             None
+         """
+     async def delete_entries_by_key(self, key: str | list[str], metadata: dict[str, Any] | None = None) -> None:
+         '''Delete entries by key.
+
+         Example:
+             ```python
+             key = "key-1"
+             metadata = {"id": "id-1"}
+             await delete_entries_by_key(key, metadata)
+             ```
+             This deletes the entry with the key "key-1", filtered by the metadata "id": "id-1".
+
+         Args:
+             key (str | list[str]): The key or list of keys to delete entries for.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 Defaults to None.
+         '''
+     async def clear(self) -> None:
+         """Clear all entries in the storage.
+
+         Raises:
+             NotImplementedError: Currently, app-level eviction is not supported for ElasticsearchVectorDataStore.
+         """
+     async def query_by_field(self, retrieval_params: dict, limit: int | None = None, **kwargs) -> list[Chunk]:
+         """Retrieve documents that match specific metadata constraints.
+
+         This method filters and returns stored chunks based on metadata values
+         rather than vector similarity. It is particularly useful for structured lookups,
+         such as retrieving all chunks from a certain source, tagged with a specific label,
+         or authored by a particular user.
+
+         Unlike semantic search methods, `query_by_field` operates purely on metadata fields
+         associated with each document, allowing precise filtering based on key-value pairs.
+
+         Args:
+             retrieval_params (dict): Must contain a `filter` key with an Elasticsearch DSL query.
+             limit (int, optional): Maximum number of results to return.
+             **kwargs: Additional arguments for the Elasticsearch search call.
+
+         Returns:
+             list[Chunk]: A list of `Chunk` objects matching the metadata query.
+         """
+     async def query_by_vector(self, vector: list[float], top_k: int = ..., min_similarity: float = 0.8, retrieval_params: dict | None = None) -> list[Chunk]:
+         """Search for documents that are similar to a given vector.
+
+         Args:
+             vector (list[float]): The query embedding vector to compare against stored vectors.
+             top_k (int, optional): The number of top results to return. Defaults to DEFAULT_TOP_K.
+             min_similarity (float, optional): Minimum similarity score for vector similarity. Defaults to 0.8.
+             retrieval_params (dict | None, optional): Filter parameters to narrow the search:
+                 - filter (Where): Metadata-based filter.
+                 - where_document (WhereDocument): Content-based filter.
+                 Defaults to None.
+
+         Returns:
+             list[Chunk]: A list of Chunk objects with similarity scores based on the input vector.
+         """
gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi
@@ -0,0 +1,179 @@
+ from _typeshed import Incomplete
+ from gllm_core.schema.chunk import Chunk
+ from gllm_datastore.constants import CHUNK_KEYS as CHUNK_KEYS, DEFAULT_TOP_K as DEFAULT_TOP_K
+ from gllm_datastore.core.filters import QueryFilter as QueryFilter, QueryOptions as QueryOptions
+ from gllm_datastore.data_store.in_memory.data_store import InMemoryDataStore as InMemoryDataStore
+ from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore as BaseVectorDataStore
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
+ from gllm_inference.schema import Vector
+ from typing import Any
+
+ class InMemoryVectorDataStore(BaseVectorDataStore):
+     """In-memory vector data store implementation.
+
+     This class provides a simple in-memory implementation of the BaseVectorDataStore
+     that stores vectors and metadata in memory. It is primarily intended for testing
+     purposes and does not require any external services.
+
+     Attributes:
+         store (dict[str, dict[str, Any]]): Dictionary storing documents with their vectors and metadata.
+             Each entry has keys: 'id', 'vector', 'text', 'metadata'.
+         embedding (BaseEMInvoker | None): Optional embedding model for vectorization.
+     """
+     store: Incomplete
+     def __init__(self, embedding: BaseEMInvoker | None = None) -> None:
+         """Initialize the in-memory vector data store.
+
+         Args:
+             embedding (BaseEMInvoker | None, optional): The embedding model to perform vectorization.
+                 Defaults to None, in which case vectors must be provided manually when adding chunks.
+         """
+     async def get_size(self) -> int:
+         """Return the number of items in the data store.
+
+         Returns:
+             int: The number of items in the data store.
+         """
+     async def add_chunks(self, chunk: Chunk | list[Chunk], vector: Vector | list[Vector] | None = None) -> list[str]:
+         '''Adds a chunk or a list of chunks to the data store.
+
+         Example:
+             ```python
+             await store.add_chunks(
+                 [
+                     Chunk(id="1", content="AI contains machine learning", metadata={"topic": "AI"}),
+                     Chunk(id="2", content="AI in 2025", metadata={"topic": "AI"}),
+                 ]
+             )
+             ```
+
+         Args:
+             chunk (Chunk | list[Chunk]): A single chunk or a list of chunks to index.
+             vector (Vector | list[Vector] | None, optional): A manual vector specification.
+                 Defaults to None, in which case the embedding model will be used.
+                 The vector length must match the embedding size used by the store.
+
+         Returns:
+             list[str]: A list of unique identifiers (IDs) assigned to the added chunks.
+
+         Raises:
+             ValueError: If the number of chunks and vectors are not the same.
+             ValueError: If no embedding model is provided and no vector is specified.
+         '''
+     async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
+         '''Executes a query on the data store using semantic similarity.
+
+         Example:
+             ```python
+             chunks = await store.add_chunks(
+                 [
+                     Chunk(id="1", content="AI contains machine learning", metadata={"topic": "AI"}),
+                     Chunk(id="2", content="AI in 2025", metadata={"topic": "AI"}),
+                 ]
+             )
+             await store.query(query="AI and machine learning", retrieval_params={"topic": "AI"})
+             ```
+
+         Args:
+             query (str): The query string to execute.
+             top_k (int, optional): The maximum number of results to return. Defaults to DEFAULT_TOP_K.
+             retrieval_params (dict[str, Any] | None, optional): Additional parameters for the query.
+                 Defaults to None.
+
+         Returns:
+             list[Chunk]: A list of query results with similarity scores.
+
+         Raises:
+             ValueError: If no embedding model is provided.
+         '''
+     async def query_by_vector(self, vector: Vector, top_k: int = ..., min_similarity: float = 0.8, retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
+         '''Search for documents that are similar to a given vector.
+
+         Example:
+             ```python
+             chunks = await store.add_chunks(
+                 [
+                     Chunk(id="1", content="AI contains machine learning", metadata={"topic": "AI"}),
+                     Chunk(id="2", content="AI in 2025", metadata={"topic": "AI"}),
+                 ]
+             )
+             query_vector = await embedding.invoke("AI and machine learning")
+             await store.query_by_vector(query_vector, retrieval_params={"topic": "AI"})
+             ```
+
+         Args:
+             vector (Vector): The query embedding vector to compare against stored vectors.
+             top_k (int, optional): The number of top results to return. Defaults to DEFAULT_TOP_K.
+             min_similarity (float, optional): Minimum similarity score for vector similarity. Defaults to 0.8.
+             retrieval_params (dict[str, Any] | None, optional): Filter parameters to narrow the search.
+                 Defaults to None.
+
+         Returns:
+             list[Chunk]: A list of Chunk objects with similarity scores based on the input vector.
+         '''
+     async def query_by_field(self, retrieval_params: dict[str, Any], limit: int | None = None, **kwargs) -> list[Chunk]:
+         '''Retrieve documents that match specific metadata constraints.
+
+         Example:
+             ```python
+             sample_chunks = [
+                 Chunk(id="1", content="AI is a topic", metadata={"topic": "AI"}),
+                 Chunk(id="2", content="Deep learning is a topic", metadata={"topic": "Deep Learning"}),
+             ]
+             await store.add_chunks(sample_chunks)
+             await store.query_by_field({"topic": "AI"})
+             ```
+
+         Args:
+             retrieval_params (dict[str, Any]): A dictionary with metadata field names as keys and their
+                 expected values.
+             limit (int | None, optional): The maximum number of results to return. Defaults to None, in which
+                 case all matching documents will be returned.
+             **kwargs: Additional arguments (currently unused).
+
+         Returns:
+             list[Chunk]: A list of Chunk objects that satisfy the metadata criteria.
+         '''
+     async def query_by_id(self, id_: str | list[str]) -> list[Chunk]:
+         '''Retrieves chunks by their IDs.
+
+         Example:
+             ```python
+             chunks = await store.add_chunks(
+                 [
+                     Chunk(id="1", content="AI contains machine learning", metadata={"topic": "AI"}),
+                     Chunk(id="2", content="AI in 2025", metadata={"topic": "AI"}),
+                 ]
+             )
+             await store.query_by_id(["1", "2"])
+             ```
+
+         Args:
+             id_ (str | list[str]): A single ID or a list of IDs to retrieve.
+
+         Returns:
+             list[Chunk]: A list of retrieved chunks.
+         '''
+     async def delete_chunks(self, retrieval_params: dict[str, Any] | None = None) -> None:
+         '''Deletes chunks from the data store by filter criteria.
+
+         Example:
+             ```python
+             sample_chunks = [
+                 Chunk(id="1", content="AI is a topic", metadata={"topic": "AI"}),
+                 Chunk(id="2", content="Deep learning is a topic", metadata={"topic": "Deep Learning"}),
+             ]
+             await store.add_chunks(sample_chunks)
+             await store.delete_chunks(retrieval_params={"topic": "AI"})
+             ```
+
+         Args:
+             retrieval_params (dict[str, Any] | None, optional): A dictionary with metadata field names as keys
+                 and their expected values. Defaults to None, in which case no operation is performed (no-op).
+         '''
+     async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
+         '''Deletes a chunk or a list of chunks from the data store by their IDs.
+
+         Example:
+             ```python
+             await store.delete_chunks_by_ids(["1", "2"])
+             ```
+
+         Args:
+             ids (str | list[str]): A single ID or a list of IDs to delete.
+             **kwargs: Additional keyword arguments (currently unused).
+         '''
+     async def clear(self) -> None:
+         """Clear all entries in the storage."""