gllm-datastore-binary 0.5.45__cp311-cp311-macosx_13_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gllm-datastore-binary might be problematic.
- gllm_datastore/__init__.pyi +0 -0
- gllm_datastore/cache/__init__.pyi +4 -0
- gllm_datastore/cache/base.pyi +84 -0
- gllm_datastore/cache/cache.pyi +137 -0
- gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
- gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
- gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
- gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
- gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
- gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
- gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
- gllm_datastore/cache/utils.pyi +34 -0
- gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
- gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
- gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
- gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
- gllm_datastore/constants.pyi +66 -0
- gllm_datastore/core/__init__.pyi +7 -0
- gllm_datastore/core/capabilities/__init__.pyi +5 -0
- gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
- gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
- gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
- gllm_datastore/core/filters/__init__.pyi +4 -0
- gllm_datastore/core/filters/filter.pyi +340 -0
- gllm_datastore/core/filters/schema.pyi +149 -0
- gllm_datastore/data_store/__init__.pyi +7 -0
- gllm_datastore/data_store/base.pyi +138 -0
- gllm_datastore/data_store/chroma/__init__.pyi +4 -0
- gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
- gllm_datastore/data_store/chroma/data_store.pyi +202 -0
- gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
- gllm_datastore/data_store/chroma/query.pyi +266 -0
- gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
- gllm_datastore/data_store/chroma/vector.pyi +197 -0
- gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
- gllm_datastore/data_store/elasticsearch/data_store.pyi +119 -0
- gllm_datastore/data_store/elasticsearch/fulltext.pyi +237 -0
- gllm_datastore/data_store/elasticsearch/query.pyi +114 -0
- gllm_datastore/data_store/elasticsearch/vector.pyi +179 -0
- gllm_datastore/data_store/exceptions.pyi +35 -0
- gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
- gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
- gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
- gllm_datastore/data_store/in_memory/query.pyi +175 -0
- gllm_datastore/data_store/in_memory/vector.pyi +174 -0
- gllm_datastore/data_store/redis/__init__.pyi +5 -0
- gllm_datastore/data_store/redis/data_store.pyi +154 -0
- gllm_datastore/data_store/redis/fulltext.pyi +128 -0
- gllm_datastore/data_store/redis/query.pyi +428 -0
- gllm_datastore/data_store/redis/query_translator.pyi +37 -0
- gllm_datastore/data_store/redis/vector.pyi +131 -0
- gllm_datastore/encryptor/__init__.pyi +4 -0
- gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
- gllm_datastore/encryptor/encryptor.pyi +52 -0
- gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
- gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
- gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
- gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
- gllm_datastore/graph_data_store/__init__.pyi +6 -0
- gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
- gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
- gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
- gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
- gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
- gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
- gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
- gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
- gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
- gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
- gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
- gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
- gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
- gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
- gllm_datastore/sql_data_store/__init__.pyi +4 -0
- gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
- gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
- gllm_datastore/sql_data_store/constants.pyi +6 -0
- gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
- gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
- gllm_datastore/sql_data_store/types.pyi +31 -0
- gllm_datastore/utils/__init__.pyi +6 -0
- gllm_datastore/utils/converter.pyi +51 -0
- gllm_datastore/utils/dict.pyi +21 -0
- gllm_datastore/utils/ttl.pyi +25 -0
- gllm_datastore/utils/types.pyi +32 -0
- gllm_datastore/vector_data_store/__init__.pyi +6 -0
- gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
- gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
- gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
- gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
- gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
- gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
- gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
- gllm_datastore.build/.gitignore +1 -0
- gllm_datastore.cpython-311-darwin.so +0 -0
- gllm_datastore.pyi +156 -0
- gllm_datastore_binary-0.5.45.dist-info/METADATA +178 -0
- gllm_datastore_binary-0.5.45.dist-info/RECORD +108 -0
- gllm_datastore_binary-0.5.45.dist-info/WHEEL +5 -0
- gllm_datastore_binary-0.5.45.dist-info/top_level.txt +1 -0
gllm_datastore/data_store/chroma/query.pyi
@@ -0,0 +1,266 @@
import logging
from chromadb.types import Where, WhereDocument
from dataclasses import dataclass
from enum import StrEnum
from gllm_datastore.core.filters.schema import FilterOperator as FilterOperator, QueryFilter as QueryFilter
from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator as ChromaQueryTranslator
from typing import Any

DEFAULT_NUM_CANDIDATES: int

class ChromaCollectionKeys:
    """Constants for ChromaDB collection method keyword arguments.

    This class provides constants for all string literals used in ChromaDB
    collection method calls (get, delete, query, etc.) to avoid magic strings
    and improve maintainability.

    Attributes:
        WHERE (str): Keyword for metadata filtering condition.
        WHERE_DOCUMENT (str): Keyword for document content filtering condition.
        IDS (str): Keyword for filtering by document IDs.
        INCLUDE (str): Keyword for specifying fields to include in results.
        LIMIT (str): Keyword for limiting the number of results.
        METADATA_PREFIX (str): Prefix for metadata field keys.
    """
    WHERE: str
    WHERE_DOCUMENT: str
    IDS: str
    INCLUDE: str
    LIMIT: str
    METADATA_PREFIX: str

class ChromaOperators(StrEnum):
    """Constants for ChromaDB query operators.

    This class provides constants for all operator string literals used in
    ChromaDB query expressions to avoid magic strings and improve maintainability.

    Attributes:
        AND (str): Logical AND operator for combining filters.
        OR (str): Logical OR operator for combining filters.
        NE (str): Not equal comparison operator.
        GT (str): Greater than comparison operator.
        LT (str): Less than comparison operator.
        GTE (str): Greater than or equal comparison operator.
        LTE (str): Less than or equal comparison operator.
        IN (str): Array membership operator (value in list).
        NIN (str): Array non-membership operator (value not in list).
        TEXT_CONTAINS (str): Document content substring match operator.
        NOT_CONTAINS (str): Document content substring exclusion operator.
    """
    AND: str
    OR: str
    NE: str
    GT: str
    LT: str
    GTE: str
    LTE: str
    IN: str
    NIN: str
    TEXT_CONTAINS: str
    NOT_CONTAINS: str

class ChromaOperatorMapper:
    """Maps FilterOperator to ChromaDB operators and provides inverse operator mappings.

    This class encapsulates operator translation logic.

    Attributes:
        OPERATOR_TO_CHROMA (dict[FilterOperator, str]): Mapping from FilterOperator to ChromaDB operators.
        OPERATOR_INVERSE (dict[FilterOperator, FilterOperator]): Mapping from FilterOperator to its inverse operator.
    """
    OPERATOR_TO_CHROMA: dict[FilterOperator, str]
    OPERATOR_INVERSE: dict[FilterOperator, FilterOperator]
    @classmethod
    def get_inverse_operator(cls, operator: FilterOperator) -> FilterOperator | None:
        """Get the inverse operator for a given FilterOperator.

        Args:
            operator (FilterOperator): The operator to get the inverse for.

        Returns:
            FilterOperator | None: The inverse operator, or None if no inverse exists.
        """
    @classmethod
    def has_inverse(cls, operator: FilterOperator) -> bool:
        """Check if an operator has an inverse mapping.

        Args:
            operator (FilterOperator): The operator to check.

        Returns:
            bool: True if the operator has an inverse, False otherwise.
        """

@dataclass
class ChromaQueryComponents:
    """ChromaDB query components extracted from a QueryFilter.

    Attributes:
        where_condition (Where | None): Where clause for metadata filters, or None.
        where_document (WhereDocument | None): WhereDocument clause for content filters, or None.
        id_values (list[str] | None): List of IDs for id filters, or None.
    """
    where_condition: Where | None
    where_document: WhereDocument | None
    id_values: list[str] | None
    def to_dict(self) -> dict[str, Any] | None:
        """Convert to ChromaDB kwargs dict, omitting None values.

        Returns:
            dict[str, Any] | None: Dictionary with non-None components,
                or None if all components are None/empty.
        """

def sanitize_metadata(metadata: dict[str, Any] | None, logger: logging.Logger) -> dict[str, Any]:
    '''Sanitize metadata by removing list values that ChromaDB doesn't support.

    ChromaDB only supports str, int, float, or bool as metadata values.
    This function filters out list values and logs warnings for each removed key.

    Examples:
        1. Remove list values:
        ```python
        logger = logging.getLogger(__name__)
        input_meta = {"status": "active", "tags": ["a", "b"], "age": 30}
        out = sanitize_metadata(input_meta, logger)
        # out -> {"status": "active", "age": 30}
        ```

        2. Handle None input:
        ```python
        out = sanitize_metadata(None, logging.getLogger(__name__))
        # out -> {}
        ```

    Args:
        metadata (dict[str, Any] | None): Metadata dictionary to sanitize.
        logger (logging.Logger): Logger instance for warning messages.

    Returns:
        dict[str, Any]: Sanitized metadata with list values removed.
    '''
def build_chroma_get_kwargs(filters: QueryFilter | None, query_translator: ChromaQueryTranslator, include: list[str] | None = None, limit: int | None = None, **additional_kwargs: Any) -> dict[str, Any]:
    '''Build kwargs dictionary for ChromaDB collection.get() operations.

    This function processes filters and builds a kwargs dictionary that includes
    where, where_document, ids, include, and limit parameters as needed.

    Examples:
        1. Build kwargs with metadata and content filters:
        ```python
        from gllm_datastore.core.filters import filter as F
        from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator

        translator = ChromaQueryTranslator()
        filters = F.and_(
            F.eq("metadata.status", "active"),
            F.text_contains("content", "python"),
        )

        out = build_chroma_get_kwargs(filters, translator, include=["documents"], limit=10)
        # out ->
        # {
        #     "where": {"status": "active"},
        #     "where_document": {"$contains": "python"},
        #     "include": ["documents"],
        #     "limit": 10
        # }
        ```

        2. Build kwargs using id filters:
        ```python
        from gllm_datastore.core.filters import filter as F
        from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator

        translator = ChromaQueryTranslator()
        filters = F.or_(F.eq("id", "123"), F.in_("id", ["a", "b"]))
        out = build_chroma_get_kwargs(filters, translator)
        # out -> {"ids": ["123", "a", "b"]}
        ```

    Args:
        filters (QueryFilter | None): QueryFilter to process.
        query_translator (ChromaQueryTranslator): Query translator instance to use.
        include (list[str] | None, optional): List of fields to include in results.
            Defaults to None.
        limit (int | None, optional): Maximum number of results to return.
            Defaults to None.
        **additional_kwargs: Additional kwargs to include in the result.

    Returns:
        dict[str, Any]: Dictionary of kwargs ready for ChromaDB collection.get() call.
    '''
def build_chroma_delete_kwargs(filters: QueryFilter | None, query_translator: ChromaQueryTranslator, **additional_kwargs: Any) -> dict[str, Any]:
    '''Build kwargs dictionary for ChromaDB collection.delete() operations.

    This function processes filters and builds a kwargs dictionary that includes
    where, where_document, and ids parameters as needed.

    Examples:
        1. Delete by ids or where:
        ```python
        from gllm_datastore.core.filters import filter as F
        from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator

        translator = ChromaQueryTranslator()
        filters = F.and_(
            F.in_("id", ["x1", "x2"]),
            F.eq("metadata.status", "inactive"),
        )
        out = build_chroma_delete_kwargs(filters, translator)
        # out ->
        # {
        #     "ids": ["x1", "x2"],
        #     "where": {"status": "inactive"}
        # }
        ```

    Args:
        filters (QueryFilter | None): QueryFilter to process.
        query_translator (ChromaQueryTranslator): Query translator instance to use.
        **additional_kwargs: Additional kwargs to include in the result.

    Returns:
        dict[str, Any]: Dictionary of kwargs ready for ChromaDB collection.delete() call.
    '''
def extract_chroma_query_components(filters: QueryFilter | None) -> ChromaQueryComponents:
    '''Prepare all ChromaDB query parameters from a QueryFilter.

    This function processes a QueryFilter and extracts:
    1. Metadata filters -> Where clause
    2. Content filters -> WhereDocument clause
    3. id filters -> ids parameter

    Only operators natively supported by ChromaDB are allowed:
    1. id: EQ, IN (using ids parameter)
    2. content: TEXT_CONTAINS (substring match in document content, maps to $contains)
    3. metadata: EQ, NE, GT, LT, GTE, LTE, IN, NIN (using where clause)
    4. metadata: ARRAY_CONTAINS (array membership, not supported by ChromaDB - raises NotImplementedError)

    Examples:
        1. Extract all components from a mixed filter:
        ```python
        from gllm_datastore.core.filters import filter as F

        filters = F.and_(
            F.eq("metadata.status", "active"),
            F.text_contains("content", "python"),
            F.in_("id", ["a", "b"]),
        )
        components = extract_chroma_query_components(filters)
        # components.where_condition -> dict
        # components.where_document -> dict
        # components.id_values -> ["a", "b"]
        ```

    Args:
        filters (QueryFilter | None): QueryFilter to process.

    Returns:
        ChromaQueryComponents: Dataclass containing where_condition, where_document, and id_values.

    Raises:
        NotImplementedError: If unsupported operators are used for id or content filters.
    '''
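The builders above return kwargs meant to be splatted straight into ChromaDB collection calls. A minimal usage sketch, assuming a local chromadb client and an existing collection; the filter helpers (`F.and_`, `F.eq`, `F.text_contains`, `F.in_`) are used exactly as in the docstring examples above:

```python
import logging

import chromadb
from gllm_datastore.core.filters import filter as F
from gllm_datastore.data_store.chroma.query import (
    build_chroma_delete_kwargs,
    build_chroma_get_kwargs,
    sanitize_metadata,
)
from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator

# Assumption: an in-process client and a collection named "docs".
client = chromadb.Client()
collection = client.get_or_create_collection("docs")
translator = ChromaQueryTranslator()

# Fetch active documents whose content mentions "python".
filters = F.and_(F.eq("metadata.status", "active"), F.text_contains("content", "python"))
get_kwargs = build_chroma_get_kwargs(filters, translator, include=["documents"], limit=10)
results = collection.get(**get_kwargs)

# Delete two records by id.
delete_kwargs = build_chroma_delete_kwargs(F.in_("id", ["x1", "x2"]), translator)
collection.delete(**delete_kwargs)

# Strip list values ChromaDB cannot store before writing metadata.
clean_meta = sanitize_metadata({"status": "active", "tags": ["a", "b"]}, logging.getLogger(__name__))
# clean_meta -> {"status": "active"}
```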
gllm_datastore/data_store/chroma/query_translator.pyi
@@ -0,0 +1,41 @@
from dataclasses import dataclass
from gllm_datastore.constants import CHUNK_KEYS as CHUNK_KEYS
from gllm_datastore.core.filters.schema import FilterClause as FilterClause, FilterCondition as FilterCondition, FilterOperator as FilterOperator, QueryFilter as QueryFilter
from gllm_datastore.data_store.chroma.query import ChromaCollectionKeys as ChromaCollectionKeys, ChromaOperatorMapper as ChromaOperatorMapper, ChromaOperators as ChromaOperators, ChromaQueryComponents as ChromaQueryComponents

@dataclass
class FilterSeparationResult:
    """Intermediate result from separating special filters (id, content) from metadata filters.

    Attributes:
        id_values (list[str] | None): Extracted ID values, or None if no ID filters found.
        document_filters (list[FilterClause | QueryFilter]): List of content FilterClauses or
            QueryFilters for where_document. QueryFilters are used to represent NOT conditions.
        metadata_filters (list[FilterClause | QueryFilter]): Metadata filters for where clause.
        condition (FilterCondition): The original FilterCondition from the QueryFilter.
    """
    id_values: list[str] | None
    document_filters: list[FilterClause | QueryFilter]
    metadata_filters: list[FilterClause | QueryFilter]
    condition: FilterCondition

class ChromaQueryTranslator:
    """Translates QueryFilter and FilterClause objects to ChromaDB native filter syntax.

    This class encapsulates all query translation logic for ChromaDB, converting
    structured FilterClause and QueryFilter objects into ChromaDB's where, where_document,
    and ids parameters.
    """
    def translate(self, filters: QueryFilter | None = None) -> ChromaQueryComponents:
        """Translate QueryFilter to ChromaDB query components.

        This is the main entry point for query translation. It handles None filters
        and orchestrates filter separation and translation.

        Args:
            filters (QueryFilter | None, optional): Structured QueryFilter to translate. Defaults to None.

        Returns:
            ChromaQueryComponents: ChromaDB query components containing where,
                where_document, and id_values, or None if no filters are provided.
        """
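A small sketch of the translator round trip, restating the docstring examples from query.pyi; the clause shapes shown in the comments are taken from those examples and may differ in detail:

```python
from gllm_datastore.core.filters import filter as F
from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator

translator = ChromaQueryTranslator()

# Mixed filter: metadata equality, content substring match, and id membership.
filters = F.and_(
    F.eq("metadata.status", "active"),
    F.text_contains("content", "python"),
    F.in_("id", ["a", "b"]),
)

components = translator.translate(filters)
# components.where_condition -> metadata where clause (e.g. {"status": "active"})
# components.where_document  -> content clause (e.g. {"$contains": "python"})
# components.id_values       -> ["a", "b"]

# Collapse to ChromaDB kwargs, dropping the None-valued parts.
kwargs = components.to_dict() or {}
```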
gllm_datastore/data_store/chroma/vector.pyi
@@ -0,0 +1,197 @@
from _typeshed import Incomplete
from chromadb import ClientAPI
from gllm_core.schema import Chunk
from gllm_datastore.constants import CHUNK_KEYS as CHUNK_KEYS, DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
from gllm_datastore.core.filters import FilterClause as FilterClause, QueryFilter as QueryFilter, QueryOptions as QueryOptions
from gllm_datastore.data_store.chroma._chroma_import import safe_import_chromadb as safe_import_chromadb
from gllm_datastore.data_store.chroma.query import ChromaCollectionKeys as ChromaCollectionKeys, DEFAULT_NUM_CANDIDATES as DEFAULT_NUM_CANDIDATES, build_chroma_delete_kwargs as build_chroma_delete_kwargs, build_chroma_get_kwargs as build_chroma_get_kwargs
from gllm_datastore.data_store.chroma.query_translator import ChromaQueryTranslator as ChromaQueryTranslator
from gllm_datastore.utils.converter import from_langchain as from_langchain, l2_distance_to_similarity_score as l2_distance_to_similarity_score, to_langchain as to_langchain
from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
from gllm_inference.schema import Vector
from typing import Any

chromadb: Incomplete

class ChromaVectorCapability:
    """ChromaDB implementation of VectorCapability protocol.

    This class provides document CRUD operations and vector search using ChromaDB.

    Attributes:
        collection_name (str): The name of the ChromaDB collection.
        collection (Collection): The ChromaDB collection instance.
        vector_store (Chroma): The langchain Chroma vector store instance.
        num_candidates (int): The maximum number of candidates to consider during search.
    """
    collection_name: Incomplete
    client: Incomplete
    collection: Incomplete
    num_candidates: Incomplete
    vector_store: Incomplete
    def __init__(self, collection_name: str, em_invoker: BaseEMInvoker, client: ClientAPI, num_candidates: int = ...) -> None:
        """Initialize the ChromaDB vector capability.

        Args:
            collection_name (str): The name of the ChromaDB collection.
            em_invoker (BaseEMInvoker): The embedding model to perform vectorization.
            client (ClientAPI): The ChromaDB client instance.
            num_candidates (int, optional): Maximum number of candidates to consider during search.
                Defaults to 50.
        """
    @property
    def em_invoker(self) -> BaseEMInvoker:
        """Returns the EM Invoker instance.

        Returns:
            BaseEMInvoker: The EM Invoker instance.
        """
    async def ensure_index(self) -> None:
        """Ensure ChromaDB collection exists, creating it if necessary.

        This method is idempotent - if the collection already exists, it will return
        the existing collection. The collection is automatically created during initialization,
        but this method can be called explicitly to ensure it exists.

        Raises:
            RuntimeError: If collection creation fails.
        """
    async def create(self, data: Chunk | list[Chunk], **kwargs: Any) -> None:
        """Add chunks to the vector store with automatic embedding generation.

        Args:
            data (Chunk | list[Chunk]): Single chunk or list of chunks to add.
            **kwargs: Backend-specific parameters.
        """
    async def create_from_vector(self, chunk_vectors: list[tuple[Chunk, Vector]], **kwargs: Any) -> None:
        '''Add pre-computed embeddings directly.

        Examples:
        ```python
        await datastore.vector.create_from_vector(chunk_vectors=[
            (Chunk(content="text1", metadata={"source": "source1"}, id="id1"), [0.1, 0.2, 0.3]),
            (Chunk(content="text2", metadata={"source": "source2"}, id="id2"), [0.4, 0.5, 0.6]),
        ])
        ```

        Args:
            chunk_vectors (list[tuple[Chunk, Vector]]): List of tuples containing chunks and their
                corresponding vectors.
            **kwargs: Datastore-specific parameters.
        '''
    async def retrieve(self, query: str, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> list[Chunk]:
        '''Semantic search using text query converted to vector.

        Examples:
        ```python
        from gllm_datastore.core.filters import filter as F

        # Direct FilterClause usage
        await datastore.vector.retrieve(
            query="What is the capital of France?",
            filters=F.eq("metadata.category", "tech")
        )

        # Multiple filters
        filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
        await datastore.vector.retrieve(query="What is the capital of France?", filters=filters)
        ```
        This will retrieve the top 10 chunks by similarity score from the vector store
        that match the query and the filters. The chunks will be sorted by score in descending order.

        Args:
            query (str): Text query to embed and search for.
            filters (FilterClause | QueryFilter | None, optional): Filters to apply to the search.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None.
            options (QueryOptions | None, optional): Options to apply to the search. Defaults to None.

        Returns:
            list[Chunk]: List of chunks ordered by relevance score.
        '''
    async def retrieve_by_vector(self, vector: Vector, filters: FilterClause | QueryFilter | None = None, options: QueryOptions | None = None) -> list[Chunk]:
        '''Direct vector similarity search.

        Examples:
        ```python
        from gllm_datastore.core.filters import filter as F

        # Direct FilterClause usage
        await datastore.vector.retrieve_by_vector(
            vector=[0.1, 0.2, 0.3],
            filters=F.eq("metadata.category", "tech")
        )

        # Multiple filters
        filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
        await datastore.vector.retrieve_by_vector(vector=[0.1, 0.2, 0.3], filters=filters)
        ```
        This will retrieve the top 10 chunks by similarity score from the vector store
        that match the vector and the filters. The chunks will be sorted by score in descending order.

        Args:
            vector (Vector): Query embedding vector.
            filters (FilterClause | QueryFilter | None, optional): Filters to apply to the search.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None.
            options (QueryOptions | None, optional): Options to apply to the search. Defaults to None.

        Returns:
            list[Chunk]: List of chunks ordered by similarity score.
        '''
    async def update(self, update_values: dict[str, Any], filters: FilterClause | QueryFilter | None = None) -> None:
        '''Update existing records in the datastore.

        Examples:
        ```python
        from gllm_datastore.core.filters import filter as F

        # Direct FilterClause usage
        await datastore.vector.update(
            update_values={"metadata": {"status": "published"}},
            filters=F.eq("metadata.category", "tech"),
        )

        # Multiple filters
        filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
        await datastore.vector.update(
            update_values={"metadata": {"status": "published"}},
            filters=filters,
        )
        ```
        This will update the metadata of the chunks that match the filters to "published".

        Args:
            update_values (dict[str, Any]): Values to update.
            filters (FilterClause | QueryFilter | None, optional): Filters to select records to update.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None, in which case no operation is performed (no-op).

        Note:
            ChromaDB doesn't support direct update operations. This method requires
            filters to identify records and will update matching records.
        '''
    async def delete(self, filters: FilterClause | QueryFilter | None = None, **kwargs: Any) -> None:
        '''Delete records from the datastore.

        Examples:
        ```python
        from gllm_datastore.core.filters import filter as F

        # Direct FilterClause usage
        await datastore.vector.delete(filters=F.eq("metadata.category", "tech"))

        # Multiple filters
        filters = F.and_(F.eq("metadata.source", "wikipedia"), F.eq("metadata.category", "tech"))
        await datastore.vector.delete(filters=filters)
        ```
        This will delete all chunks from the vector store that match the filters.

        Args:
            filters (FilterClause | QueryFilter | None, optional): Filters to select records to delete.
                FilterClause objects are automatically converted to QueryFilter internally.
                Defaults to None, in which case no operation is performed (no-op).
            **kwargs: Datastore-specific parameters.
        '''
    async def clear(self) -> None:
        """Clear all records from the datastore."""
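An end-to-end sketch of the capability above, assuming a local chromadb client; the embedding invoker is left abstract since any BaseEMInvoker implementation would do (the concrete invoker class is not specified here):

```python
import asyncio

import chromadb
from gllm_core.schema import Chunk
from gllm_datastore.core.filters import filter as F
from gllm_datastore.data_store.chroma.vector import ChromaVectorCapability


async def main(em_invoker) -> None:  # em_invoker: any BaseEMInvoker implementation (assumed)
    client = chromadb.Client()
    vector = ChromaVectorCapability(
        collection_name="articles",
        em_invoker=em_invoker,
        client=client,
    )
    await vector.ensure_index()

    # Insert chunks; embeddings are generated via the em_invoker.
    await vector.create([
        Chunk(content="Paris is the capital of France.", metadata={"category": "geo"}, id="c1"),
        Chunk(content="Python is a programming language.", metadata={"category": "tech"}, id="c2"),
    ])

    # Semantic search restricted to the "tech" category.
    chunks = await vector.retrieve(
        query="Which language should I learn?",
        filters=F.eq("metadata.category", "tech"),
    )
    print([chunk.id for chunk in chunks])


# asyncio.run(main(my_em_invoker))  # my_em_invoker is an assumed BaseEMInvoker instance
```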
gllm_datastore/data_store/elasticsearch/__init__.pyi
@@ -0,0 +1,5 @@
from gllm_datastore.data_store.elasticsearch.data_store import ElasticsearchDataStore as ElasticsearchDataStore
from gllm_datastore.data_store.elasticsearch.fulltext import ElasticsearchFulltextCapability as ElasticsearchFulltextCapability
from gllm_datastore.data_store.elasticsearch.vector import ElasticsearchVectorCapability as ElasticsearchVectorCapability

__all__ = ['ElasticsearchDataStore', 'ElasticsearchFulltextCapability', 'ElasticsearchVectorCapability']
gllm_datastore/data_store/elasticsearch/data_store.pyi
@@ -0,0 +1,119 @@
from _typeshed import Incomplete
from elasticsearch import AsyncElasticsearch
from gllm_datastore.constants import DEFAULT_REQUEST_TIMEOUT as DEFAULT_REQUEST_TIMEOUT
from gllm_datastore.core.filters.schema import FilterClause as FilterClause, QueryFilter as QueryFilter
from gllm_datastore.data_store.base import BaseDataStore as BaseDataStore, CapabilityType as CapabilityType
from gllm_datastore.data_store.elasticsearch.fulltext import ElasticsearchFulltextCapability as ElasticsearchFulltextCapability
from gllm_datastore.data_store.elasticsearch.query import translate_filter as translate_filter
from gllm_datastore.data_store.elasticsearch.vector import ElasticsearchVectorCapability as ElasticsearchVectorCapability
from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
from langchain_elasticsearch.vectorstores import AsyncRetrievalStrategy
from typing import Any

class ElasticsearchDataStore(BaseDataStore):
    """Elasticsearch data store with multiple capability support.

    Attributes:
        index_name (str): The name of the Elasticsearch index.
        client (AsyncElasticsearch): AsyncElasticsearch client.
    """
    client: Incomplete
    index_name: Incomplete
    def __init__(self, index_name: str, client: AsyncElasticsearch | None = None, url: str | None = None, cloud_id: str | None = None, api_key: str | None = None, username: str | None = None, password: str | None = None, request_timeout: int = ...) -> None:
        """Initialize the Elasticsearch data store.

        Args:
            index_name (str): The name of the Elasticsearch index.
            client (AsyncElasticsearch | None, optional): The Elasticsearch client. Defaults to None.
                If provided, it will be used instead of the url and cloud_id.
            url (str | None, optional): The URL of the Elasticsearch server. Defaults to None.
            cloud_id (str | None, optional): The cloud ID of the Elasticsearch cluster. Defaults to None.
            api_key (str | None, optional): The API key for authentication. Defaults to None.
            username (str | None, optional): The username for authentication. Defaults to None.
            password (str | None, optional): The password for authentication. Defaults to None.
            request_timeout (int, optional): The request timeout. Defaults to DEFAULT_REQUEST_TIMEOUT.
        """
    @property
    def supported_capabilities(self) -> list[str]:
        """Return list of currently supported capabilities.

        Returns:
            list[str]: List of capability names that are supported.
        """
    @property
    def fulltext(self) -> ElasticsearchFulltextCapability:
        """Access fulltext capability if supported.

        This method uses the logic of its parent class to return the fulltext capability handler.
        This method overrides the parent class to return the ElasticsearchFulltextCapability handler for better
        type hinting.

        Returns:
            ElasticsearchFulltextCapability: Fulltext capability handler.

        Raises:
            NotSupportedException: If fulltext capability is not supported.
        """
    @property
    def vector(self) -> ElasticsearchVectorCapability:
        """Access vector capability if supported.

        This method uses the logic of its parent class to return the vector capability handler.
        This method overrides the parent class to return the ElasticsearchVectorCapability handler for better
        type hinting.

        Returns:
            ElasticsearchVectorCapability: Vector capability handler.

        Raises:
            NotSupportedException: If vector capability is not supported.
        """
    def with_fulltext(self, index_name: str | None = None, query_field: str = 'text') -> ElasticsearchDataStore:
        '''Configure fulltext capability and return datastore instance.

        This method uses the logic of its parent class to configure the fulltext capability.
        This method overrides the parent class for better type hinting.

        Args:
            index_name (str | None, optional): The name of the Elasticsearch index. Defaults to None,
                in which case the default class attribute will be utilized.
            query_field (str, optional): The field name to use for text content. Defaults to "text".

        Returns:
            Self: Self for method chaining.
        '''
    def with_vector(self, em_invoker: BaseEMInvoker, index_name: str | None = None, query_field: str = 'text', vector_query_field: str = 'vector', retrieval_strategy: AsyncRetrievalStrategy | None = None, distance_strategy: str | None = None) -> ElasticsearchDataStore:
        '''Configure vector capability and return datastore instance.

        This method uses the logic of its parent class to configure the vector capability.
        This method overrides the parent class for better type hinting.

        Args:
            em_invoker (BaseEMInvoker): The embedding model to perform vectorization.
            index_name (str | None, optional): The name of the Elasticsearch index. Defaults to None,
                in which case the default class attribute will be utilized.
            query_field (str, optional): The field name for text queries. Defaults to "text".
            vector_query_field (str, optional): The field name for vector queries. Defaults to "vector".
            retrieval_strategy (AsyncRetrievalStrategy | None, optional): The retrieval strategy to use.
                Defaults to None, in which case DenseVectorStrategy() is used.
            distance_strategy (str | None, optional): The distance strategy for retrieval. Defaults to None.

        Returns:
            Self: Self for method chaining.
        '''
    @classmethod
    def translate_query_filter(cls, query_filter: FilterClause | QueryFilter | None) -> dict[str, Any] | None:
        """Translate QueryFilter or FilterClause to Elasticsearch native filter syntax.

        This method delegates to the existing translate_filter function in the
        elasticsearch.query module and returns the result as a dictionary.

        Args:
            query_filter (FilterClause | QueryFilter | None): The filter to translate.
                Can be a single FilterClause, a QueryFilter with multiple clauses,
                or None for empty filters.

        Returns:
            dict[str, Any] | None: The translated filter as an Elasticsearch DSL dict.
                Returns None for empty filters.
        """
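A hedged wiring sketch for the data store above; the Elasticsearch URL and the embedding invoker are placeholder assumptions, and the capabilities are configured via the chaining methods defined in this stub:

```python
from gllm_datastore.core.filters import filter as F
from gllm_datastore.data_store.elasticsearch import ElasticsearchDataStore


def build_store(em_invoker) -> ElasticsearchDataStore:  # em_invoker: any BaseEMInvoker implementation (assumed)
    # Assumption: a local Elasticsearch node; cloud_id/api_key auth could be used instead of url.
    return (
        ElasticsearchDataStore(index_name="documents", url="http://localhost:9200")
        .with_fulltext(query_field="text")
        .with_vector(em_invoker=em_invoker, vector_query_field="vector")
    )


# store = build_store(my_em_invoker)  # my_em_invoker is an assumed BaseEMInvoker instance
# store.fulltext and store.vector expose the capability handlers;
# both raise NotSupportedException if the capability was not configured.

# Class-level filter translation to an Elasticsearch DSL dict (None for empty filters).
dsl = ElasticsearchDataStore.translate_query_filter(F.eq("metadata.status", "active"))
```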