pyseekdb 0.1.0.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ """
2
+ Filter builder utilities for metadata and document filtering
3
+
4
+ Supports:
5
+ - Metadata filters: $eq, $lt, $gt, $lte, $gte, $ne, $in, $nin
6
+ - Logical operators: $or, $and, $not
7
+ - Document filters: $contains, $regex
8
+ """
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Tuple
11
+
12
+
13
class FilterBuilder:
    """Build SQL WHERE clauses and hybrid-search filters from filter dictionaries.

    All methods are static. The SQL builders return ``(where_clause, params)``
    where the clause uses ``%s`` placeholders and ``params`` lists the values
    in placeholder order. The search builders return nested ``bool`` /
    ``term`` / ``range`` dictionaries.

    Malformed filters raise ``ValueError`` rather than silently degrading to a
    match-all or syntactically invalid filter: unknown operators, empty
    ``$in`` / ``$nin`` lists, empty ``$and`` / ``$or`` lists in the SQL
    builders, and metadata keys that are unsafe to embed in a JSON path.
    """

    # Comparison operators mapping (filter operator -> SQL operator)
    COMPARISON_OPS = {
        "$eq": "=",
        "$lt": "<",
        "$gt": ">",
        "$lte": "<=",
        "$gte": ">=",
        "$ne": "!="
    }

    # Logical operators
    LOGICAL_OPS = ["$and", "$or", "$not"]

    # Document operators
    DOCUMENT_OPS = ["$contains", "$regex"]

    # Metadata keys are interpolated into a quoted JSON path ('$.<key>') inside
    # the SQL text, so only identifier-like segments, dotted nesting and
    # numeric array indexes are accepted. Quotes, backslashes, '%' and
    # whitespace are rejected because they could break out of the literal or
    # corrupt %s placeholder substitution.
    _SAFE_KEY_PATTERN = re.compile(r"[\w$]+(?:\[\d+\])*(?:\.[\w$]+(?:\[\d+\])*)*")

    # Search-filter range operators (filter operator -> range key)
    _RANGE_OPS = {"$lt": "lt", "$lte": "lte", "$gt": "gt", "$gte": "gte"}

    @staticmethod
    def build_metadata_filter(
        where: Dict[str, Any],
        metadata_column: str = "metadata"
    ) -> Tuple[str, List[Any]]:
        """
        Build WHERE clause for metadata filtering

        Args:
            where: Filter dictionary with operators like $eq, $lt, $gt, $lte, $gte, $ne, $in, $nin, $and, $or, $not
            metadata_column: Name of metadata column (default: "metadata")

        Returns:
            Tuple of (where_clause, params) for parameterized query;
            ("", []) when ``where`` is empty

        Raises:
            ValueError: If the filter contains an unsafe metadata key, an
                unsupported operator, or an empty $in/$nin/$and/$or list

        Examples:
            where = {"age": {"$gte": 18}}
            -> ("JSON_EXTRACT(metadata, '$.age') >= %s", [18])

            where = {"$and": [{"age": {"$gte": 18}}, {"city": "Beijing"}]}
            -> ("(JSON_EXTRACT(metadata, '$.age') >= %s AND JSON_EXTRACT(metadata, '$.city') = %s)", [18, "Beijing"])
        """
        if not where:
            return "", []

        return FilterBuilder._build_condition(where, metadata_column)

    @staticmethod
    def build_document_filter(
        where_document: Dict[str, Any],
        document_column: str = "document"
    ) -> Tuple[str, List[Any]]:
        """
        Build WHERE clause for document filtering

        Args:
            where_document: Filter dictionary with $contains, $regex, $and, $or, $not operators
            document_column: Name of document column (default: "document")

        Returns:
            Tuple of (where_clause, params) for parameterized query;
            ("", []) when ``where_document`` is empty

        Raises:
            ValueError: If the filter contains an unsupported operator or an
                empty $and/$or list

        Examples:
            where_document = {"$contains": "python"}
            -> ("MATCH(document) AGAINST (%s IN NATURAL LANGUAGE MODE)", ["python"])

            where_document = {"$regex": "^hello.*world$"}
            -> ("document REGEXP %s", ["^hello.*world$"])
        """
        if not where_document:
            return "", []

        return FilterBuilder._build_document_condition(where_document, document_column)

    @staticmethod
    def _require_values(op: str, values: Any) -> List[Any]:
        """Return the operand of $in/$nin as a list, rejecting empty or non-list operands."""
        if not isinstance(values, (list, tuple, set, frozenset)) or not values:
            raise ValueError(f"{op} requires a non-empty list of values, got {values!r}")
        return list(values)

    @staticmethod
    def _require_conditions(op: str, conditions: Any) -> List[Any]:
        """Return the operand of $and/$or as a list, rejecting empty or non-list operands."""
        if not isinstance(conditions, (list, tuple)) or not conditions:
            raise ValueError(f"{op} requires a non-empty list of conditions, got {conditions!r}")
        return list(conditions)

    @staticmethod
    def _json_extract(metadata_column: str, key: Any) -> str:
        """Return the JSON_EXTRACT expression for a metadata key after validating the key."""
        if not isinstance(key, str) or not FilterBuilder._SAFE_KEY_PATTERN.fullmatch(key):
            raise ValueError(f"Invalid metadata key: {key!r}")
        return f"JSON_EXTRACT({metadata_column}, '$.{key}')"

    @staticmethod
    def _build_condition(
        condition: Dict[str, Any],
        metadata_column: str,
        params: Optional[List[Any]] = None
    ) -> Tuple[str, List[Any]]:
        """Recursively build a metadata condition from a nested dictionary.

        Args:
            condition: Filter dictionary; every top-level entry is ANDed
            metadata_column: Name of the JSON metadata column
            params: Accumulator shared across recursive calls; values are
                appended in placeholder order

        Returns:
            Tuple of (where_clause, params); the clause is "1=1" when the
            dictionary yields no clause
        """
        if params is None:
            params = []

        clauses = []

        for key, value in condition.items():
            if key in ("$and", "$or"):
                joiner = " AND " if key == "$and" else " OR "
                sub_clauses = []
                for sub_condition in FilterBuilder._require_conditions(key, value):
                    # The recursive call appends to the shared params list.
                    sub_clause, _ = FilterBuilder._build_condition(sub_condition, metadata_column, params)
                    sub_clauses.append(sub_clause)
                clauses.append(f"({joiner.join(sub_clauses)})")

            elif key == "$not":
                sub_clause, _ = FilterBuilder._build_condition(value, metadata_column, params)
                clauses.append(f"NOT ({sub_clause})")

            elif isinstance(value, dict):
                # Handle comparison operators
                json_expr = FilterBuilder._json_extract(metadata_column, key)
                for op, op_value in value.items():
                    if op in FilterBuilder.COMPARISON_OPS:
                        sql_op = FilterBuilder.COMPARISON_OPS[op]
                        clauses.append(f"{json_expr} {sql_op} %s")
                        params.append(op_value)

                    elif op in ("$in", "$nin"):
                        values = FilterBuilder._require_values(op, op_value)
                        placeholders = ", ".join(["%s"] * len(values))
                        negation = "NOT " if op == "$nin" else ""
                        clauses.append(f"{json_expr} {negation}IN ({placeholders})")
                        params.extend(values)

                    else:
                        # Dropping an unknown operator would silently widen
                        # the filter (possibly to match-all), so fail loudly.
                        raise ValueError(f"Unsupported metadata filter operator: {op!r}")

            else:
                # Direct equality comparison
                json_expr = FilterBuilder._json_extract(metadata_column, key)
                clauses.append(f"{json_expr} = %s")
                params.append(value)

        where_clause = " AND ".join(clauses) if clauses else "1=1"
        return where_clause, params

    @staticmethod
    def _build_document_condition(
        condition: Dict[str, Any],
        document_column: str,
        params: Optional[List[Any]] = None
    ) -> Tuple[str, List[Any]]:
        """Recursively build a document filter condition.

        Args:
            condition: Filter dictionary using $contains, $regex, $and, $or, $not
            document_column: Name of the document column
            params: Accumulator shared across recursive calls

        Returns:
            Tuple of (where_clause, params); the clause is "1=1" when the
            dictionary yields no clause
        """
        if params is None:
            params = []

        clauses = []

        for key, value in condition.items():
            if key == "$contains":
                # Full-text search using MATCH AGAINST
                clauses.append(f"MATCH({document_column}) AGAINST (%s IN NATURAL LANGUAGE MODE)")
                params.append(value)

            elif key == "$regex":
                # Regular expression matching
                clauses.append(f"{document_column} REGEXP %s")
                params.append(value)

            elif key in ("$and", "$or"):
                joiner = " AND " if key == "$and" else " OR "
                sub_clauses = []
                for sub_condition in FilterBuilder._require_conditions(key, value):
                    sub_clause, _ = FilterBuilder._build_document_condition(sub_condition, document_column, params)
                    sub_clauses.append(sub_clause)
                clauses.append(f"({joiner.join(sub_clauses)})")

            elif key == "$not":
                sub_clause, _ = FilterBuilder._build_document_condition(value, document_column, params)
                clauses.append(f"NOT ({sub_clause})")

            else:
                raise ValueError(f"Unsupported document filter operator: {key!r}")

        where_clause = " AND ".join(clauses) if clauses else "1=1"
        return where_clause, params

    @staticmethod
    def combine_filters(
        metadata_filter: Tuple[str, List[Any]],
        document_filter: Tuple[str, List[Any]]
    ) -> Tuple[str, List[Any]]:
        """
        Combine metadata and document filters

        Args:
            metadata_filter: Tuple of (where_clause, params) for metadata
            document_filter: Tuple of (where_clause, params) for document

        Returns:
            Combined (where_clause, params): the non-empty clauses joined with
            AND, metadata params first; ("", []) when both clauses are empty
        """
        clauses = []
        all_params = []

        for clause, clause_params in (metadata_filter, document_filter):
            if clause:
                clauses.append(clause)
                all_params.extend(clause_params)

        if not clauses:
            return "", []
        return " AND ".join(clauses), all_params

    @staticmethod
    def build_search_filter(where: Optional[Dict[str, Any]]) -> Optional[List[Dict[str, Any]]]:
        """
        Build search_params filter format from where condition for hybrid search

        Args:
            where: Filter dictionary with operators like $eq, $lt, $gt, $lte, $gte, $ne, $in, $nin, $and, $or, $not

        Returns:
            List of filter conditions in search_params format, or None if where is empty

        Raises:
            ValueError: If the filter contains an unsupported operator or an
                empty $in/$nin list

        Examples:
            where = {"category": {"$eq": "science"}}
            -> [{"term": {"metadata.category": {"value": "science"}}}]

            where = {"$and": [{"page": {"$gte": 5}}, {"page": {"$lte": 10}}]}
            -> [{"bool": {"must": [{"range": {"metadata.page": {"gte": 5}}}, {"range": {"metadata.page": {"lte": 10}}}]}}]

            where = {"status": {"$ne": "draft"}}
            -> [{"bool": {"must_not": [{"term": {"metadata.status": {"value": "draft"}}}]}}]
        """
        if not where:
            return None

        filter_condition = FilterBuilder._build_search_filter_condition(where)
        if filter_condition:
            return [filter_condition]
        return None

    @staticmethod
    def _build_search_field_filters(
        key: str,
        value: Any
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Translate one ``field: value`` entry into (must, must_not) filter lists."""
        field_name = f"metadata.{key}"

        def term(term_value: Any) -> Dict[str, Any]:
            return {"term": {field_name: {"value": term_value}}}

        if not isinstance(value, dict):
            # Direct equality
            return [term(value)], []

        must: List[Dict[str, Any]] = []
        must_not: List[Dict[str, Any]] = []
        range_conditions: Dict[str, Any] = {}
        term_conditions = []
        in_conditions = []
        nin_conditions = []

        for op, op_value in value.items():
            if op == "$eq":
                term_conditions.append(term(op_value))
            elif op == "$ne":
                must_not.append(term(op_value))
            elif op in FilterBuilder._RANGE_OPS:
                range_conditions[FilterBuilder._RANGE_OPS[op]] = op_value
            elif op == "$in":
                in_conditions.extend(term(v) for v in FilterBuilder._require_values(op, op_value))
            elif op == "$nin":
                nin_conditions.extend(term(v) for v in FilterBuilder._require_values(op, op_value))
            else:
                raise ValueError(f"Unsupported metadata filter operator: {op!r}")

        if range_conditions:
            must.append({"range": {field_name: range_conditions}})
        must.extend(term_conditions)
        if len(in_conditions) == 1:
            must.append(in_conditions[0])
        elif in_conditions:
            # Each $in gets its own "should" group nested under "must", so it
            # stays mandatory and is never merged with another field's $in.
            must.append({"bool": {"should": in_conditions}})
        must_not.extend(nin_conditions)
        return must, must_not

    @staticmethod
    def _build_search_filter_condition(condition: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Recursively build search_params filter condition from nested dictionary.

        Every top-level entry (logical operator or field condition) is ANDed,
        matching the SQL builder. Returns None when nothing is produced. Empty
        $and/$or lists contribute nothing here.
        """
        if not condition:
            return None

        must: List[Dict[str, Any]] = []
        must_not: List[Dict[str, Any]] = []

        for key, value in condition.items():
            if key in ("$and", "$or"):
                occur = "must" if key == "$and" else "should"
                sub_filters = []
                for sub_condition in value:
                    sub_filter = FilterBuilder._build_search_filter_condition(sub_condition)
                    if sub_filter:
                        sub_filters.append(sub_filter)
                if sub_filters:
                    must.append({"bool": {occur: sub_filters}})

            elif key == "$not":
                not_filter = FilterBuilder._build_search_filter_condition(value)
                if not_filter:
                    must.append({"bool": {"must_not": [not_filter]}})

            else:
                field_must, field_must_not = FilterBuilder._build_search_field_filters(key, value)
                must.extend(field_must)
                must_not.extend(field_must_not)

        if not must and not must_not:
            return None

        if not must_not:
            # A single positive condition needs no wrapper.
            if len(must) == 1:
                return must[0]
            return {"bool": {"must": must}}

        # Negations always keep their "must_not" wrapper; unwrapping a lone
        # negated term would turn "not equal" into "equal".
        if not must:
            return {"bool": {"must_not": must_not}}
        return {"bool": {"must": must, "must_not": must_not}}
357
+
@@ -0,0 +1,15 @@
1
+ """
2
+ Metadata information for collection fields.
3
+ """
4
class CollectionFieldNames:
    """Column names used by a collection table."""

    # Every collection column, in the order: id, document, embedding, metadata.
    ALL_FIELDS = ["_id", "document", "embedding", "metadata"]

    # Named aliases for the individual columns listed above.
    ID, DOCUMENT, EMBEDDING, METADATA = ALL_FIELDS
11
+
12
class CollectionNames:
    """Helpers that derive storage names from collection names."""

    @staticmethod
    def table_name(collection_name: str) -> str:
        """Return the table name for a collection: ``c$v1$`` followed by the collection name."""
        prefix = "c$v1$"
        return f"{prefix}{collection_name}"
@@ -0,0 +1,122 @@
1
+ """
2
+ Query result wrapper class with JSON serialization support
3
+ """
4
+ import json
5
+ from typing import Any, Dict, List, Optional
6
+
7
+
8
class QueryResultItem:
    """One record returned by a query."""

    def __init__(
        self,
        id: Any,
        document: Optional[str] = None,
        embedding: Optional[List[float]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        distance: Optional[float] = None
    ):
        """
        Store the fields of a single result record.

        Args:
            id: Record ID (kept on the ``_id`` attribute)
            document: Document text (optional)
            embedding: Vector embedding (optional)
            metadata: Metadata dictionary (optional); None is replaced by an empty dict
            distance: Distance/similarity score (optional)
        """
        self._id = id
        self.document = document
        self.embedding = embedding
        self.metadata = {} if metadata is None else metadata
        self.distance = distance

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a dict.

        ``_id`` is always present. ``document``, ``embedding`` and ``distance``
        are included when they are not None; ``metadata`` only when non-empty.
        """
        candidates = (
            ("document", self.document, self.document is not None),
            ("embedding", self.embedding, self.embedding is not None),
            ("metadata", self.metadata, bool(self.metadata)),
            ("distance", self.distance, self.distance is not None),
        )
        payload = {"_id": self._id}
        payload.update({name: field for name, field, keep in candidates if keep})
        return payload

    def to_json(self) -> str:
        """Serialize ``to_dict()`` as indented JSON without ASCII escaping."""
        as_dict = self.to_dict()
        return json.dumps(as_dict, ensure_ascii=False, indent=2)

    def __repr__(self) -> str:
        record_id, score = self._id, self.distance
        return f"QueryResultItem(id={record_id}, distance={score})"
59
+
60
+
61
class QueryResult:
    """Ordered container of QueryResultItem objects."""

    def __init__(self, items: Optional[List[QueryResultItem]] = None):
        """
        Create a result set.

        Args:
            items: List of QueryResultItem objects (optional); a new empty
                list is used when omitted
        """
        self.items = [] if items is None else items

    def add_item(
        self,
        id: Any,
        document: Optional[str] = None,
        embedding: Optional[List[float]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        distance: Optional[float] = None
    ) -> None:
        """
        Build a QueryResultItem from the given fields and append it.

        Args:
            id: Record ID
            document: Document text (optional)
            embedding: Vector embedding (optional)
            metadata: Metadata dictionary (optional)
            distance: Distance/similarity score (optional)
        """
        self.items.append(
            QueryResultItem(
                id=id,
                document=document,
                embedding=embedding,
                metadata=metadata,
                distance=distance
            )
        )

    def to_list(self) -> List[Dict[str, Any]]:
        """Return ``to_dict()`` of every item, in order."""
        rows = []
        for entry in self.items:
            rows.append(entry.to_dict())
        return rows

    def to_json(self) -> str:
        """Serialize ``to_list()`` as indented JSON without ASCII escaping."""
        rows = self.to_list()
        return json.dumps(rows, ensure_ascii=False, indent=2)

    def __len__(self) -> int:
        """Number of items held."""
        return len(self.items)

    def __getitem__(self, index: int) -> QueryResultItem:
        """Return the item stored at ``index``."""
        return self.items[index]

    def __iter__(self):
        """Iterate over the items in order."""
        return iter(self.items)

    def __repr__(self) -> str:
        count = len(self.items)
        return f"QueryResult(items={count})"
122
+
@@ -0,0 +1,48 @@
1
+ """
2
+ Utility functions and classes for SQL string generation and escaping in SeekDB client.
3
+
4
+ Provides helpers to safely stringify values and SQL identifiers for insertion into SQL expressions.
5
+ """
6
+
7
+ from typing import Optional, Union
8
+
9
+
10
+ def _quote_string(value, quote: str):
11
+ return quote + str(value) + quote
12
+
13
+
14
class SqlStringifier:
    """
    Translate values and identifiers into strings for use in SQL text.

    Args:
        quote: Quote character used for string literals (default: single quote)
        identifier: Quote character used for identifiers (default: backtick)
    """

    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def __init__(self, *, quote: str = "'", identifier: str = "`"):
        self._quote = quote
        self._identifier = identifier

    def _quote_literal(self, text: str) -> str:
        """Backslash-escape backslashes and the quote character, then wrap ``text`` in quotes."""
        escaped = text.replace('\\', '\\\\').replace(self._quote, f"\\{self._quote}")
        return f"{self._quote}{escaped}{self._quote}"

    def stringify_value(self, value: Optional[Union[str, int, float, bytes]]):
        """
        Render a Python value as a SQL expression.

        - None -> ``NULL``
        - bytes -> ``UNHEX('<hex>')``
        - str made only of hex digits with even, non-zero length -> ``UNHEX('<value>')``
          (treated as a varbinary ID; NOTE: ordinary strings such as "1234" or
          "cafe" also match this heuristic)
        - other str -> escaped, quoted literal
        - int / float -> ``str(value)``
        - anything else -> ``str(value)`` as an escaped, quoted literal
        """
        if value is None:
            return "NULL"
        if isinstance(value, bytes):
            # For varbinary type, convert bytes to hex string and use UNHEX function
            return f"UNHEX('{value.hex()}')"
        if isinstance(value, str):
            # Hex-looking strings contain no quote or backslash, so they can be
            # embedded without escaping.
            if value and len(value) % 2 == 0 and all(c in self._HEX_DIGITS for c in value):
                return f"UNHEX('{value}')"
            return self._quote_literal(value)
        if isinstance(value, (int, float)):
            return str(value)
        # Escape the fallback too: str() of an arbitrary object may contain the
        # quote character.
        return self._quote_literal(str(value))

    def stringify_id(self, id_name: str):
        """
        Render an identifier wrapped in the identifier quote character.

        An identifier quote character inside the name is doubled so the name
        cannot terminate the quoted identifier early.

        Raises:
            ValueError: If ``id_name`` is None or not a string
        """
        if id_name is None:
            raise ValueError("Identifier shouldn't be null")
        if not isinstance(id_name, str):
            raise ValueError(f"Identifier should be string type, but got {type(id_name).__name__}")
        escaped = id_name.replace(self._identifier, self._identifier * 2)
        return f"{self._identifier}{escaped}{self._identifier}"