pyseekdb 0.1.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyseekdb/__init__.py +90 -0
- pyseekdb/client/__init__.py +324 -0
- pyseekdb/client/admin_client.py +202 -0
- pyseekdb/client/base_connection.py +82 -0
- pyseekdb/client/client_base.py +1921 -0
- pyseekdb/client/client_oceanbase_server.py +258 -0
- pyseekdb/client/client_seekdb_embedded.py +324 -0
- pyseekdb/client/client_seekdb_server.py +226 -0
- pyseekdb/client/collection.py +485 -0
- pyseekdb/client/database.py +55 -0
- pyseekdb/client/filters.py +357 -0
- pyseekdb/client/meta_info.py +15 -0
- pyseekdb/client/query_result.py +122 -0
- pyseekdb/client/sql_utils.py +48 -0
- pyseekdb/examples/comprehensive_example.py +412 -0
- pyseekdb/examples/simple_example.py +113 -0
- pyseekdb/tests/__init__.py +0 -0
- pyseekdb/tests/test_admin_database_management.py +307 -0
- pyseekdb/tests/test_client_creation.py +425 -0
- pyseekdb/tests/test_collection_dml.py +652 -0
- pyseekdb/tests/test_collection_get.py +550 -0
- pyseekdb/tests/test_collection_hybrid_search.py +1126 -0
- pyseekdb/tests/test_collection_query.py +428 -0
- pyseekdb-0.1.0.dev3.dist-info/LICENSE +202 -0
- pyseekdb-0.1.0.dev3.dist-info/METADATA +856 -0
- pyseekdb-0.1.0.dev3.dist-info/RECORD +27 -0
- pyseekdb-0.1.0.dev3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Collection hybrid search tests - testing collection.hybrid_search() interface for all three modes
|
|
3
|
+
Supports configuring connection parameters via environment variables
|
|
4
|
+
"""
|
|
5
|
+
import pytest
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
import time
|
|
9
|
+
import json
|
|
10
|
+
import uuid
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# Add project path: the directory three levels above this file is placed at the
# front of sys.path (presumably so the in-tree client package is imported in
# preference to an installed copy — TODO confirm).
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
|
|
16
|
+
|
|
17
|
+
import seekdbclient
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ==================== Environment Variable Configuration ====================
# Each constant is read from the environment variable of the same name and
# falls back to the literal default shown here when that variable is unset.

# Embedded mode
SEEKDB_PATH = os.getenv('SEEKDB_PATH', os.path.join(project_root, "seekdb_store"))
SEEKDB_DATABASE = os.getenv('SEEKDB_DATABASE', 'test')

# Server mode
SERVER_HOST = os.getenv('SERVER_HOST', 'localhost')
SERVER_PORT = int(os.getenv('SERVER_PORT', '2881'))  # ports are parsed to int
SERVER_DATABASE = os.getenv('SERVER_DATABASE', 'test')
SERVER_USER = os.getenv('SERVER_USER', 'root')
SERVER_PASSWORD = os.getenv('SERVER_PASSWORD', '')

# OceanBase mode
OB_HOST = os.getenv('OB_HOST', 'localhost')
OB_PORT = int(os.getenv('OB_PORT', '11202'))  # ports are parsed to int
OB_TENANT = os.getenv('OB_TENANT', 'mysql')
OB_DATABASE = os.getenv('OB_DATABASE', 'test')
OB_USER = os.getenv('OB_USER', 'root')
OB_PASSWORD = os.getenv('OB_PASSWORD', '')
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class TestCollectionHybridSearch:
|
|
42
|
+
"""Test collection.hybrid_search() interface for all three modes"""
|
|
43
|
+
|
|
44
|
+
def _create_test_collection(self, client, collection_name: str, dimension: int = 3):
|
|
45
|
+
"""Helper method to create a test collection"""
|
|
46
|
+
# Use client.create_collection to create the collection
|
|
47
|
+
collection = client.create_collection(
|
|
48
|
+
name=collection_name,
|
|
49
|
+
dimension=dimension
|
|
50
|
+
)
|
|
51
|
+
return collection
|
|
52
|
+
|
|
53
|
+
def _insert_test_data(self, client, collection_name: str):
|
|
54
|
+
"""Helper method to insert test data via SQL"""
|
|
55
|
+
table_name = f"c$v1${collection_name}"
|
|
56
|
+
|
|
57
|
+
# Insert test data with vectors, documents, and metadata
|
|
58
|
+
# Data designed for hybrid search testing
|
|
59
|
+
test_data = [
|
|
60
|
+
{
|
|
61
|
+
"document": "Machine learning is a subset of artificial intelligence",
|
|
62
|
+
"embedding": [1.0, 2.0, 3.0],
|
|
63
|
+
"metadata": {"category": "AI", "page": 1, "score": 95, "tag": "ml"}
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"document": "Python programming language is widely used in data science",
|
|
67
|
+
"embedding": [2.0, 3.0, 4.0],
|
|
68
|
+
"metadata": {"category": "Programming", "page": 2, "score": 88, "tag": "python"}
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"document": "Deep learning algorithms for neural networks",
|
|
72
|
+
"embedding": [1.1, 2.1, 3.1],
|
|
73
|
+
"metadata": {"category": "AI", "page": 3, "score": 92, "tag": "ml"}
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"document": "Data science with Python and machine learning",
|
|
77
|
+
"embedding": [2.1, 3.1, 4.1],
|
|
78
|
+
"metadata": {"category": "Data Science", "page": 4, "score": 90, "tag": "python"}
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"document": "Introduction to artificial intelligence and neural networks",
|
|
82
|
+
"embedding": [1.2, 2.2, 3.2],
|
|
83
|
+
"metadata": {"category": "AI", "page": 5, "score": 85, "tag": "neural"}
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"document": "Advanced machine learning techniques and algorithms",
|
|
87
|
+
"embedding": [1.3, 2.3, 3.3],
|
|
88
|
+
"metadata": {"category": "AI", "page": 6, "score": 93, "tag": "ml"}
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"document": "Python tutorial for beginners in programming",
|
|
92
|
+
"embedding": [2.2, 3.2, 4.2],
|
|
93
|
+
"metadata": {"category": "Programming", "page": 7, "score": 87, "tag": "python"}
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"document": "Natural language processing with machine learning",
|
|
97
|
+
"embedding": [1.4, 2.4, 3.4],
|
|
98
|
+
"metadata": {"category": "AI", "page": 8, "score": 91, "tag": "nlp"}
|
|
99
|
+
}
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
for data in test_data:
|
|
103
|
+
# Generate UUID for _id (use string format directly)
|
|
104
|
+
id_str = str(uuid.uuid4())
|
|
105
|
+
# Escape single quotes in ID
|
|
106
|
+
id_str_escaped = id_str.replace("'", "''")
|
|
107
|
+
|
|
108
|
+
# Convert vector to string format: [1.0,2.0,3.0]
|
|
109
|
+
vector_str = "[" + ",".join(map(str, data["embedding"])) + "]"
|
|
110
|
+
# Convert metadata to JSON string
|
|
111
|
+
metadata_str = json.dumps(data["metadata"], ensure_ascii=False).replace("'", "\\'")
|
|
112
|
+
# Escape single quotes in document
|
|
113
|
+
document_str = data["document"].replace("'", "\\'")
|
|
114
|
+
|
|
115
|
+
# Use CAST to convert string to binary for varbinary(512) field
|
|
116
|
+
sql = f"""INSERT INTO `{table_name}` (_id, document, embedding, metadata)
|
|
117
|
+
VALUES (CAST('{id_str_escaped}' AS BINARY), '{document_str}', '{vector_str}', '{metadata_str}')"""
|
|
118
|
+
client._server.execute(sql)
|
|
119
|
+
|
|
120
|
+
print(f" Inserted {len(test_data)} test records")
|
|
121
|
+
|
|
122
|
+
def _cleanup_collection(self, client, collection_name: str):
|
|
123
|
+
"""Helper method to cleanup test collection"""
|
|
124
|
+
table_name = f"c$v1${collection_name}"
|
|
125
|
+
try:
|
|
126
|
+
client._server.execute(f"DROP TABLE IF EXISTS `{table_name}`")
|
|
127
|
+
print(f" Cleaned up test table: {table_name}")
|
|
128
|
+
except Exception as cleanup_error:
|
|
129
|
+
print(f" Warning: Failed to cleanup test table: {cleanup_error}")
|
|
130
|
+
|
|
131
|
+
def test_oceanbase_hybrid_search_full_text_only(self):
    """Exercise hybrid_search on OceanBase with only the full-text ``query`` part.

    The test is skipped when the ``SELECT 1`` probe fails. Otherwise it creates
    and fills a temporary collection, searches documents containing
    "machine learning", checks the returned ids/documents/metadatas, and drops
    the backing table in ``finally``.
    """
    ob_client = seekdbclient.OBClient(
        host=OB_HOST,
        port=OB_PORT,
        tenant=OB_TENANT,
        database=OB_DATABASE,
        user=OB_USER,
        password=OB_PASSWORD,
    )

    assert ob_client is not None
    assert hasattr(ob_client, '_server')
    assert isinstance(ob_client._server, seekdbclient.OceanBaseServerClient)

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = ob_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(ob_client, name, dimension=3)

    try:
        self._insert_test_data(ob_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test 1: full-text search only
        text_query = {"where_document": {"$contains": "machine learning"}}
        print("\n✅ Testing hybrid_search with full-text search only")
        results = coll.hybrid_search(
            query=text_query,
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert results is not None
        for key in ("ids", "documents", "metadatas"):
            assert key in results
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results")

        # Every non-empty document must mention at least one of the search terms.
        for text in results["documents"]:
            if not text:
                continue
            lowered = text.lower()
            assert "machine" in lowered or "learning" in lowered
    finally:
        self._cleanup_collection(ob_client, name)
|
|
192
|
+
|
|
193
|
+
def test_oceanbase_hybrid_search_vector_only(self):
    """Exercise hybrid_search on OceanBase with only the vector ``knn`` part.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a knn search around ``[1.0, 2.0, 3.0]`` is issued,
    and the returned distances are sanity-checked. The backing table is dropped
    in ``finally``.
    """
    ob_client = seekdbclient.OBClient(
        host=OB_HOST,
        port=OB_PORT,
        tenant=OB_TENANT,
        database=OB_DATABASE,
        user=OB_USER,
        password=OB_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = ob_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(ob_client, name, dimension=3)

    try:
        self._insert_test_data(ob_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test 2: vector search only
        knn_part = {"query_embeddings": [1.0, 2.0, 3.0], "n_results": 5}
        print("\n✅ Testing hybrid_search with vector search only")
        results = coll.hybrid_search(
            knn=knn_part,
            n_results=5,
            include=["documents", "metadatas", "embeddings"],
        )

        assert results is not None
        assert "ids" in results
        assert "distances" in results
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results")

        # APPROXIMATE ordering may not be perfectly sorted, so the order is not
        # checked: distances only have to be non-negative, and the best one
        # has to be reasonably small.
        dists = results["distances"]
        assert len(dists) > 0
        for dist in dists:
            assert dist >= 0, f"Distance should be non-negative, got {dist}"
        closest = min(dists)
        assert closest < 10.0, f"At least one distance should be reasonable, got min={closest}"
    finally:
        self._cleanup_collection(ob_client, name)
|
|
255
|
+
|
|
256
|
+
def test_oceanbase_hybrid_search_combined(self):
    """Exercise hybrid_search on OceanBase with full-text and vector parts together.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a search combining ``query``, ``knn`` and an ``rrf``
    rank specification is issued, and a non-empty result is expected. The
    backing table is dropped in ``finally``.
    """
    ob_client = seekdbclient.OBClient(
        host=OB_HOST,
        port=OB_PORT,
        tenant=OB_TENANT,
        database=OB_DATABASE,
        user=OB_USER,
        password=OB_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = ob_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(ob_client, name, dimension=3)

    try:
        self._insert_test_data(ob_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test 3: combined full-text and vector search
        text_part = {
            "where_document": {"$contains": "machine learning"},
            "n_results": 10,
        }
        knn_part = {"query_embeddings": [1.0, 2.0, 3.0], "n_results": 10}
        rank_part = {"rrf": {"rank_window_size": 60, "rank_constant": 60}}

        print("\n✅ Testing hybrid_search with both full-text and vector search")
        results = coll.hybrid_search(
            query=text_part,
            knn=knn_part,
            rank=rank_part,
            n_results=5,
            include=["documents", "metadatas", "embeddings"],
        )

        assert results is not None
        assert "ids" in results
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results after RRF ranking")
    finally:
        self._cleanup_collection(ob_client, name)
|
|
317
|
+
|
|
318
|
+
def test_oceanbase_hybrid_search_with_metadata_filter(self):
    """Exercise hybrid_search on OceanBase with metadata ``where`` filters.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a search whose ``query`` and ``knn`` parts both
    carry ``$and`` metadata filters is issued, and every returned metadata is
    expected to have category "AI". The backing table is dropped in ``finally``.
    """
    ob_client = seekdbclient.OBClient(
        host=OB_HOST,
        port=OB_PORT,
        tenant=OB_TENANT,
        database=OB_DATABASE,
        user=OB_USER,
        password=OB_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = ob_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(ob_client, name, dimension=3)

    try:
        self._insert_test_data(ob_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test 4: hybrid search with metadata filter
        text_part = {
            "where_document": {"$contains": "machine"},
            "where": {
                "$and": [
                    {"category": {"$eq": "AI"}},
                    {"page": {"$gte": 1}},
                    {"page": {"$lte": 5}},
                ]
            },
            "n_results": 10,
        }
        knn_part = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "where": {
                "$and": [
                    {"category": {"$eq": "AI"}},
                    {"score": {"$gte": 90}},
                ]
            },
            "n_results": 10,
        }

        print("\n✅ Testing hybrid_search with metadata filter")
        results = coll.hybrid_search(
            query=text_part,
            knn=knn_part,
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert results is not None
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results with metadata filters")

        # Results may originate from either the full-text or the vector part,
        # so only the condition shared by both filters (category "AI") is
        # checked; the page filter may not hold for every merged result.
        for meta in results["metadatas"]:
            if not meta:
                continue
            assert meta.get("category") == "AI"
    finally:
        self._cleanup_collection(ob_client, name)
|
|
395
|
+
|
|
396
|
+
def test_oceanbase_hybrid_search_with_logical_operators(self):
    """Exercise hybrid_search on OceanBase with ``$and`` / ``$or`` / ``$in`` filters.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a search using logical operators in both the
    document and metadata filters is issued, and every returned ``tag`` is
    expected to be "ml" or "python". The backing table is dropped in ``finally``.
    """
    ob_client = seekdbclient.OBClient(
        host=OB_HOST,
        port=OB_PORT,
        tenant=OB_TENANT,
        database=OB_DATABASE,
        user=OB_USER,
        password=OB_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = ob_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(ob_client, name, dimension=3)

    try:
        self._insert_test_data(ob_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test 5: hybrid search with logical operators ($or, $in)
        text_part = {
            "where_document": {
                "$and": [
                    {"$contains": "machine"},
                    {"$contains": "learning"},
                ]
            },
            "where": {
                "$or": [
                    {"tag": {"$eq": "ml"}},
                    {"tag": {"$eq": "python"}},
                ]
            },
            "n_results": 10,
        }
        knn_part = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "where": {"tag": {"$in": ["ml", "python"]}},
            "n_results": 10,
        }

        print("\n✅ Testing hybrid_search with logical operators")
        results = coll.hybrid_search(
            query=text_part,
            knn=knn_part,
            rank={"rrf": {}},
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert results is not None
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results with logical operators")

        # Only metadata entries that actually carry a tag are checked.
        for meta in results["metadatas"]:
            if not meta or "tag" not in meta:
                continue
            assert meta["tag"] in ["ml", "python"]
    finally:
        self._cleanup_collection(ob_client, name)
|
|
468
|
+
|
|
469
|
+
def test_seekdb_server_hybrid_search_full_text_only(self):
    """Exercise hybrid_search on SeekdbServer with only the full-text ``query`` part.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, documents containing "machine learning" are
    searched, the returned ids/documents/metadatas are checked, and the
    backing table is dropped in ``finally``.
    """
    server_client = seekdbclient.Client(
        host=SERVER_HOST,
        port=SERVER_PORT,
        database=SERVER_DATABASE,
        user=SERVER_USER,
        password=SERVER_PASSWORD,
    )

    assert server_client is not None
    assert hasattr(server_client, '_server')
    assert isinstance(server_client._server, seekdbclient.SeekdbServerClient)

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = server_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(server_client, name, dimension=3)

    try:
        self._insert_test_data(server_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test 1: full-text search only
        text_query = {"where_document": {"$contains": "machine learning"}}
        print("\n✅ Testing hybrid_search with full-text search only (SeekdbServer)")
        results = coll.hybrid_search(
            query=text_query,
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert results is not None
        for key in ("ids", "documents", "metadatas"):
            assert key in results
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results")

        # Every non-empty document must mention at least one of the search terms.
        for text in results["documents"]:
            if not text:
                continue
            lowered = text.lower()
            assert "machine" in lowered or "learning" in lowered
    finally:
        self._cleanup_collection(server_client, name)
|
|
529
|
+
|
|
530
|
+
def test_seekdb_server_hybrid_search_combined(self):
    """Exercise hybrid_search on SeekdbServer with full-text and vector parts together.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a search combining ``query``, ``knn`` and an ``rrf``
    rank specification is issued, and a non-empty result is expected. The
    backing table is dropped in ``finally``.
    """
    server_client = seekdbclient.Client(
        host=SERVER_HOST,
        port=SERVER_PORT,
        database=SERVER_DATABASE,
        user=SERVER_USER,
        password=SERVER_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = server_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(server_client, name, dimension=3)

    try:
        self._insert_test_data(server_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test: combined full-text and vector search
        text_part = {
            "where_document": {"$contains": "machine learning"},
            "n_results": 10,
        }
        knn_part = {"query_embeddings": [1.0, 2.0, 3.0], "n_results": 10}
        rank_part = {"rrf": {"rank_window_size": 60, "rank_constant": 60}}

        print("\n✅ Testing hybrid_search with both full-text and vector search (SeekdbServer)")
        results = coll.hybrid_search(
            query=text_part,
            knn=knn_part,
            rank=rank_part,
            n_results=5,
            include=["documents", "metadatas", "embeddings"],
        )

        assert results is not None
        assert "ids" in results
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results after RRF ranking")
    finally:
        self._cleanup_collection(server_client, name)
|
|
590
|
+
|
|
591
|
+
def test_seekdb_server_hybrid_search_vector_only(self):
    """Exercise hybrid_search on SeekdbServer with only the vector ``knn`` part.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a knn search around ``[1.0, 2.0, 3.0]`` is issued,
    and the returned distances are sanity-checked. The backing table is dropped
    in ``finally``.
    """
    server_client = seekdbclient.Client(
        host=SERVER_HOST,
        port=SERVER_PORT,
        database=SERVER_DATABASE,
        user=SERVER_USER,
        password=SERVER_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = server_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(server_client, name, dimension=3)

    try:
        self._insert_test_data(server_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test: vector search only
        knn_part = {"query_embeddings": [1.0, 2.0, 3.0], "n_results": 5}
        print("\n✅ Testing hybrid_search with vector search only (SeekdbServer)")
        results = coll.hybrid_search(
            knn=knn_part,
            n_results=5,
            include=["documents", "metadatas", "embeddings"],
        )

        assert results is not None
        assert "ids" in results
        assert "distances" in results
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results")

        # Distances only have to be non-negative, with the best one reasonably small.
        dists = results["distances"]
        assert len(dists) > 0
        for dist in dists:
            assert dist >= 0, f"Distance should be non-negative, got {dist}"
        closest = min(dists)
        assert closest < 10.0, f"At least one distance should be reasonable, got min={closest}"
    finally:
        self._cleanup_collection(server_client, name)
|
|
648
|
+
|
|
649
|
+
def test_seekdb_server_hybrid_search_with_metadata_filter(self):
    """Exercise hybrid_search on SeekdbServer with metadata ``where`` filters.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a search whose ``query`` and ``knn`` parts both
    carry ``$and`` metadata filters is issued, and every returned metadata is
    expected to have category "AI". The backing table is dropped in ``finally``.
    """
    server_client = seekdbclient.Client(
        host=SERVER_HOST,
        port=SERVER_PORT,
        database=SERVER_DATABASE,
        user=SERVER_USER,
        password=SERVER_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = server_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(server_client, name, dimension=3)

    try:
        self._insert_test_data(server_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test: hybrid search with metadata filter
        text_part = {
            "where_document": {"$contains": "machine"},
            "where": {
                "$and": [
                    {"category": {"$eq": "AI"}},
                    {"page": {"$gte": 1}},
                    {"page": {"$lte": 5}},
                ]
            },
            "n_results": 10,
        }
        knn_part = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "where": {
                "$and": [
                    {"category": {"$eq": "AI"}},
                    {"score": {"$gte": 90}},
                ]
            },
            "n_results": 10,
        }

        print("\n✅ Testing hybrid_search with metadata filter (SeekdbServer)")
        results = coll.hybrid_search(
            query=text_part,
            knn=knn_part,
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert results is not None
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results with metadata filters")

        # Category "AI" is required by both filters, so it must hold everywhere.
        for meta in results["metadatas"]:
            if not meta:
                continue
            assert meta.get("category") == "AI"
    finally:
        self._cleanup_collection(server_client, name)
|
|
720
|
+
|
|
721
|
+
def test_seekdb_server_hybrid_search_with_logical_operators(self):
    """Exercise hybrid_search on SeekdbServer with ``$and`` / ``$or`` / ``$in`` filters.

    Skipped when the ``SELECT 1`` probe fails. Otherwise a temporary collection
    is created and filled, a search using logical operators in both the
    document and metadata filters is issued, and every returned ``tag`` is
    expected to be "ml" or "python". The backing table is dropped in ``finally``.
    """
    server_client = seekdbclient.Client(
        host=SERVER_HOST,
        port=SERVER_PORT,
        database=SERVER_DATABASE,
        user=SERVER_USER,
        password=SERVER_PASSWORD,
    )

    # Connectivity probe: any exception raised here turns into a skip.
    try:
        probe = server_client._server.execute("SELECT 1 as test")
        assert probe is not None
    except Exception as exc:
        pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {exc}")

    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(server_client, name, dimension=3)

    try:
        self._insert_test_data(server_client, name)

        # Give the indexes a moment to become ready.
        time.sleep(1)

        # Test: hybrid search with logical operators
        text_part = {
            "where_document": {
                "$and": [
                    {"$contains": "machine"},
                    {"$contains": "learning"},
                ]
            },
            "where": {
                "$or": [
                    {"tag": {"$eq": "ml"}},
                    {"tag": {"$eq": "python"}},
                ]
            },
            "n_results": 10,
        }
        knn_part = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "where": {"tag": {"$in": ["ml", "python"]}},
            "n_results": 10,
        }

        print("\n✅ Testing hybrid_search with logical operators (SeekdbServer)")
        results = coll.hybrid_search(
            query=text_part,
            knn=knn_part,
            rank={"rrf": {}},
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert results is not None
        hit_count = len(results["ids"])
        assert hit_count > 0
        print(f" Found {hit_count} results with logical operators")

        # Only metadata entries that actually carry a tag are checked.
        for meta in results["metadatas"]:
            if not meta or "tag" not in meta:
                continue
            assert meta["tag"] in ["ml", "python"]
    finally:
        self._cleanup_collection(server_client, name)
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def test_embedded_hybrid_search_full_text_only(self):
    """Run hybrid_search with only a full-text ``query`` on an embedded client.

    The test is skipped when ``SEEKDB_PATH`` does not exist on disk or when
    the ``seekdb`` package cannot be imported. Once the scratch collection
    has been created, ``self._cleanup_collection`` is always invoked from a
    ``finally`` clause.
    """
    if not os.path.exists(SEEKDB_PATH):
        reason = (
            f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
            f"Set SEEKDB_PATH environment variable to run this test"
        )
        pytest.skip(reason)

    # The import is only an availability probe for the embedded package.
    try:
        import seekdb
    except ImportError:
        pytest.skip("SeekDB embedded package is not installed")

    # Build the embedded client and check what it wraps.
    client = seekdbclient.Client(path=SEEKDB_PATH, database=SEEKDB_DATABASE)
    assert client is not None
    assert hasattr(client, '_server')
    assert isinstance(client._server, seekdbclient.SeekdbEmbeddedClient)

    # The scratch collection is named after the current Unix time in seconds.
    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(client, name, dimension=3)

    try:
        self._insert_test_data(client, name)

        # Pause briefly so the indexes have a chance to be ready.
        time.sleep(1)

        print(f"\n✅ Testing hybrid_search with full-text search only (SeekdbEmbedded)")
        text_query = {
            "where_document": {
                "$contains": "machine learning",
            },
        }
        hits = coll.hybrid_search(
            query=text_query,
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert hits is not None
        for key in ("ids", "documents", "metadatas"):
            assert key in hits
        id_count = len(hits["ids"])
        assert id_count > 0
        print(f" Found {id_count} results")

        # Each non-empty document must mention at least one of the query words.
        for doc in hits["documents"]:
            if not doc:
                continue
            assert "machine" in doc.lower() or "learning" in doc.lower()

    finally:
        self._cleanup_collection(client, name)
|
|
856
|
+
|
|
857
|
+
def test_embedded_hybrid_search_vector_only(self):
    """Run hybrid_search with only a vector ``knn`` clause on an embedded client.

    The test is skipped when ``SEEKDB_PATH`` does not exist on disk or when
    the ``seekdb`` package cannot be imported. Once the scratch collection
    has been created, ``self._cleanup_collection`` is always invoked from a
    ``finally`` clause.
    """
    if not os.path.exists(SEEKDB_PATH):
        reason = (
            f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
            f"Set SEEKDB_PATH environment variable to run this test"
        )
        pytest.skip(reason)

    # The import is only an availability probe for the embedded package.
    try:
        import seekdb
    except ImportError:
        pytest.skip("SeekDB embedded package is not installed")

    client = seekdbclient.Client(path=SEEKDB_PATH, database=SEEKDB_DATABASE)

    # The scratch collection is named after the current Unix time in seconds.
    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(client, name, dimension=3)

    try:
        self._insert_test_data(client, name)

        # Pause briefly so the indexes have a chance to be ready.
        time.sleep(1)

        print(f"\n✅ Testing hybrid_search with vector search only (SeekdbEmbedded)")
        vector_query = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "n_results": 5,
        }
        hits = coll.hybrid_search(
            knn=vector_query,
            n_results=5,
            include=["documents", "metadatas", "embeddings"],
        )

        assert hits is not None
        for key in ("ids", "distances"):
            assert key in hits
        id_count = len(hits["ids"])
        assert id_count > 0
        print(f" Found {id_count} results")

        # Sanity-check the distances: none negative, and the smallest one
        # below the 10.0 threshold used by this test.
        distances = hits["distances"]
        assert len(distances) > 0
        for distance in distances:
            assert distance >= 0, f"Distance should be non-negative, got {distance}"
        closest = min(distances)
        assert closest < 10.0, f"At least one distance should be reasonable, got min={closest}"

    finally:
        self._cleanup_collection(client, name)
|
|
916
|
+
|
|
917
|
+
def test_embedded_hybrid_search_combined(self):
    """Run hybrid_search with full-text, vector and RRF rank clauses together.

    The test is skipped when ``SEEKDB_PATH`` does not exist on disk or when
    the ``seekdb`` package cannot be imported. Once the scratch collection
    has been created, ``self._cleanup_collection`` is always invoked from a
    ``finally`` clause.
    """
    if not os.path.exists(SEEKDB_PATH):
        reason = (
            f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
            f"Set SEEKDB_PATH environment variable to run this test"
        )
        pytest.skip(reason)

    # The import is only an availability probe for the embedded package.
    try:
        import seekdb
    except ImportError:
        pytest.skip("SeekDB embedded package is not installed")

    client = seekdbclient.Client(path=SEEKDB_PATH, database=SEEKDB_DATABASE)

    # The scratch collection is named after the current Unix time in seconds.
    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(client, name, dimension=3)

    try:
        self._insert_test_data(client, name)

        # Pause briefly so the indexes have a chance to be ready.
        time.sleep(1)

        print(f"\n✅ Testing hybrid_search with both full-text and vector search (SeekdbEmbedded)")
        text_query = {
            "where_document": {
                "$contains": "machine learning",
            },
            "n_results": 10,
        }
        vector_query = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "n_results": 10,
        }
        rrf_rank = {
            "rrf": {
                "rank_window_size": 60,
                "rank_constant": 60,
            },
        }
        hits = coll.hybrid_search(
            query=text_query,
            knn=vector_query,
            rank=rrf_rank,
            n_results=5,
            include=["documents", "metadatas", "embeddings"],
        )

        assert hits is not None
        assert "ids" in hits
        id_count = len(hits["ids"])
        assert id_count > 0
        print(f" Found {id_count} results after RRF ranking")

    finally:
        self._cleanup_collection(client, name)
|
|
979
|
+
|
|
980
|
+
def test_embedded_hybrid_search_with_metadata_filter(self):
    """Run hybrid_search with metadata ``where`` filters on an embedded client.

    Both the ``query`` and ``knn`` clauses carry an ``$and`` filter that pins
    ``category`` to ``"AI"``; every non-empty metadata entry returned is then
    checked for that category.

    The test is skipped when ``SEEKDB_PATH`` does not exist on disk or when
    the ``seekdb`` package cannot be imported. Once the scratch collection
    has been created, ``self._cleanup_collection`` is always invoked from a
    ``finally`` clause.
    """
    if not os.path.exists(SEEKDB_PATH):
        reason = (
            f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
            f"Set SEEKDB_PATH environment variable to run this test"
        )
        pytest.skip(reason)

    # The import is only an availability probe for the embedded package.
    try:
        import seekdb
    except ImportError:
        pytest.skip("SeekDB embedded package is not installed")

    client = seekdbclient.Client(path=SEEKDB_PATH, database=SEEKDB_DATABASE)

    # The scratch collection is named after the current Unix time in seconds.
    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(client, name, dimension=3)

    try:
        self._insert_test_data(client, name)

        # Pause briefly so the indexes have a chance to be ready.
        time.sleep(1)

        print(f"\n✅ Testing hybrid_search with metadata filter (SeekdbEmbedded)")
        # Full-text side: category AI with page between 1 and 5 inclusive.
        text_filter = {
            "$and": [
                {"category": {"$eq": "AI"}},
                {"page": {"$gte": 1}},
                {"page": {"$lte": 5}},
            ],
        }
        text_query = {
            "where_document": {
                "$contains": "machine",
            },
            "where": text_filter,
            "n_results": 10,
        }
        # Vector side: category AI with score of at least 90.
        vector_filter = {
            "$and": [
                {"category": {"$eq": "AI"}},
                {"score": {"$gte": 90}},
            ],
        }
        vector_query = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "where": vector_filter,
            "n_results": 10,
        }
        hits = coll.hybrid_search(
            query=text_query,
            knn=vector_query,
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert hits is not None
        id_count = len(hits["ids"])
        assert id_count > 0
        print(f" Found {id_count} results with metadata filters")

        # Every non-empty metadata entry must satisfy the category filter.
        for meta in hits["metadatas"]:
            if not meta:
                continue
            assert meta.get("category") == "AI"

    finally:
        self._cleanup_collection(client, name)
|
|
1053
|
+
|
|
1054
|
+
def test_embedded_hybrid_search_with_logical_operators(self):
    """Run hybrid_search with ``$and`` / ``$or`` / ``$in`` filters on an embedded client.

    The full-text clause combines two ``$contains`` terms with ``$and`` and
    restricts ``tag`` with ``$or``; the vector clause restricts ``tag`` with
    ``$in``. Returned metadata entries that carry a ``tag`` are checked to
    be ``"ml"`` or ``"python"``.

    The test is skipped when ``SEEKDB_PATH`` does not exist on disk or when
    the ``seekdb`` package cannot be imported. Once the scratch collection
    has been created, ``self._cleanup_collection`` is always invoked from a
    ``finally`` clause.
    """
    if not os.path.exists(SEEKDB_PATH):
        reason = (
            f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
            f"Set SEEKDB_PATH environment variable to run this test"
        )
        pytest.skip(reason)

    # The import is only an availability probe for the embedded package.
    try:
        import seekdb
    except ImportError:
        pytest.skip("SeekDB embedded package is not installed")

    client = seekdbclient.Client(path=SEEKDB_PATH, database=SEEKDB_DATABASE)

    # The scratch collection is named after the current Unix time in seconds.
    name = f"test_hybrid_search_{int(time.time())}"
    coll = self._create_test_collection(client, name, dimension=3)

    try:
        self._insert_test_data(client, name)

        # Pause briefly so the indexes have a chance to be ready.
        time.sleep(1)

        print(f"\n✅ Testing hybrid_search with logical operators (SeekdbEmbedded)")
        document_filter = {
            "$and": [
                {"$contains": "machine"},
                {"$contains": "learning"},
            ],
        }
        tag_filter = {
            "$or": [
                {"tag": {"$eq": "ml"}},
                {"tag": {"$eq": "python"}},
            ],
        }
        text_query = {
            "where_document": document_filter,
            "where": tag_filter,
            "n_results": 10,
        }
        vector_query = {
            "query_embeddings": [1.0, 2.0, 3.0],
            "where": {
                "tag": {"$in": ["ml", "python"]},
            },
            "n_results": 10,
        }
        hits = coll.hybrid_search(
            query=text_query,
            knn=vector_query,
            rank={"rrf": {}},
            n_results=5,
            include=["documents", "metadatas"],
        )

        assert hits is not None
        id_count = len(hits["ids"])
        assert id_count > 0
        print(f" Found {id_count} results with logical operators")

        # Entries without metadata or without a "tag" key are not checked.
        for meta in hits["metadatas"]:
            if not meta or "tag" not in meta:
                continue
            assert meta["tag"] in ["ml", "python"]

    finally:
        self._cleanup_collection(client, name)
|