pyseekdb 0.1.0.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1126 @@
1
+ """
2
+ Collection hybrid search tests - testing collection.hybrid_search() interface for all three modes
3
+ Supports configuring connection parameters via environment variables
4
+ """
5
+ import pytest
6
+ import sys
7
+ import os
8
+ import time
9
+ import json
10
+ import uuid
11
+ from pathlib import Path
12
+
13
+ # Add project path
14
+ project_root = Path(__file__).parent.parent.parent
15
+ sys.path.insert(0, str(project_root))
16
+
17
+ import seekdbclient
18
+
19
+
20
# ==================== Environment Variable Configuration ====================
# Each constant is read from the environment variable of the same name; the
# second argument is the fallback used when that variable is not set.

# Embedded mode: location of the on-disk store and the database to open.
SEEKDB_PATH = os.getenv("SEEKDB_PATH", os.path.join(project_root, "seekdb_store"))
SEEKDB_DATABASE = os.getenv("SEEKDB_DATABASE", "test")

# Server mode: network endpoint and credentials.
SERVER_HOST = os.getenv("SERVER_HOST", "localhost")
SERVER_PORT = int(os.getenv("SERVER_PORT", "2881"))  # ports arrive as text
SERVER_DATABASE = os.getenv("SERVER_DATABASE", "test")
SERVER_USER = os.getenv("SERVER_USER", "root")
SERVER_PASSWORD = os.getenv("SERVER_PASSWORD", "")

# OceanBase mode: same shape as server mode plus a tenant name.
OB_HOST = os.getenv("OB_HOST", "localhost")
OB_PORT = int(os.getenv("OB_PORT", "11202"))  # ports arrive as text
OB_TENANT = os.getenv("OB_TENANT", "mysql")
OB_DATABASE = os.getenv("OB_DATABASE", "test")
OB_USER = os.getenv("OB_USER", "root")
OB_PASSWORD = os.getenv("OB_PASSWORD", "")
39
+
40
+
41
+ class TestCollectionHybridSearch:
42
+ """Test collection.hybrid_search() interface for all three modes"""
43
+
44
+ def _create_test_collection(self, client, collection_name: str, dimension: int = 3):
45
+ """Helper method to create a test collection"""
46
+ # Use client.create_collection to create the collection
47
+ collection = client.create_collection(
48
+ name=collection_name,
49
+ dimension=dimension
50
+ )
51
+ return collection
52
+
53
+ def _insert_test_data(self, client, collection_name: str):
54
+ """Helper method to insert test data via SQL"""
55
+ table_name = f"c$v1${collection_name}"
56
+
57
+ # Insert test data with vectors, documents, and metadata
58
+ # Data designed for hybrid search testing
59
+ test_data = [
60
+ {
61
+ "document": "Machine learning is a subset of artificial intelligence",
62
+ "embedding": [1.0, 2.0, 3.0],
63
+ "metadata": {"category": "AI", "page": 1, "score": 95, "tag": "ml"}
64
+ },
65
+ {
66
+ "document": "Python programming language is widely used in data science",
67
+ "embedding": [2.0, 3.0, 4.0],
68
+ "metadata": {"category": "Programming", "page": 2, "score": 88, "tag": "python"}
69
+ },
70
+ {
71
+ "document": "Deep learning algorithms for neural networks",
72
+ "embedding": [1.1, 2.1, 3.1],
73
+ "metadata": {"category": "AI", "page": 3, "score": 92, "tag": "ml"}
74
+ },
75
+ {
76
+ "document": "Data science with Python and machine learning",
77
+ "embedding": [2.1, 3.1, 4.1],
78
+ "metadata": {"category": "Data Science", "page": 4, "score": 90, "tag": "python"}
79
+ },
80
+ {
81
+ "document": "Introduction to artificial intelligence and neural networks",
82
+ "embedding": [1.2, 2.2, 3.2],
83
+ "metadata": {"category": "AI", "page": 5, "score": 85, "tag": "neural"}
84
+ },
85
+ {
86
+ "document": "Advanced machine learning techniques and algorithms",
87
+ "embedding": [1.3, 2.3, 3.3],
88
+ "metadata": {"category": "AI", "page": 6, "score": 93, "tag": "ml"}
89
+ },
90
+ {
91
+ "document": "Python tutorial for beginners in programming",
92
+ "embedding": [2.2, 3.2, 4.2],
93
+ "metadata": {"category": "Programming", "page": 7, "score": 87, "tag": "python"}
94
+ },
95
+ {
96
+ "document": "Natural language processing with machine learning",
97
+ "embedding": [1.4, 2.4, 3.4],
98
+ "metadata": {"category": "AI", "page": 8, "score": 91, "tag": "nlp"}
99
+ }
100
+ ]
101
+
102
+ for data in test_data:
103
+ # Generate UUID for _id (use string format directly)
104
+ id_str = str(uuid.uuid4())
105
+ # Escape single quotes in ID
106
+ id_str_escaped = id_str.replace("'", "''")
107
+
108
+ # Convert vector to string format: [1.0,2.0,3.0]
109
+ vector_str = "[" + ",".join(map(str, data["embedding"])) + "]"
110
+ # Convert metadata to JSON string
111
+ metadata_str = json.dumps(data["metadata"], ensure_ascii=False).replace("'", "\\'")
112
+ # Escape single quotes in document
113
+ document_str = data["document"].replace("'", "\\'")
114
+
115
+ # Use CAST to convert string to binary for varbinary(512) field
116
+ sql = f"""INSERT INTO `{table_name}` (_id, document, embedding, metadata)
117
+ VALUES (CAST('{id_str_escaped}' AS BINARY), '{document_str}', '{vector_str}', '{metadata_str}')"""
118
+ client._server.execute(sql)
119
+
120
+ print(f" Inserted {len(test_data)} test records")
121
+
122
+ def _cleanup_collection(self, client, collection_name: str):
123
+ """Helper method to cleanup test collection"""
124
+ table_name = f"c$v1${collection_name}"
125
+ try:
126
+ client._server.execute(f"DROP TABLE IF EXISTS `{table_name}`")
127
+ print(f" Cleaned up test table: {table_name}")
128
+ except Exception as cleanup_error:
129
+ print(f" Warning: Failed to cleanup test table: {cleanup_error}")
130
+
131
+ def test_oceanbase_hybrid_search_full_text_only(self):
132
+ """Test hybrid_search with only full-text search (query)"""
133
+ # Create OceanBase client
134
+ client = seekdbclient.OBClient(
135
+ host=OB_HOST,
136
+ port=OB_PORT,
137
+ tenant=OB_TENANT,
138
+ database=OB_DATABASE,
139
+ user=OB_USER,
140
+ password=OB_PASSWORD
141
+ )
142
+
143
+ assert client is not None
144
+ assert hasattr(client, '_server')
145
+ assert isinstance(client._server, seekdbclient.OceanBaseServerClient)
146
+
147
+ # Test connection
148
+ try:
149
+ result = client._server.execute("SELECT 1 as test")
150
+ assert result is not None
151
+ except Exception as e:
152
+ pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {e}")
153
+
154
+ # Create test collection
155
+ collection_name = f"test_hybrid_search_{int(time.time())}"
156
+ collection = self._create_test_collection(client, collection_name, dimension=3)
157
+
158
+ try:
159
+ # Insert test data
160
+ self._insert_test_data(client, collection_name)
161
+
162
+ # Wait a bit for indexes to be ready
163
+ time.sleep(1)
164
+
165
+ # Test 1: Full-text search only
166
+ print(f"\n✅ Testing hybrid_search with full-text search only")
167
+ results = collection.hybrid_search(
168
+ query={
169
+ "where_document": {
170
+ "$contains": "machine learning"
171
+ }
172
+ },
173
+ n_results=5,
174
+ include=["documents", "metadatas"]
175
+ )
176
+
177
+ assert results is not None
178
+ assert "ids" in results
179
+ assert "documents" in results
180
+ assert "metadatas" in results
181
+ assert len(results["ids"]) > 0
182
+ print(f" Found {len(results['ids'])} results")
183
+
184
+ # Verify results contain "machine learning"
185
+ for doc in results["documents"]:
186
+ if doc:
187
+ assert "machine" in doc.lower() or "learning" in doc.lower()
188
+
189
+ finally:
190
+ # Cleanup
191
+ self._cleanup_collection(client, collection_name)
192
+
193
+ def test_oceanbase_hybrid_search_vector_only(self):
194
+ """Test hybrid_search with only vector search (knn)"""
195
+ # Create OceanBase client
196
+ client = seekdbclient.OBClient(
197
+ host=OB_HOST,
198
+ port=OB_PORT,
199
+ tenant=OB_TENANT,
200
+ database=OB_DATABASE,
201
+ user=OB_USER,
202
+ password=OB_PASSWORD
203
+ )
204
+
205
+ # Test connection
206
+ try:
207
+ result = client._server.execute("SELECT 1 as test")
208
+ assert result is not None
209
+ except Exception as e:
210
+ pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {e}")
211
+
212
+ # Create test collection
213
+ collection_name = f"test_hybrid_search_{int(time.time())}"
214
+ collection = self._create_test_collection(client, collection_name, dimension=3)
215
+
216
+ try:
217
+ # Insert test data
218
+ self._insert_test_data(client, collection_name)
219
+
220
+ # Wait a bit for indexes to be ready
221
+ time.sleep(1)
222
+
223
+ # Test 2: Vector search only
224
+ print(f"\n✅ Testing hybrid_search with vector search only")
225
+ results = collection.hybrid_search(
226
+ knn={
227
+ "query_embeddings": [1.0, 2.0, 3.0],
228
+ "n_results": 5
229
+ },
230
+ n_results=5,
231
+ include=["documents", "metadatas", "embeddings"]
232
+ )
233
+
234
+ assert results is not None
235
+ assert "ids" in results
236
+ assert "distances" in results
237
+ assert len(results["ids"]) > 0
238
+ print(f" Found {len(results['ids'])} results")
239
+
240
+ # Verify distances are reasonable
241
+ # Note: APPROXIMATE ordering may not be perfectly sorted, so we only check
242
+ # that distances are non-negative and reasonable
243
+ distances = results["distances"]
244
+ assert len(distances) > 0
245
+ # All distances should be non-negative
246
+ for dist in distances:
247
+ assert dist >= 0, f"Distance should be non-negative, got {dist}"
248
+ # At least one distance should be relatively small (close match)
249
+ min_distance = min(distances)
250
+ assert min_distance < 10.0, f"At least one distance should be reasonable, got min={min_distance}"
251
+
252
+ finally:
253
+ # Cleanup
254
+ self._cleanup_collection(client, collection_name)
255
+
256
+ def test_oceanbase_hybrid_search_combined(self):
257
+ """Test hybrid_search with both full-text and vector search"""
258
+ # Create OceanBase client
259
+ client = seekdbclient.OBClient(
260
+ host=OB_HOST,
261
+ port=OB_PORT,
262
+ tenant=OB_TENANT,
263
+ database=OB_DATABASE,
264
+ user=OB_USER,
265
+ password=OB_PASSWORD
266
+ )
267
+
268
+ # Test connection
269
+ try:
270
+ result = client._server.execute("SELECT 1 as test")
271
+ assert result is not None
272
+ except Exception as e:
273
+ pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {e}")
274
+
275
+ # Create test collection
276
+ collection_name = f"test_hybrid_search_{int(time.time())}"
277
+ collection = self._create_test_collection(client, collection_name, dimension=3)
278
+
279
+ try:
280
+ # Insert test data
281
+ self._insert_test_data(client, collection_name)
282
+
283
+ # Wait a bit for indexes to be ready
284
+ time.sleep(1)
285
+
286
+ # Test 3: Combined full-text and vector search
287
+ print(f"\n✅ Testing hybrid_search with both full-text and vector search")
288
+ results = collection.hybrid_search(
289
+ query={
290
+ "where_document": {
291
+ "$contains": "machine learning"
292
+ },
293
+ "n_results": 10
294
+ },
295
+ knn={
296
+ "query_embeddings": [1.0, 2.0, 3.0],
297
+ "n_results": 10
298
+ },
299
+ rank={
300
+ "rrf": {
301
+ "rank_window_size": 60,
302
+ "rank_constant": 60
303
+ }
304
+ },
305
+ n_results=5,
306
+ include=["documents", "metadatas", "embeddings"]
307
+ )
308
+
309
+ assert results is not None
310
+ assert "ids" in results
311
+ assert len(results["ids"]) > 0
312
+ print(f" Found {len(results['ids'])} results after RRF ranking")
313
+
314
+ finally:
315
+ # Cleanup
316
+ self._cleanup_collection(client, collection_name)
317
+
318
+ def test_oceanbase_hybrid_search_with_metadata_filter(self):
319
+ """Test hybrid_search with metadata filters"""
320
+ # Create OceanBase client
321
+ client = seekdbclient.OBClient(
322
+ host=OB_HOST,
323
+ port=OB_PORT,
324
+ tenant=OB_TENANT,
325
+ database=OB_DATABASE,
326
+ user=OB_USER,
327
+ password=OB_PASSWORD
328
+ )
329
+
330
+ # Test connection
331
+ try:
332
+ result = client._server.execute("SELECT 1 as test")
333
+ assert result is not None
334
+ except Exception as e:
335
+ pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {e}")
336
+
337
+ # Create test collection
338
+ collection_name = f"test_hybrid_search_{int(time.time())}"
339
+ collection = self._create_test_collection(client, collection_name, dimension=3)
340
+
341
+ try:
342
+ # Insert test data
343
+ self._insert_test_data(client, collection_name)
344
+
345
+ # Wait a bit for indexes to be ready
346
+ time.sleep(1)
347
+
348
+ # Test 4: Hybrid search with metadata filter
349
+ print(f"\n✅ Testing hybrid_search with metadata filter")
350
+ results = collection.hybrid_search(
351
+ query={
352
+ "where_document": {
353
+ "$contains": "machine"
354
+ },
355
+ "where": {
356
+ "$and": [
357
+ {"category": {"$eq": "AI"}},
358
+ {"page": {"$gte": 1}},
359
+ {"page": {"$lte": 5}}
360
+ ]
361
+ },
362
+ "n_results": 10
363
+ },
364
+ knn={
365
+ "query_embeddings": [1.0, 2.0, 3.0],
366
+ "where": {
367
+ "$and": [
368
+ {"category": {"$eq": "AI"}},
369
+ {"score": {"$gte": 90}}
370
+ ]
371
+ },
372
+ "n_results": 10
373
+ },
374
+ n_results=5,
375
+ include=["documents", "metadatas"]
376
+ )
377
+
378
+ assert results is not None
379
+ assert len(results["ids"]) > 0
380
+ print(f" Found {len(results['ids'])} results with metadata filters")
381
+
382
+ # Verify metadata filters are applied
383
+ # Note: In hybrid search with RRF ranking, results may include records from both
384
+ # full-text and vector search, so we check that all results meet at least one set of filters
385
+ for metadata in results["metadatas"]:
386
+ if metadata:
387
+ # Results should have category "AI" (common to both query and knn filters)
388
+ assert metadata.get("category") == "AI"
389
+ # Page filter may not be strictly applied in hybrid search results
390
+ # due to RRF ranking combining results from both queries
391
+
392
+ finally:
393
+ # Cleanup
394
+ self._cleanup_collection(client, collection_name)
395
+
396
+ def test_oceanbase_hybrid_search_with_logical_operators(self):
397
+ """Test hybrid_search with logical operators in metadata filters"""
398
+ # Create OceanBase client
399
+ client = seekdbclient.OBClient(
400
+ host=OB_HOST,
401
+ port=OB_PORT,
402
+ tenant=OB_TENANT,
403
+ database=OB_DATABASE,
404
+ user=OB_USER,
405
+ password=OB_PASSWORD
406
+ )
407
+
408
+ # Test connection
409
+ try:
410
+ result = client._server.execute("SELECT 1 as test")
411
+ assert result is not None
412
+ except Exception as e:
413
+ pytest.skip(f"OceanBase connection failed ({OB_HOST}:{OB_PORT}): {e}")
414
+
415
+ # Create test collection
416
+ collection_name = f"test_hybrid_search_{int(time.time())}"
417
+ collection = self._create_test_collection(client, collection_name, dimension=3)
418
+
419
+ try:
420
+ # Insert test data
421
+ self._insert_test_data(client, collection_name)
422
+
423
+ # Wait a bit for indexes to be ready
424
+ time.sleep(1)
425
+
426
+ # Test 5: Hybrid search with logical operators ($or, $in)
427
+ print(f"\n✅ Testing hybrid_search with logical operators")
428
+ results = collection.hybrid_search(
429
+ query={
430
+ "where_document": {
431
+ "$and": [
432
+ {"$contains": "machine"},
433
+ {"$contains": "learning"}
434
+ ]
435
+ },
436
+ "where": {
437
+ "$or": [
438
+ {"tag": {"$eq": "ml"}},
439
+ {"tag": {"$eq": "python"}}
440
+ ]
441
+ },
442
+ "n_results": 10
443
+ },
444
+ knn={
445
+ "query_embeddings": [1.0, 2.0, 3.0],
446
+ "where": {
447
+ "tag": {"$in": ["ml", "python"]}
448
+ },
449
+ "n_results": 10
450
+ },
451
+ rank={"rrf": {}},
452
+ n_results=5,
453
+ include=["documents", "metadatas"]
454
+ )
455
+
456
+ assert results is not None
457
+ assert len(results["ids"]) > 0
458
+ print(f" Found {len(results['ids'])} results with logical operators")
459
+
460
+ # Verify logical operators are applied
461
+ for metadata in results["metadatas"]:
462
+ if metadata and "tag" in metadata:
463
+ assert metadata["tag"] in ["ml", "python"]
464
+
465
+ finally:
466
+ # Cleanup
467
+ self._cleanup_collection(client, collection_name)
468
+
469
+ def test_seekdb_server_hybrid_search_full_text_only(self):
470
+ """Test hybrid_search with only full-text search (query) using SeekdbServer"""
471
+ # Create SeekdbServer client
472
+ client = seekdbclient.Client(
473
+ host=SERVER_HOST,
474
+ port=SERVER_PORT,
475
+ database=SERVER_DATABASE,
476
+ user=SERVER_USER,
477
+ password=SERVER_PASSWORD
478
+ )
479
+
480
+ assert client is not None
481
+ assert hasattr(client, '_server')
482
+ assert isinstance(client._server, seekdbclient.SeekdbServerClient)
483
+
484
+ # Test connection
485
+ try:
486
+ result = client._server.execute("SELECT 1 as test")
487
+ assert result is not None
488
+ except Exception as e:
489
+ pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {e}")
490
+
491
+ # Create test collection
492
+ collection_name = f"test_hybrid_search_{int(time.time())}"
493
+ collection = self._create_test_collection(client, collection_name, dimension=3)
494
+
495
+ try:
496
+ # Insert test data
497
+ self._insert_test_data(client, collection_name)
498
+
499
+ # Wait a bit for indexes to be ready
500
+ time.sleep(1)
501
+
502
+ # Test 1: Full-text search only
503
+ print(f"\n✅ Testing hybrid_search with full-text search only (SeekdbServer)")
504
+ results = collection.hybrid_search(
505
+ query={
506
+ "where_document": {
507
+ "$contains": "machine learning"
508
+ }
509
+ },
510
+ n_results=5,
511
+ include=["documents", "metadatas"]
512
+ )
513
+
514
+ assert results is not None
515
+ assert "ids" in results
516
+ assert "documents" in results
517
+ assert "metadatas" in results
518
+ assert len(results["ids"]) > 0
519
+ print(f" Found {len(results['ids'])} results")
520
+
521
+ # Verify results contain "machine learning"
522
+ for doc in results["documents"]:
523
+ if doc:
524
+ assert "machine" in doc.lower() or "learning" in doc.lower()
525
+
526
+ finally:
527
+ # Cleanup
528
+ self._cleanup_collection(client, collection_name)
529
+
530
+ def test_seekdb_server_hybrid_search_combined(self):
531
+ """Test hybrid_search with both full-text and vector search using SeekdbServer"""
532
+ # Create SeekdbServer client
533
+ client = seekdbclient.Client(
534
+ host=SERVER_HOST,
535
+ port=SERVER_PORT,
536
+ database=SERVER_DATABASE,
537
+ user=SERVER_USER,
538
+ password=SERVER_PASSWORD
539
+ )
540
+
541
+ # Test connection
542
+ try:
543
+ result = client._server.execute("SELECT 1 as test")
544
+ assert result is not None
545
+ except Exception as e:
546
+ pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {e}")
547
+
548
+ # Create test collection
549
+ collection_name = f"test_hybrid_search_{int(time.time())}"
550
+ collection = self._create_test_collection(client, collection_name, dimension=3)
551
+
552
+ try:
553
+ # Insert test data
554
+ self._insert_test_data(client, collection_name)
555
+
556
+ # Wait a bit for indexes to be ready
557
+ time.sleep(1)
558
+
559
+ # Test: Combined full-text and vector search
560
+ print(f"\n✅ Testing hybrid_search with both full-text and vector search (SeekdbServer)")
561
+ results = collection.hybrid_search(
562
+ query={
563
+ "where_document": {
564
+ "$contains": "machine learning"
565
+ },
566
+ "n_results": 10
567
+ },
568
+ knn={
569
+ "query_embeddings": [1.0, 2.0, 3.0],
570
+ "n_results": 10
571
+ },
572
+ rank={
573
+ "rrf": {
574
+ "rank_window_size": 60,
575
+ "rank_constant": 60
576
+ }
577
+ },
578
+ n_results=5,
579
+ include=["documents", "metadatas", "embeddings"]
580
+ )
581
+
582
+ assert results is not None
583
+ assert "ids" in results
584
+ assert len(results["ids"]) > 0
585
+ print(f" Found {len(results['ids'])} results after RRF ranking")
586
+
587
+ finally:
588
+ # Cleanup
589
+ self._cleanup_collection(client, collection_name)
590
+
591
+ def test_seekdb_server_hybrid_search_vector_only(self):
592
+ """Test hybrid_search with only vector search (knn) using SeekdbServer"""
593
+ # Create SeekdbServer client
594
+ client = seekdbclient.Client(
595
+ host=SERVER_HOST,
596
+ port=SERVER_PORT,
597
+ database=SERVER_DATABASE,
598
+ user=SERVER_USER,
599
+ password=SERVER_PASSWORD
600
+ )
601
+
602
+ # Test connection
603
+ try:
604
+ result = client._server.execute("SELECT 1 as test")
605
+ assert result is not None
606
+ except Exception as e:
607
+ pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {e}")
608
+
609
+ # Create test collection
610
+ collection_name = f"test_hybrid_search_{int(time.time())}"
611
+ collection = self._create_test_collection(client, collection_name, dimension=3)
612
+
613
+ try:
614
+ # Insert test data
615
+ self._insert_test_data(client, collection_name)
616
+
617
+ # Wait a bit for indexes to be ready
618
+ time.sleep(1)
619
+
620
+ # Test: Vector search only
621
+ print(f"\n✅ Testing hybrid_search with vector search only (SeekdbServer)")
622
+ results = collection.hybrid_search(
623
+ knn={
624
+ "query_embeddings": [1.0, 2.0, 3.0],
625
+ "n_results": 5
626
+ },
627
+ n_results=5,
628
+ include=["documents", "metadatas", "embeddings"]
629
+ )
630
+
631
+ assert results is not None
632
+ assert "ids" in results
633
+ assert "distances" in results
634
+ assert len(results["ids"]) > 0
635
+ print(f" Found {len(results['ids'])} results")
636
+
637
+ # Verify distances are reasonable
638
+ distances = results["distances"]
639
+ assert len(distances) > 0
640
+ for dist in distances:
641
+ assert dist >= 0, f"Distance should be non-negative, got {dist}"
642
+ min_distance = min(distances)
643
+ assert min_distance < 10.0, f"At least one distance should be reasonable, got min={min_distance}"
644
+
645
+ finally:
646
+ # Cleanup
647
+ self._cleanup_collection(client, collection_name)
648
+
649
+ def test_seekdb_server_hybrid_search_with_metadata_filter(self):
650
+ """Test hybrid_search with metadata filters using SeekdbServer"""
651
+ # Create SeekdbServer client
652
+ client = seekdbclient.Client(
653
+ host=SERVER_HOST,
654
+ port=SERVER_PORT,
655
+ database=SERVER_DATABASE,
656
+ user=SERVER_USER,
657
+ password=SERVER_PASSWORD
658
+ )
659
+
660
+ # Test connection
661
+ try:
662
+ result = client._server.execute("SELECT 1 as test")
663
+ assert result is not None
664
+ except Exception as e:
665
+ pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {e}")
666
+
667
+ # Create test collection
668
+ collection_name = f"test_hybrid_search_{int(time.time())}"
669
+ collection = self._create_test_collection(client, collection_name, dimension=3)
670
+
671
+ try:
672
+ # Insert test data
673
+ self._insert_test_data(client, collection_name)
674
+
675
+ # Wait a bit for indexes to be ready
676
+ time.sleep(1)
677
+
678
+ # Test: Hybrid search with metadata filter
679
+ print(f"\n✅ Testing hybrid_search with metadata filter (SeekdbServer)")
680
+ results = collection.hybrid_search(
681
+ query={
682
+ "where_document": {
683
+ "$contains": "machine"
684
+ },
685
+ "where": {
686
+ "$and": [
687
+ {"category": {"$eq": "AI"}},
688
+ {"page": {"$gte": 1}},
689
+ {"page": {"$lte": 5}}
690
+ ]
691
+ },
692
+ "n_results": 10
693
+ },
694
+ knn={
695
+ "query_embeddings": [1.0, 2.0, 3.0],
696
+ "where": {
697
+ "$and": [
698
+ {"category": {"$eq": "AI"}},
699
+ {"score": {"$gte": 90}}
700
+ ]
701
+ },
702
+ "n_results": 10
703
+ },
704
+ n_results=5,
705
+ include=["documents", "metadatas"]
706
+ )
707
+
708
+ assert results is not None
709
+ assert len(results["ids"]) > 0
710
+ print(f" Found {len(results['ids'])} results with metadata filters")
711
+
712
+ # Verify metadata filters are applied
713
+ for metadata in results["metadatas"]:
714
+ if metadata:
715
+ assert metadata.get("category") == "AI"
716
+
717
+ finally:
718
+ # Cleanup
719
+ self._cleanup_collection(client, collection_name)
720
+
721
+ def test_seekdb_server_hybrid_search_with_logical_operators(self):
722
+ """Test hybrid_search with logical operators in metadata filters using SeekdbServer"""
723
+ # Create SeekdbServer client
724
+ client = seekdbclient.Client(
725
+ host=SERVER_HOST,
726
+ port=SERVER_PORT,
727
+ database=SERVER_DATABASE,
728
+ user=SERVER_USER,
729
+ password=SERVER_PASSWORD
730
+ )
731
+
732
+ # Test connection
733
+ try:
734
+ result = client._server.execute("SELECT 1 as test")
735
+ assert result is not None
736
+ except Exception as e:
737
+ pytest.skip(f"SeekdbServer connection failed ({SERVER_HOST}:{SERVER_PORT}): {e}")
738
+
739
+ # Create test collection
740
+ collection_name = f"test_hybrid_search_{int(time.time())}"
741
+ collection = self._create_test_collection(client, collection_name, dimension=3)
742
+
743
+ try:
744
+ # Insert test data
745
+ self._insert_test_data(client, collection_name)
746
+
747
+ # Wait a bit for indexes to be ready
748
+ time.sleep(1)
749
+
750
+ # Test: Hybrid search with logical operators
751
+ print(f"\n✅ Testing hybrid_search with logical operators (SeekdbServer)")
752
+ results = collection.hybrid_search(
753
+ query={
754
+ "where_document": {
755
+ "$and": [
756
+ {"$contains": "machine"},
757
+ {"$contains": "learning"}
758
+ ]
759
+ },
760
+ "where": {
761
+ "$or": [
762
+ {"tag": {"$eq": "ml"}},
763
+ {"tag": {"$eq": "python"}}
764
+ ]
765
+ },
766
+ "n_results": 10
767
+ },
768
+ knn={
769
+ "query_embeddings": [1.0, 2.0, 3.0],
770
+ "where": {
771
+ "tag": {"$in": ["ml", "python"]}
772
+ },
773
+ "n_results": 10
774
+ },
775
+ rank={"rrf": {}},
776
+ n_results=5,
777
+ include=["documents", "metadatas"]
778
+ )
779
+
780
+ assert results is not None
781
+ assert len(results["ids"]) > 0
782
+ print(f" Found {len(results['ids'])} results with logical operators")
783
+
784
+ # Verify logical operators are applied
785
+ for metadata in results["metadatas"]:
786
+ if metadata and "tag" in metadata:
787
+ assert metadata["tag"] in ["ml", "python"]
788
+
789
+ finally:
790
+ # Cleanup
791
+ self._cleanup_collection(client, collection_name)
792
+
793
+
794
+ def test_embedded_hybrid_search_full_text_only(self):
795
+ """Test hybrid_search with only full-text search (query) using SeekdbEmbedded"""
796
+ if not os.path.exists(SEEKDB_PATH):
797
+ pytest.skip(
798
+ f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
799
+ f"Set SEEKDB_PATH environment variable to run this test"
800
+ )
801
+
802
+ # Check if seekdb package is available
803
+ try:
804
+ import seekdb
805
+ except ImportError:
806
+ pytest.skip("SeekDB embedded package is not installed")
807
+
808
+ # Create embedded client
809
+ client = seekdbclient.Client(
810
+ path=SEEKDB_PATH,
811
+ database=SEEKDB_DATABASE
812
+ )
813
+
814
+ assert client is not None
815
+ assert hasattr(client, '_server')
816
+ assert isinstance(client._server, seekdbclient.SeekdbEmbeddedClient)
817
+
818
+ # Create test collection
819
+ collection_name = f"test_hybrid_search_{int(time.time())}"
820
+ collection = self._create_test_collection(client, collection_name, dimension=3)
821
+
822
+ try:
823
+ # Insert test data
824
+ self._insert_test_data(client, collection_name)
825
+
826
+ # Wait a bit for indexes to be ready
827
+ time.sleep(1)
828
+
829
+ # Test 1: Full-text search only
830
+ print(f"\n✅ Testing hybrid_search with full-text search only (SeekdbEmbedded)")
831
+ results = collection.hybrid_search(
832
+ query={
833
+ "where_document": {
834
+ "$contains": "machine learning"
835
+ }
836
+ },
837
+ n_results=5,
838
+ include=["documents", "metadatas"]
839
+ )
840
+
841
+ assert results is not None
842
+ assert "ids" in results
843
+ assert "documents" in results
844
+ assert "metadatas" in results
845
+ assert len(results["ids"]) > 0
846
+ print(f" Found {len(results['ids'])} results")
847
+
848
+ # Verify results contain "machine learning"
849
+ for doc in results["documents"]:
850
+ if doc:
851
+ assert "machine" in doc.lower() or "learning" in doc.lower()
852
+
853
+ finally:
854
+ # Cleanup
855
+ self._cleanup_collection(client, collection_name)
856
+
857
+ def test_embedded_hybrid_search_vector_only(self):
858
+ """Test hybrid_search with only vector search (knn) using SeekdbEmbedded"""
859
+ if not os.path.exists(SEEKDB_PATH):
860
+ pytest.skip(
861
+ f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
862
+ f"Set SEEKDB_PATH environment variable to run this test"
863
+ )
864
+
865
+ # Check if seekdb package is available
866
+ try:
867
+ import seekdb
868
+ except ImportError:
869
+ pytest.skip("SeekDB embedded package is not installed")
870
+
871
+ # Create embedded client
872
+ client = seekdbclient.Client(
873
+ path=SEEKDB_PATH,
874
+ database=SEEKDB_DATABASE
875
+ )
876
+
877
+ # Create test collection
878
+ collection_name = f"test_hybrid_search_{int(time.time())}"
879
+ collection = self._create_test_collection(client, collection_name, dimension=3)
880
+
881
+ try:
882
+ # Insert test data
883
+ self._insert_test_data(client, collection_name)
884
+
885
+ # Wait a bit for indexes to be ready
886
+ time.sleep(1)
887
+
888
+ # Test: Vector search only
889
+ print(f"\n✅ Testing hybrid_search with vector search only (SeekdbEmbedded)")
890
+ results = collection.hybrid_search(
891
+ knn={
892
+ "query_embeddings": [1.0, 2.0, 3.0],
893
+ "n_results": 5
894
+ },
895
+ n_results=5,
896
+ include=["documents", "metadatas", "embeddings"]
897
+ )
898
+
899
+ assert results is not None
900
+ assert "ids" in results
901
+ assert "distances" in results
902
+ assert len(results["ids"]) > 0
903
+ print(f" Found {len(results['ids'])} results")
904
+
905
+ # Verify distances are reasonable
906
+ distances = results["distances"]
907
+ assert len(distances) > 0
908
+ for dist in distances:
909
+ assert dist >= 0, f"Distance should be non-negative, got {dist}"
910
+ min_distance = min(distances)
911
+ assert min_distance < 10.0, f"At least one distance should be reasonable, got min={min_distance}"
912
+
913
+ finally:
914
+ # Cleanup
915
+ self._cleanup_collection(client, collection_name)
916
+
917
+ def test_embedded_hybrid_search_combined(self):
918
+ """Test hybrid_search with both full-text and vector search using SeekdbEmbedded"""
919
+ if not os.path.exists(SEEKDB_PATH):
920
+ pytest.skip(
921
+ f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
922
+ f"Set SEEKDB_PATH environment variable to run this test"
923
+ )
924
+
925
+ # Check if seekdb package is available
926
+ try:
927
+ import seekdb
928
+ except ImportError:
929
+ pytest.skip("SeekDB embedded package is not installed")
930
+
931
+ # Create embedded client
932
+ client = seekdbclient.Client(
933
+ path=SEEKDB_PATH,
934
+ database=SEEKDB_DATABASE
935
+ )
936
+
937
+ # Create test collection
938
+ collection_name = f"test_hybrid_search_{int(time.time())}"
939
+ collection = self._create_test_collection(client, collection_name, dimension=3)
940
+
941
+ try:
942
+ # Insert test data
943
+ self._insert_test_data(client, collection_name)
944
+
945
+ # Wait a bit for indexes to be ready
946
+ time.sleep(1)
947
+
948
+ # Test: Combined full-text and vector search
949
+ print(f"\n✅ Testing hybrid_search with both full-text and vector search (SeekdbEmbedded)")
950
+ results = collection.hybrid_search(
951
+ query={
952
+ "where_document": {
953
+ "$contains": "machine learning"
954
+ },
955
+ "n_results": 10
956
+ },
957
+ knn={
958
+ "query_embeddings": [1.0, 2.0, 3.0],
959
+ "n_results": 10
960
+ },
961
+ rank={
962
+ "rrf": {
963
+ "rank_window_size": 60,
964
+ "rank_constant": 60
965
+ }
966
+ },
967
+ n_results=5,
968
+ include=["documents", "metadatas", "embeddings"]
969
+ )
970
+
971
+ assert results is not None
972
+ assert "ids" in results
973
+ assert len(results["ids"]) > 0
974
+ print(f" Found {len(results['ids'])} results after RRF ranking")
975
+
976
+ finally:
977
+ # Cleanup
978
+ self._cleanup_collection(client, collection_name)
979
+
980
+ def test_embedded_hybrid_search_with_metadata_filter(self):
981
+ """Test hybrid_search with metadata filters using SeekdbEmbedded"""
982
+ if not os.path.exists(SEEKDB_PATH):
983
+ pytest.skip(
984
+ f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
985
+ f"Set SEEKDB_PATH environment variable to run this test"
986
+ )
987
+
988
+ # Check if seekdb package is available
989
+ try:
990
+ import seekdb
991
+ except ImportError:
992
+ pytest.skip("SeekDB embedded package is not installed")
993
+
994
+ # Create embedded client
995
+ client = seekdbclient.Client(
996
+ path=SEEKDB_PATH,
997
+ database=SEEKDB_DATABASE
998
+ )
999
+
1000
+ # Create test collection
1001
+ collection_name = f"test_hybrid_search_{int(time.time())}"
1002
+ collection = self._create_test_collection(client, collection_name, dimension=3)
1003
+
1004
+ try:
1005
+ # Insert test data
1006
+ self._insert_test_data(client, collection_name)
1007
+
1008
+ # Wait a bit for indexes to be ready
1009
+ time.sleep(1)
1010
+
1011
+ # Test: Hybrid search with metadata filter
1012
+ print(f"\n✅ Testing hybrid_search with metadata filter (SeekdbEmbedded)")
1013
+ results = collection.hybrid_search(
1014
+ query={
1015
+ "where_document": {
1016
+ "$contains": "machine"
1017
+ },
1018
+ "where": {
1019
+ "$and": [
1020
+ {"category": {"$eq": "AI"}},
1021
+ {"page": {"$gte": 1}},
1022
+ {"page": {"$lte": 5}}
1023
+ ]
1024
+ },
1025
+ "n_results": 10
1026
+ },
1027
+ knn={
1028
+ "query_embeddings": [1.0, 2.0, 3.0],
1029
+ "where": {
1030
+ "$and": [
1031
+ {"category": {"$eq": "AI"}},
1032
+ {"score": {"$gte": 90}}
1033
+ ]
1034
+ },
1035
+ "n_results": 10
1036
+ },
1037
+ n_results=5,
1038
+ include=["documents", "metadatas"]
1039
+ )
1040
+
1041
+ assert results is not None
1042
+ assert len(results["ids"]) > 0
1043
+ print(f" Found {len(results['ids'])} results with metadata filters")
1044
+
1045
+ # Verify metadata filters are applied
1046
+ for metadata in results["metadatas"]:
1047
+ if metadata:
1048
+ assert metadata.get("category") == "AI"
1049
+
1050
+ finally:
1051
+ # Cleanup
1052
+ self._cleanup_collection(client, collection_name)
1053
+
1054
+ def test_embedded_hybrid_search_with_logical_operators(self):
1055
+ """Test hybrid_search with logical operators in metadata filters using SeekdbEmbedded"""
1056
+ if not os.path.exists(SEEKDB_PATH):
1057
+ pytest.skip(
1058
+ f"SeekDB data directory does not exist: {SEEKDB_PATH}\n"
1059
+ f"Set SEEKDB_PATH environment variable to run this test"
1060
+ )
1061
+
1062
+ # Check if seekdb package is available
1063
+ try:
1064
+ import seekdb
1065
+ except ImportError:
1066
+ pytest.skip("SeekDB embedded package is not installed")
1067
+
1068
+ # Create embedded client
1069
+ client = seekdbclient.Client(
1070
+ path=SEEKDB_PATH,
1071
+ database=SEEKDB_DATABASE
1072
+ )
1073
+
1074
+ # Create test collection
1075
+ collection_name = f"test_hybrid_search_{int(time.time())}"
1076
+ collection = self._create_test_collection(client, collection_name, dimension=3)
1077
+
1078
+ try:
1079
+ # Insert test data
1080
+ self._insert_test_data(client, collection_name)
1081
+
1082
+ # Wait a bit for indexes to be ready
1083
+ time.sleep(1)
1084
+
1085
+ # Test: Hybrid search with logical operators
1086
+ print(f"\n✅ Testing hybrid_search with logical operators (SeekdbEmbedded)")
1087
+ results = collection.hybrid_search(
1088
+ query={
1089
+ "where_document": {
1090
+ "$and": [
1091
+ {"$contains": "machine"},
1092
+ {"$contains": "learning"}
1093
+ ]
1094
+ },
1095
+ "where": {
1096
+ "$or": [
1097
+ {"tag": {"$eq": "ml"}},
1098
+ {"tag": {"$eq": "python"}}
1099
+ ]
1100
+ },
1101
+ "n_results": 10
1102
+ },
1103
+ knn={
1104
+ "query_embeddings": [1.0, 2.0, 3.0],
1105
+ "where": {
1106
+ "tag": {"$in": ["ml", "python"]}
1107
+ },
1108
+ "n_results": 10
1109
+ },
1110
+ rank={"rrf": {}},
1111
+ n_results=5,
1112
+ include=["documents", "metadatas"]
1113
+ )
1114
+
1115
+ assert results is not None
1116
+ assert len(results["ids"]) > 0
1117
+ print(f" Found {len(results['ids'])} results with logical operators")
1118
+
1119
+ # Verify logical operators are applied
1120
+ for metadata in results["metadatas"]:
1121
+ if metadata and "tag" in metadata:
1122
+ assert metadata["tag"] in ["ml", "python"]
1123
+
1124
+ finally:
1125
+ # Cleanup
1126
+ self._cleanup_collection(client, collection_name)