pyseekdb 0.1.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyseekdb/__init__.py +90 -0
- pyseekdb/client/__init__.py +324 -0
- pyseekdb/client/admin_client.py +202 -0
- pyseekdb/client/base_connection.py +82 -0
- pyseekdb/client/client_base.py +1921 -0
- pyseekdb/client/client_oceanbase_server.py +258 -0
- pyseekdb/client/client_seekdb_embedded.py +324 -0
- pyseekdb/client/client_seekdb_server.py +226 -0
- pyseekdb/client/collection.py +485 -0
- pyseekdb/client/database.py +55 -0
- pyseekdb/client/filters.py +357 -0
- pyseekdb/client/meta_info.py +15 -0
- pyseekdb/client/query_result.py +122 -0
- pyseekdb/client/sql_utils.py +48 -0
- pyseekdb/examples/comprehensive_example.py +412 -0
- pyseekdb/examples/simple_example.py +113 -0
- pyseekdb/tests/__init__.py +0 -0
- pyseekdb/tests/test_admin_database_management.py +307 -0
- pyseekdb/tests/test_client_creation.py +425 -0
- pyseekdb/tests/test_collection_dml.py +652 -0
- pyseekdb/tests/test_collection_get.py +550 -0
- pyseekdb/tests/test_collection_hybrid_search.py +1126 -0
- pyseekdb/tests/test_collection_query.py +428 -0
- pyseekdb-0.1.0.dev3.dist-info/LICENSE +202 -0
- pyseekdb-0.1.0.dev3.dist-info/METADATA +856 -0
- pyseekdb-0.1.0.dev3.dist-info/RECORD +27 -0
- pyseekdb-0.1.0.dev3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,412 @@
|
|
"""
Comprehensive Example: Complete guide to all pyseekdb client features

This example demonstrates all available operations:
1. Client connection (all modes)
2. Collection management
3. DML operations (add, update, upsert, delete)
4. DQL operations (query, get, hybrid_search)
5. Filter operators
6. Collection information methods

This is a complete reference for all client capabilities.

The script runs top to bottom against the server configured in PART 1
(127.0.0.1:2881, database "test") and deletes both collections it
created when it finishes.
"""
import random
import uuid

# The wheel installs the package as ``pyseekdb``; no ``seekdbclient``
# module is shipped, so that is the name that must be imported.
import pyseekdb

# ============================================================================
# PART 1: CLIENT CONNECTION
# ============================================================================

# Option 1: Embedded mode (local SeekDB)
# client = pyseekdb.Client(
#     path="./seekdb",
#     database="test"
# )

# Option 2: Server mode (remote SeekDB server)
client = pyseekdb.Client(
    host="127.0.0.1",
    port=2881,
    database="test",
    user="root",
    password=""
)

# Option 3: OceanBase mode
# ob_client = pyseekdb.OBClient(
#     host="127.0.0.1",
#     port=11402,
#     tenant="mysql",
#     database="test",
#     user="root",
#     password=""
# )

# ============================================================================
# PART 2: COLLECTION MANAGEMENT
# ============================================================================

collection_name = "comprehensive_example"
dimension = 128

# 2.1 Create a collection
collection = client.get_or_create_collection(
    name=collection_name,
    dimension=dimension
)

# 2.2 Check if collection exists
exists = client.has_collection(collection_name)

# 2.3 Get collection object
retrieved_collection = client.get_collection(collection_name)

# 2.4 List all collections
all_collections = client.list_collections()

# 2.5 Get or create collection (creates if doesn't exist)
collection2 = client.get_or_create_collection(
    name="another_collection",
    dimension=64
)

# ============================================================================
# PART 3: DML OPERATIONS - ADD DATA
# ============================================================================

# Generate sample data (fixed seed so every run produces the same vectors)
random.seed(42)
documents = [
    "Machine learning is transforming the way we solve problems",
    "Python programming language is widely used in data science",
    "Vector databases enable efficient similarity search",
    "Neural networks mimic the structure of the human brain",
    "Natural language processing helps computers understand human language",
    "Deep learning requires large amounts of training data",
    "Reinforcement learning agents learn through trial and error",
    "Computer vision enables machines to interpret visual information"
]

# Generate one random vector per document
# (in real usage, use an embedding model)
vectors = [[random.random() for _ in range(dimension)] for _ in documents]

ids = [str(uuid.uuid4()) for _ in documents]

# 3.1 Add single item
single_id = str(uuid.uuid4())
collection.add(
    ids=single_id,
    documents="This is a single document",
    vectors=[random.random() for _ in range(dimension)],
    metadatas={"type": "single", "category": "test"}
)

# 3.2 Add multiple items
collection.add(
    ids=ids,
    documents=documents,
    vectors=vectors,
    metadatas=[
        {"category": "AI", "score": 95, "tag": "ml", "year": 2023},
        {"category": "Programming", "score": 88, "tag": "python", "year": 2022},
        {"category": "Database", "score": 92, "tag": "vector", "year": 2023},
        {"category": "AI", "score": 90, "tag": "neural", "year": 2022},
        {"category": "NLP", "score": 87, "tag": "language", "year": 2023},
        {"category": "AI", "score": 93, "tag": "deep", "year": 2023},
        {"category": "AI", "score": 85, "tag": "reinforcement", "year": 2022},
        {"category": "CV", "score": 91, "tag": "vision", "year": 2023}
    ]
)

# 3.3 Add with only vectors (no documents)
vector_only_ids = [str(uuid.uuid4()) for _ in range(2)]
collection.add(
    ids=vector_only_ids,
    vectors=[[random.random() for _ in range(dimension)] for _ in range(2)],
    metadatas=[{"type": "vector_only"}, {"type": "vector_only"}]
)

# ============================================================================
# PART 4: DML OPERATIONS - UPDATE DATA
# ============================================================================

# 4.1 Update single item
collection.update(
    ids=ids[0],
    metadatas={"category": "AI", "score": 98, "tag": "ml", "year": 2024, "updated": True}
)

# 4.2 Update multiple items
collection.update(
    ids=ids[1:3],
    documents=["Updated document 1", "Updated document 2"],
    metadatas=[
        {"category": "Programming", "score": 95, "updated": True},
        {"category": "Database", "score": 97, "updated": True}
    ]
)

# 4.3 Update vectors
new_vectors = [[random.random() for _ in range(dimension)] for _ in range(2)]
collection.update(
    ids=ids[2:4],
    vectors=new_vectors
)

# ============================================================================
# PART 5: DML OPERATIONS - UPSERT DATA
# ============================================================================

# 5.1 Upsert existing item (will update)
collection.upsert(
    ids=ids[0],
    documents="Upserted document (was updated)",
    vectors=[random.random() for _ in range(dimension)],
    metadatas={"category": "AI", "upserted": True}
)

# 5.2 Upsert new item (will insert)
new_id = str(uuid.uuid4())
collection.upsert(
    ids=new_id,
    documents="This is a new document from upsert",
    vectors=[random.random() for _ in range(dimension)],
    metadatas={"category": "New", "upserted": True}
)

# 5.3 Upsert multiple items
upsert_ids = [ids[4], str(uuid.uuid4())]  # One existing, one new
collection.upsert(
    ids=upsert_ids,
    documents=["Upserted doc 1", "Upserted doc 2"],
    vectors=[[random.random() for _ in range(dimension)] for _ in range(2)],
    metadatas=[{"upserted": True}, {"upserted": True}]
)

# ============================================================================
# PART 6: DQL OPERATIONS - QUERY (VECTOR SIMILARITY SEARCH)
# ============================================================================

# 6.1 Basic vector similarity query
query_vector = vectors[0]  # Query with first document's vector
results = collection.query(
    query_embeddings=query_vector,
    n_results=3
)
print(f"Query results: {len(results)} items")

# 6.2 Query with metadata filter
results = collection.query(
    query_embeddings=query_vector,
    where={"category": {"$eq": "AI"}},
    n_results=5
)

# 6.3 Query with comparison operators
results = collection.query(
    query_embeddings=query_vector,
    where={"score": {"$gte": 90}},
    n_results=5
)

# 6.4 Query with $in operator
results = collection.query(
    query_embeddings=query_vector,
    where={"tag": {"$in": ["ml", "python", "neural"]}},
    n_results=5
)

# 6.5 Query with logical operators ($or)
results = collection.query(
    query_embeddings=query_vector,
    where={
        "$or": [
            {"category": {"$eq": "AI"}},
            {"tag": {"$eq": "python"}}
        ]
    },
    n_results=5
)

# 6.6 Query with logical operators ($and)
results = collection.query(
    query_embeddings=query_vector,
    where={
        "$and": [
            {"category": {"$eq": "AI"}},
            {"score": {"$gte": 90}}
        ]
    },
    n_results=5
)

# 6.7 Query with document filter
results = collection.query(
    query_embeddings=query_vector,
    where_document={"$contains": "machine learning"},
    n_results=5
)

# 6.8 Query with combined filters
results = collection.query(
    query_embeddings=query_vector,
    where={"category": {"$eq": "AI"}, "year": {"$gte": 2023}},
    where_document={"$contains": "learning"},
    n_results=5
)

# 6.9 Query with multiple vectors (batch query)
batch_vectors = [vectors[0], vectors[1]]
batch_results = collection.query(
    query_embeddings=batch_vectors,
    n_results=2
)

# 6.10 Query with specific fields
results = collection.query(
    query_embeddings=query_vector,
    include=["documents", "metadatas", "embeddings"],
    n_results=2
)

# ============================================================================
# PART 7: DQL OPERATIONS - GET (RETRIEVE BY IDS OR FILTERS)
# ============================================================================

# 7.1 Get by single ID
result = collection.get(ids=ids[0])

# 7.2 Get by multiple IDs
results = collection.get(ids=ids[:3])

# 7.3 Get by metadata filter
results = collection.get(
    where={"category": {"$eq": "AI"}},
    limit=5
)

# 7.4 Get with comparison operators
results = collection.get(
    where={"score": {"$gte": 90}},
    limit=5
)

# 7.5 Get with $in operator
results = collection.get(
    where={"tag": {"$in": ["ml", "python"]}},
    limit=5
)

# 7.6 Get with logical operators
results = collection.get(
    where={
        "$or": [
            {"category": {"$eq": "AI"}},
            {"category": {"$eq": "Programming"}}
        ]
    },
    limit=5
)

# 7.7 Get by document filter
results = collection.get(
    where_document={"$contains": "Python"},
    limit=5
)

# 7.8 Get with pagination
results_page1 = collection.get(limit=2, offset=0)
results_page2 = collection.get(limit=2, offset=2)

# 7.9 Get with specific fields
results = collection.get(
    ids=ids[:2],
    include=["documents", "metadatas", "embeddings"]
)

# 7.10 Get up to 100 items (enough to cover everything this script inserted)
all_results = collection.get(limit=100)

# ============================================================================
# PART 8: DQL OPERATIONS - HYBRID SEARCH
# ============================================================================

# 8.1 Hybrid search with full-text and vector search
# Note: This requires query_embeddings to be provided directly
# In real usage, you might have an embedding function
hybrid_results = collection.hybrid_search(
    query={
        "where_document": {"$contains": "machine learning"},
        "where": {"category": {"$eq": "AI"}},
        "n_results": 10
    },
    knn={
        "query_embeddings": [vectors[0]],
        "where": {"year": {"$gte": 2022}},
        "n_results": 10
    },
    rank={"rrf": {}},  # Reciprocal Rank Fusion
    n_results=5,
    include=["documents", "metadatas"]
)
print(f"Hybrid search: {len(hybrid_results.get('ids', []))} results")

# ============================================================================
# PART 9: DML OPERATIONS - DELETE DATA
# ============================================================================

# 9.1 Delete by IDs
delete_ids = [vector_only_ids[0], new_id]
collection.delete(ids=delete_ids)

# 9.2 Delete by metadata filter
collection.delete(where={"type": {"$eq": "vector_only"}})

# 9.3 Delete by document filter
collection.delete(where_document={"$contains": "Updated document"})

# 9.4 Delete with combined filters
collection.delete(
    where={"category": {"$eq": "CV"}},
    where_document={"$contains": "vision"}
)

# ============================================================================
# PART 10: COLLECTION INFORMATION
# ============================================================================

# 10.1 Get collection count
count = collection.count()
print(f"Collection count: {count} items")

# 10.2 Get collection description
info = collection.describe()
print(f"Collection info: {info}")

# 10.3 Preview first few items in collection
preview = collection.peek(limit=5)
print(f"Preview: {len(preview)} items")
for item in preview:
    print(f"  ID: {item._id}, Document: {item.document}")

# 10.4 Count collections in database
collection_count = client.count_collection()
print(f"Database has {collection_count} collections")

# ============================================================================
# PART 11: CLEANUP
# ============================================================================

# Delete the secondary test collection; a failure is reported, not raised
try:
    client.delete_collection("another_collection")
except Exception as e:
    print(f"Could not delete 'another_collection': {e}")

# Delete the main collection (comment this line out to keep the data)
client.delete_collection(collection_name)
|
@@ -0,0 +1,113 @@
|
|
"""
Simple Example: Basic usage of the pyseekdb client

This example demonstrates the most common operations:
1. Create a client connection
2. Create a collection
3. Add data to the collection
4. Query the collection
5. Print query results
6. Delete the collection

This is a minimal example to get you started quickly.
"""
import random
import uuid  # only needed for the optional UUID-based ids shown in Step 3

# The wheel installs the package as ``pyseekdb``; no ``seekdbclient``
# module is shipped, so that is the name that must be imported.
import pyseekdb

# ==================== Step 1: Create Client Connection ====================
# You can use embedded mode, server mode, or OceanBase mode
# For this example, we'll use server mode (you can change to embedded or OceanBase)

# Server mode (connecting to remote SeekDB server)
client = pyseekdb.Client(
    host="127.0.0.1",
    port=2881,
    database="test",
    user="root",
    password=""
)

# Alternative: Embedded mode (local SeekDB)
# client = pyseekdb.Client(
#     path="./seekdb",
#     database="test"
# )

# Alternative: OceanBase mode
# client = pyseekdb.OBClient(
#     host="127.0.0.1",
#     port=11402,
#     tenant="mysql",
#     database="test",
#     user="root",
#     password=""
# )

# ==================== Step 2: Create a Collection ====================
# A collection is like a table that stores documents with vector embeddings
collection_name = "my_simple_collection"
dimension = 128  # Vector dimension (must match your embedding model)

# Create collection
collection = client.create_collection(
    name=collection_name,
    dimension=dimension
)

# ==================== Step 3: Add Data to Collection ====================
# Generate some sample data
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Python is a popular programming language",
    "Vector databases enable semantic search",
    "Neural networks are inspired by the human brain",
    "Natural language processing helps computers understand text"
]

# Generate simple vectors (in real usage, you would use an embedding model)
# For demonstration, we'll create random vectors
random.seed(42)  # For reproducibility

# One random vector of length `dimension` per document
vectors = [[random.random() for _ in range(dimension)] for _ in documents]

# Fixed, human-readable IDs, one per document.
# For unique IDs use instead: ids = [str(uuid.uuid4()) for _ in documents]
ids = ["id1", "id2", "id3", "id4", "id5"]

# Add data to collection
collection.add(
    ids=ids,
    documents=documents,
    vectors=vectors,
    metadatas=[
        {"category": "AI", "index": 0},
        {"category": "Programming", "index": 1},
        {"category": "Database", "index": 2},
        {"category": "AI", "index": 3},
        {"category": "NLP", "index": 4}
    ]
)

# ==================== Step 4: Query the Collection ====================
# Create a query vector (in real usage, you would embed your query text)
# For demonstration, we reuse the exact vector stored for the first document
query_vector = vectors[0]

# Perform vector similarity search
results = collection.query(
    query_embeddings=query_vector,
    n_results=3  # Return top 3 most similar documents
)


# ==================== Step 5: Print Query Results ====================
print(f"Query results: {len(results)} items found")
for i, item in enumerate(results, 1):
    print(f"Result {i}: ID={item._id}, Distance={item.distance:.4f}, Document={item.document[:50]}...")

# ==================== Step 6: Delete the Collection ====================
client.delete_collection(collection_name)
|
File without changes
|