pyseekdb 0.1.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyseekdb/__init__.py +90 -0
- pyseekdb/client/__init__.py +324 -0
- pyseekdb/client/admin_client.py +202 -0
- pyseekdb/client/base_connection.py +82 -0
- pyseekdb/client/client_base.py +1921 -0
- pyseekdb/client/client_oceanbase_server.py +258 -0
- pyseekdb/client/client_seekdb_embedded.py +324 -0
- pyseekdb/client/client_seekdb_server.py +226 -0
- pyseekdb/client/collection.py +485 -0
- pyseekdb/client/database.py +55 -0
- pyseekdb/client/filters.py +357 -0
- pyseekdb/client/meta_info.py +15 -0
- pyseekdb/client/query_result.py +122 -0
- pyseekdb/client/sql_utils.py +48 -0
- pyseekdb/examples/comprehensive_example.py +412 -0
- pyseekdb/examples/simple_example.py +113 -0
- pyseekdb/tests/__init__.py +0 -0
- pyseekdb/tests/test_admin_database_management.py +307 -0
- pyseekdb/tests/test_client_creation.py +425 -0
- pyseekdb/tests/test_collection_dml.py +652 -0
- pyseekdb/tests/test_collection_get.py +550 -0
- pyseekdb/tests/test_collection_hybrid_search.py +1126 -0
- pyseekdb/tests/test_collection_query.py +428 -0
- pyseekdb-0.1.0.dev3.dist-info/LICENSE +202 -0
- pyseekdb-0.1.0.dev3.dist-info/METADATA +856 -0
- pyseekdb-0.1.0.dev3.dist-info/RECORD +27 -0
- pyseekdb-0.1.0.dev3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,856 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pyseekdb
|
|
3
|
+
Version: 0.1.0.dev3
|
|
4
|
+
Summary: A unified Python client for SeekDB that supports embedded, server, and OceanBase connection modes with vector database capabilities.
|
|
5
|
+
Home-page: https://github.com/oceanbase/pyseekdb/
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Author: SeekDBClient Team
|
|
8
|
+
Author-email: open_oceanbase@oceanbase.com
|
|
9
|
+
Requires-Python: >=3.8,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Requires-Dist: pymysql (>=1.1.1,<2.0.0)
|
|
19
|
+
Requires-Dist: seekdb
|
|
20
|
+
Project-URL: Bug Tracker, https://github.com/oceanbase/pyseekdb/issues
|
|
21
|
+
Project-URL: Repository, https://github.com/oceanbase/pyseekdb/
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# SeekDBClient
|
|
25
|
+
|
|
26
|
+
SeekDBClient is a unified Python client that wraps three database connection modes—embedded SeekDB, remote SeekDB servers, and OceanBase—behind a single, concise API.
|
|
27
|
+
|
|
28
|
+
## Table of Contents
|
|
29
|
+
|
|
30
|
+
1. [Installation](#installation)
|
|
31
|
+
2. [Client Connection](#1-client-connection)
|
|
32
|
+
3. [AdminClient Connection and Database Management](#2-adminclient-connection-and-database-management)
|
|
33
|
+
4. [Collection (Table) Management](#3-collection-table-management)
|
|
34
|
+
5. [DML Operations](#4-dml-operations)
|
|
35
|
+
6. [DQL Operations](#5-dql-operations)
|
|
36
|
+
7. [Testing](#testing)
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# install from pypi
|
|
42
|
+
pip install pyseekdb
|
|
43
|
+
# install from source code
|
|
44
|
+
poetry install
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 1. Client Connection
|
|
48
|
+
|
|
49
|
+
The `Client` class provides a unified interface for connecting to SeekDB in different modes. It automatically selects the appropriate connection mode based on the parameters provided.
|
|
50
|
+
|
|
51
|
+
### 1.1 Embedded SeekDB Client
|
|
52
|
+
|
|
53
|
+
Connect to a local embedded SeekDB instance:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import pyseekdb as seekdbclient
|
|
57
|
+
|
|
58
|
+
# Create embedded client
|
|
59
|
+
client = seekdbclient.Client(
|
|
60
|
+
path="./seekdb", # Path to SeekDB data directory
|
|
61
|
+
database="demo" # Database name
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Execute SQL queries
|
|
65
|
+
rows = client.execute("SELECT 1")
|
|
66
|
+
print(rows)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### 1.2 Remote SeekDB Server Client
|
|
70
|
+
|
|
71
|
+
Connect to a remote SeekDB server:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import pyseekdb as seekdbclient
|
|
75
|
+
|
|
76
|
+
# Create server client
|
|
77
|
+
client = seekdbclient.Client(
|
|
78
|
+
host="127.0.0.1", # Server host
|
|
79
|
+
port=2881, # Server port (default: 2881)
|
|
80
|
+
database="demo", # Database name
|
|
81
|
+
user="root", # Username (default: "root")
|
|
82
|
+
password="" # Password
|
|
83
|
+
)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 1.3 OceanBase Client
|
|
87
|
+
|
|
88
|
+
Connect to OceanBase database:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import pyseekdb as seekdbclient
|
|
92
|
+
|
|
93
|
+
# Create OceanBase client
|
|
94
|
+
client = seekdbclient.OBClient(
|
|
95
|
+
host="127.0.0.1", # Server host
|
|
96
|
+
port=11402, # OceanBase port
|
|
97
|
+
tenant="mysql", # Tenant name
|
|
98
|
+
database="test", # Database name
|
|
99
|
+
user="root", # Username
|
|
100
|
+
password="" # Password
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### 1.4 Client Methods and Properties
|
|
105
|
+
|
|
106
|
+
| Method / Property | Description |
|
|
107
|
+
|-----------------------|----------------------------------------------------------------|
|
|
108
|
+
| `execute(sql)` | Execute SQL statement and return cursor results (commits automatically when needed) |
|
|
109
|
+
| `is_connected()` | Check whether an underlying connection is active |
|
|
110
|
+
| `get_raw_connection()`| Access the underlying seekdb / pymysql connection |
|
|
111
|
+
| `mode` | Returns the concrete client class name (`SeekdbEmbeddedClient`, `SeekdbServerClient`, or `OceanBaseServerClient`) |
|
|
112
|
+
| `create_collection()` | Create a new collection (see Collection Management) |
|
|
113
|
+
| `get_collection()` | Get an existing collection object |
|
|
114
|
+
| `delete_collection()` | Delete a collection |
|
|
115
|
+
| `list_collections()` | List all collections in the current database |
|
|
116
|
+
| `has_collection()` | Check if a collection exists |
|
|
117
|
+
| `get_or_create_collection()` | Get an existing collection or create it if it doesn't exist |
|
|
118
|
+
| `count_collection()` | Count the number of collections in the current database |
|
|
119
|
+
|
|
120
|
+
**Note:** The `Client` factory function returns a proxy that only exposes collection operations. For database management operations, use `AdminClient` (see section 2).
|
|
121
|
+
|
|
122
|
+
## 2. AdminClient Connection and Database Management
|
|
123
|
+
|
|
124
|
+
The `AdminClient` class provides database management operations. It uses the same connection modes as `Client` but only exposes database management methods.
|
|
125
|
+
|
|
126
|
+
### 2.1 Embedded/Server AdminClient
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
import pyseekdb as seekdbclient
|
|
130
|
+
|
|
131
|
+
# Embedded mode - Database management
|
|
132
|
+
admin = seekdbclient.AdminClient(path="./seekdb")
|
|
133
|
+
|
|
134
|
+
# Server mode - Database management
|
|
135
|
+
admin = seekdbclient.AdminClient(
|
|
136
|
+
host="127.0.0.1",
|
|
137
|
+
port=2881,
|
|
138
|
+
user="root",
|
|
139
|
+
password=""
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
# Use context manager
|
|
143
|
+
with seekdbclient.AdminClient(host="127.0.0.1", port=2881, user="root") as admin:
|
|
144
|
+
# Create database
|
|
145
|
+
admin.create_database("my_database")
|
|
146
|
+
|
|
147
|
+
# List all databases
|
|
148
|
+
databases = admin.list_databases()
|
|
149
|
+
for db in databases:
|
|
150
|
+
print(f"Database: {db.name}")
|
|
151
|
+
|
|
152
|
+
# Get database information
|
|
153
|
+
db = admin.get_database("my_database")
|
|
154
|
+
print(f"Database: {db.name}, Charset: {db.charset}")
|
|
155
|
+
|
|
156
|
+
# Delete database
|
|
157
|
+
admin.delete_database("my_database")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### 2.2 OceanBase AdminClient
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
import pyseekdb as seekdbclient
|
|
164
|
+
|
|
165
|
+
# OceanBase mode - Database management (multi-tenant)
|
|
166
|
+
admin = seekdbclient.OBAdminClient(
|
|
167
|
+
host="127.0.0.1",
|
|
168
|
+
port=11402,
|
|
169
|
+
tenant="mysql", # Tenant name
|
|
170
|
+
user="root",
|
|
171
|
+
password=""
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
# Use context manager
|
|
175
|
+
with seekdbclient.OBAdminClient(
|
|
176
|
+
host="127.0.0.1",
|
|
177
|
+
port=11402,
|
|
178
|
+
tenant="mysql",
|
|
179
|
+
user="root"
|
|
180
|
+
) as admin:
|
|
181
|
+
# Create database in tenant
|
|
182
|
+
admin.create_database("analytics", tenant="mysql")
|
|
183
|
+
|
|
184
|
+
# List databases in tenant
|
|
185
|
+
databases = admin.list_databases(tenant="mysql")
|
|
186
|
+
for db in databases:
|
|
187
|
+
print(f"Database: {db.name}, Tenant: {db.tenant}")
|
|
188
|
+
|
|
189
|
+
# Get database
|
|
190
|
+
db = admin.get_database("analytics", tenant="mysql")
|
|
191
|
+
|
|
192
|
+
# Delete database
|
|
193
|
+
admin.delete_database("analytics", tenant="mysql")
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### 2.3 AdminClient Methods
|
|
197
|
+
|
|
198
|
+
| Method | Description |
|
|
199
|
+
|---------------------------|----------------------------------------------------|
|
|
200
|
+
| `create_database(name, tenant=DEFAULT_TENANT)` | Create a new database (tenant ignored for embedded/server mode) |
|
|
201
|
+
| `get_database(name, tenant=DEFAULT_TENANT)` | Get database object with metadata (tenant ignored for embedded/server mode) |
|
|
202
|
+
| `delete_database(name, tenant=DEFAULT_TENANT)` | Delete a database (tenant ignored for embedded/server mode) |
|
|
203
|
+
| `list_databases(limit=None, offset=None, tenant=DEFAULT_TENANT)` | List all databases with optional pagination (tenant ignored for embedded/server mode) |
|
|
204
|
+
|
|
205
|
+
**Parameters:**
|
|
206
|
+
- `name` (str): Database name
|
|
207
|
+
- `tenant` (str, optional): Tenant name; defaults to `DEFAULT_TENANT` (used in OceanBase mode, ignored for embedded/server mode)
|
|
208
|
+
- `limit` (int, optional): Maximum number of results to return
|
|
209
|
+
- `offset` (int, optional): Number of results to skip for pagination
|
|
210
|
+
|
|
211
|
+
**Note:**
|
|
212
|
+
- Embedded/Server mode: No tenant concept (tenant=None in Database objects)
|
|
213
|
+
- OceanBase mode: Multi-tenant architecture (tenant is set in Database objects)
|
|
214
|
+
|
|
215
|
+
### 2.4 Database Object
|
|
216
|
+
|
|
217
|
+
The `get_database()` and `list_databases()` methods return `Database` objects with the following properties:
|
|
218
|
+
|
|
219
|
+
- `name` (str): Database name
|
|
220
|
+
- `tenant` (str, optional): Tenant name (None for embedded/server mode)
|
|
221
|
+
- `charset` (str, optional): Character set
|
|
222
|
+
- `collation` (str, optional): Collation
|
|
223
|
+
- `metadata` (dict): Additional metadata
|
|
224
|
+
|
|
225
|
+
## 3. Collection (Table) Management
|
|
226
|
+
|
|
227
|
+
Collections are the primary data structures in SeekDBClient, similar to tables in traditional databases. Each collection stores documents with vector embeddings, metadata, and full-text search capabilities.
|
|
228
|
+
|
|
229
|
+
### 3.1 Creating a Collection
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
import pyseekdb as seekdbclient
|
|
233
|
+
|
|
234
|
+
# Create a client
|
|
235
|
+
client = seekdbclient.Client(host="127.0.0.1", port=2881, database="test")
|
|
236
|
+
|
|
237
|
+
# Create a collection with vector dimension
|
|
238
|
+
collection = client.create_collection(
|
|
239
|
+
name="my_collection",
|
|
240
|
+
dimension=128 # Vector dimension (required)
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Get or create collection (creates if doesn't exist)
|
|
244
|
+
collection = client.get_or_create_collection(
|
|
245
|
+
name="my_collection",
|
|
246
|
+
dimension=128
|
|
247
|
+
)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### 3.2 Getting a Collection
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
# Get an existing collection
|
|
254
|
+
collection = client.get_collection("my_collection")
|
|
255
|
+
|
|
256
|
+
# Check if collection exists
|
|
257
|
+
if client.has_collection("my_collection"):
|
|
258
|
+
collection = client.get_collection("my_collection")
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### 3.3 Listing Collections
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# List all collections
|
|
265
|
+
collections = client.list_collections()
|
|
266
|
+
for coll in collections:
|
|
267
|
+
print(f"Collection: {coll.name}, Dimension: {coll.dimension}")
|
|
268
|
+
|
|
269
|
+
# Count collections in database
|
|
270
|
+
collection_count = client.count_collection()
|
|
271
|
+
print(f"Database has {collection_count} collections")
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### 3.4 Deleting a Collection
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
# Delete a collection
|
|
278
|
+
client.delete_collection("my_collection")
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### 3.5 Collection Properties
|
|
282
|
+
|
|
283
|
+
Each `Collection` object has the following properties:
|
|
284
|
+
|
|
285
|
+
- `name` (str): Collection name
|
|
286
|
+
- `id` (str, optional): Collection unique identifier
|
|
287
|
+
- `dimension` (int, optional): Vector dimension
|
|
288
|
+
- `metadata` (dict): Collection metadata
|
|
289
|
+
- `client`: Reference to the client that created it
|
|
290
|
+
|
|
291
|
+
## 4. DML Operations
|
|
292
|
+
|
|
293
|
+
DML (Data Manipulation Language) operations allow you to insert, update, and delete data in collections.
|
|
294
|
+
|
|
295
|
+
### 4.1 Add Data
|
|
296
|
+
|
|
297
|
+
The `add()` method inserts new records into a collection. If a record with the same ID already exists, an error will be raised.
|
|
298
|
+
|
|
299
|
+
```python
|
|
300
|
+
# Add single item
|
|
301
|
+
collection.add(
|
|
302
|
+
ids="item1",
|
|
303
|
+
vectors=[0.1, 0.2, 0.3],
|
|
304
|
+
documents="This is a document",
|
|
305
|
+
metadatas={"category": "AI", "score": 95}
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
# Add multiple items
|
|
309
|
+
collection.add(
|
|
310
|
+
ids=["item1", "item2", "item3"],
|
|
311
|
+
vectors=[
|
|
312
|
+
[0.1, 0.2, 0.3],
|
|
313
|
+
[0.4, 0.5, 0.6],
|
|
314
|
+
[0.7, 0.8, 0.9]
|
|
315
|
+
],
|
|
316
|
+
documents=[
|
|
317
|
+
"Document 1",
|
|
318
|
+
"Document 2",
|
|
319
|
+
"Document 3"
|
|
320
|
+
],
|
|
321
|
+
metadatas=[
|
|
322
|
+
{"category": "AI", "score": 95},
|
|
323
|
+
{"category": "ML", "score": 88},
|
|
324
|
+
{"category": "DL", "score": 92}
|
|
325
|
+
]
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
# Add with only vectors
|
|
329
|
+
collection.add(
|
|
330
|
+
ids=["vec1", "vec2"],
|
|
331
|
+
vectors=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
# Add with only documents (no vectors)
|
|
335
|
+
collection.add(
|
|
336
|
+
ids="doc1",
|
|
337
|
+
documents="Text document without vector"
|
|
338
|
+
)
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
**Parameters:**
|
|
342
|
+
- `ids` (str or List[str]): Single ID or list of IDs (required)
|
|
343
|
+
- `vectors` (List[float] or List[List[float]], optional): Single vector or list of vectors
|
|
344
|
+
- `documents` (str or List[str], optional): Single document or list of documents
|
|
345
|
+
- `metadatas` (dict or List[dict], optional): Single metadata dict or list of metadata dicts
|
|
346
|
+
|
|
347
|
+
### 4.2 Update Data
|
|
348
|
+
|
|
349
|
+
The `update()` method updates existing records in a collection. Records must exist, otherwise an error will be raised.
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
# Update single item
|
|
353
|
+
collection.update(
|
|
354
|
+
ids="item1",
|
|
355
|
+
metadatas={"category": "AI", "score": 98} # Update metadata only
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
# Update multiple items
|
|
359
|
+
collection.update(
|
|
360
|
+
ids=["item1", "item2"],
|
|
361
|
+
vectors=[[0.9, 0.8, 0.7], [0.6, 0.5, 0.4]], # Update vectors
|
|
362
|
+
documents=["Updated document 1", "Updated document 2"] # Update documents
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
# Update specific fields
|
|
366
|
+
collection.update(
|
|
367
|
+
ids="item1",
|
|
368
|
+
documents="New document text" # Only update document
|
|
369
|
+
)
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
**Parameters:**
|
|
373
|
+
- `ids` (str or List[str]): Single ID or list of IDs to update (required)
|
|
374
|
+
- `vectors` (List[float] or List[List[float]], optional): New vectors
|
|
375
|
+
- `documents` (str or List[str], optional): New documents
|
|
376
|
+
- `metadatas` (dict or List[dict], optional): New metadata
|
|
377
|
+
|
|
378
|
+
### 4.3 Upsert Data
|
|
379
|
+
|
|
380
|
+
The `upsert()` method inserts new records or updates existing ones. If a record with the given ID exists, it will be updated; otherwise, a new record will be inserted.
|
|
381
|
+
|
|
382
|
+
```python
|
|
383
|
+
# Upsert single item (insert or update)
|
|
384
|
+
collection.upsert(
|
|
385
|
+
ids="item1",
|
|
386
|
+
vectors=[0.1, 0.2, 0.3],
|
|
387
|
+
documents="Document text",
|
|
388
|
+
metadatas={"category": "AI", "score": 95}
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
# Upsert multiple items
|
|
392
|
+
collection.upsert(
|
|
393
|
+
ids=["item1", "item2", "item3"],
|
|
394
|
+
vectors=[
|
|
395
|
+
[0.1, 0.2, 0.3],
|
|
396
|
+
[0.4, 0.5, 0.6],
|
|
397
|
+
[0.7, 0.8, 0.9]
|
|
398
|
+
],
|
|
399
|
+
documents=["Doc 1", "Doc 2", "Doc 3"],
|
|
400
|
+
metadatas=[
|
|
401
|
+
{"category": "AI"},
|
|
402
|
+
{"category": "ML"},
|
|
403
|
+
{"category": "DL"}
|
|
404
|
+
]
|
|
405
|
+
)
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
**Parameters:**
|
|
409
|
+
- `ids` (str or List[str]): Single ID or list of IDs (required)
|
|
410
|
+
- `vectors` (List[float] or List[List[float]], optional): Vectors
|
|
411
|
+
- `documents` (str or List[str], optional): Documents
|
|
412
|
+
- `metadatas` (dict or List[dict], optional): Metadata
|
|
413
|
+
|
|
414
|
+
### 4.4 Delete Data
|
|
415
|
+
|
|
416
|
+
The `delete()` method removes records from a collection. You can delete by IDs, metadata filters, or document filters.
|
|
417
|
+
|
|
418
|
+
```python
|
|
419
|
+
# Delete by IDs
|
|
420
|
+
collection.delete(ids=["item1", "item2", "item3"])
|
|
421
|
+
|
|
422
|
+
# Delete by single ID
|
|
423
|
+
collection.delete(ids="item1")
|
|
424
|
+
|
|
425
|
+
# Delete by metadata filter
|
|
426
|
+
collection.delete(where={"category": {"$eq": "AI"}})
|
|
427
|
+
|
|
428
|
+
# Delete by comparison operator
|
|
429
|
+
collection.delete(where={"score": {"$lt": 50}})
|
|
430
|
+
|
|
431
|
+
# Delete by document filter
|
|
432
|
+
collection.delete(where_document={"$contains": "obsolete"})
|
|
433
|
+
|
|
434
|
+
# Delete with combined filters
|
|
435
|
+
collection.delete(
|
|
436
|
+
where={"category": {"$eq": "AI"}},
|
|
437
|
+
where_document={"$contains": "deprecated"}
|
|
438
|
+
)
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
**Parameters:**
|
|
442
|
+
- `ids` (str or List[str], optional): Single ID or list of IDs to delete
|
|
443
|
+
- `where` (dict, optional): Metadata filter conditions (see Filter Operators section)
|
|
444
|
+
- `where_document` (dict, optional): Document filter conditions
|
|
445
|
+
|
|
446
|
+
**Note:** At least one of `ids`, `where`, or `where_document` must be provided.
|
|
447
|
+
|
|
448
|
+
## 5. DQL Operations
|
|
449
|
+
|
|
450
|
+
DQL (Data Query Language) operations allow you to retrieve data from collections using various query methods.
|
|
451
|
+
|
|
452
|
+
### 5.1 Query (Vector Similarity Search)
|
|
453
|
+
|
|
454
|
+
The `query()` method performs vector similarity search to find the most similar documents to the query vector(s).
|
|
455
|
+
|
|
456
|
+
```python
|
|
457
|
+
# Basic vector similarity query
|
|
458
|
+
results = collection.query(
|
|
459
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
460
|
+
n_results=3
|
|
461
|
+
)
|
|
462
|
+
|
|
463
|
+
# Iterate over results
|
|
464
|
+
for item in results:
|
|
465
|
+
print(f"ID: {item._id}, Distance: {item.distance}")
|
|
466
|
+
print(f"Document: {item.document}")
|
|
467
|
+
print(f"Metadata: {item.metadata}")
|
|
468
|
+
|
|
469
|
+
# Query with metadata filter
|
|
470
|
+
results = collection.query(
|
|
471
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
472
|
+
where={"category": {"$eq": "AI"}},
|
|
473
|
+
n_results=5
|
|
474
|
+
)
|
|
475
|
+
|
|
476
|
+
# Query with comparison operator
|
|
477
|
+
results = collection.query(
|
|
478
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
479
|
+
where={"score": {"$gte": 90}},
|
|
480
|
+
n_results=5
|
|
481
|
+
)
|
|
482
|
+
|
|
483
|
+
# Query with document filter
|
|
484
|
+
results = collection.query(
|
|
485
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
486
|
+
where_document={"$contains": "machine learning"},
|
|
487
|
+
n_results=5
|
|
488
|
+
)
|
|
489
|
+
|
|
490
|
+
# Query with combined filters
|
|
491
|
+
results = collection.query(
|
|
492
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
493
|
+
where={"category": {"$eq": "AI"}, "score": {"$gte": 90}},
|
|
494
|
+
where_document={"$contains": "machine"},
|
|
495
|
+
n_results=5
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
# Query with $in operator
|
|
499
|
+
results = collection.query(
|
|
500
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
501
|
+
where={"tag": {"$in": ["ml", "python"]}},
|
|
502
|
+
n_results=5
|
|
503
|
+
)
|
|
504
|
+
|
|
505
|
+
# Query with logical operators ($or)
|
|
506
|
+
results = collection.query(
|
|
507
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
508
|
+
where={
|
|
509
|
+
"$or": [
|
|
510
|
+
{"category": {"$eq": "AI"}},
|
|
511
|
+
{"tag": {"$eq": "python"}}
|
|
512
|
+
]
|
|
513
|
+
},
|
|
514
|
+
n_results=5
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
# Query with multiple vectors (batch query)
|
|
518
|
+
results = collection.query(
|
|
519
|
+
query_embeddings=[[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]],
|
|
520
|
+
n_results=2
|
|
521
|
+
)
|
|
522
|
+
# Returns List[QueryResult], one for each query vector
|
|
523
|
+
for i, result in enumerate(results):
|
|
524
|
+
print(f"Query {i}: {len(result)} results")
|
|
525
|
+
|
|
526
|
+
# Query with specific fields
|
|
527
|
+
results = collection.query(
|
|
528
|
+
query_embeddings=[1.0, 2.0, 3.0],
|
|
529
|
+
include=["documents", "metadatas", "embeddings"],
|
|
530
|
+
n_results=3
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
# Query by texts (will be embedded automatically if embedding function is available)
|
|
534
|
+
# Note: Currently requires query_embeddings to be provided directly
|
|
535
|
+
results = collection.query(
|
|
536
|
+
query_texts=["my query text"],
|
|
537
|
+
n_results=10
|
|
538
|
+
)
|
|
539
|
+
```
|
|
540
|
+
|
|
541
|
+
**Parameters:**
|
|
542
|
+
- `query_embeddings` (List[float] or List[List[float]], optional): Single vector or list of vectors for batch queries
|
|
543
|
+
- `query_texts` (str or List[str], optional): Query text(s) to be embedded (requires embedding function)
|
|
544
|
+
- `n_results` (int, optional): Number of similar results to return (default: 10)
|
|
545
|
+
- `where` (dict, optional): Metadata filter conditions (see Filter Operators section)
|
|
546
|
+
- `where_document` (dict, optional): Document content filter
|
|
547
|
+
- `include` (List[str], optional): List of fields to include: `["documents", "metadatas", "embeddings"]`
|
|
548
|
+
|
|
549
|
+
**Returns:**
|
|
550
|
+
- If a single vector/text is provided: a `QueryResult` object containing the query results
|
|
551
|
+
- If multiple vectors/texts are provided: a `List[QueryResult]`, one `QueryResult` for each query vector/text
|
|
552
|
+
|
|
553
|
+
**QueryResult Object:**
|
|
554
|
+
- Iterable: `for item in results: ...`
|
|
555
|
+
- Indexable: `results[0]` to get first item
|
|
556
|
+
- Each item has:
|
|
557
|
+
- `_id`: Record ID (always included)
|
|
558
|
+
- `document`: Document text (if included)
|
|
559
|
+
- `embedding`: Vector embedding (if included)
|
|
560
|
+
- `metadata`: Metadata dictionary (if included)
|
|
561
|
+
- `distance`: Similarity distance (always included for query)
|
|
562
|
+
|
|
563
|
+
### 5.2 Get (Retrieve by IDs or Filters)
|
|
564
|
+
|
|
565
|
+
The `get()` method retrieves documents from a collection without vector similarity search. It supports filtering by IDs, metadata, and document content.
|
|
566
|
+
|
|
567
|
+
```python
|
|
568
|
+
# Get by single ID
|
|
569
|
+
results = collection.get(ids="123")
|
|
570
|
+
|
|
571
|
+
# Get by multiple IDs
|
|
572
|
+
results = collection.get(ids=["1", "2", "3"])
|
|
573
|
+
|
|
574
|
+
# Get by metadata filter
|
|
575
|
+
results = collection.get(
|
|
576
|
+
where={"category": {"$eq": "AI"}},
|
|
577
|
+
limit=10
|
|
578
|
+
)
|
|
579
|
+
|
|
580
|
+
# Get by comparison operator
|
|
581
|
+
results = collection.get(
|
|
582
|
+
where={"score": {"$gte": 90}},
|
|
583
|
+
limit=10
|
|
584
|
+
)
|
|
585
|
+
|
|
586
|
+
# Get by $in operator
|
|
587
|
+
results = collection.get(
|
|
588
|
+
where={"tag": {"$in": ["ml", "python"]}},
|
|
589
|
+
limit=10
|
|
590
|
+
)
|
|
591
|
+
|
|
592
|
+
# Get by logical operators ($or)
|
|
593
|
+
results = collection.get(
|
|
594
|
+
where={
|
|
595
|
+
"$or": [
|
|
596
|
+
{"category": {"$eq": "AI"}},
|
|
597
|
+
{"tag": {"$eq": "python"}}
|
|
598
|
+
]
|
|
599
|
+
},
|
|
600
|
+
limit=10
|
|
601
|
+
)
|
|
602
|
+
|
|
603
|
+
# Get by document content filter
|
|
604
|
+
results = collection.get(
|
|
605
|
+
where_document={"$contains": "machine learning"},
|
|
606
|
+
limit=10
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
# Get with combined filters
|
|
610
|
+
results = collection.get(
|
|
611
|
+
where={"category": {"$eq": "AI"}},
|
|
612
|
+
where_document={"$contains": "machine"},
|
|
613
|
+
limit=10
|
|
614
|
+
)
|
|
615
|
+
|
|
616
|
+
# Get with pagination
|
|
617
|
+
results = collection.get(limit=2, offset=1)
|
|
618
|
+
|
|
619
|
+
# Get with specific fields
|
|
620
|
+
results = collection.get(
|
|
621
|
+
ids=["1", "2"],
|
|
622
|
+
include=["documents", "metadatas", "embeddings"]
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
# Get all data (up to limit)
|
|
626
|
+
results = collection.get(limit=100)
|
|
627
|
+
```
|
|
628
|
+
|
|
629
|
+
**Parameters:**
|
|
630
|
+
- `ids` (str or List[str], optional): Single ID or list of IDs to retrieve
|
|
631
|
+
- `where` (dict, optional): Metadata filter conditions (see Filter Operators section)
|
|
632
|
+
- `where_document` (dict, optional): Document content filter using `$contains` for full-text search
|
|
633
|
+
- `limit` (int, optional): Maximum number of results to return
|
|
634
|
+
- `offset` (int, optional): Number of results to skip for pagination
|
|
635
|
+
- `include` (List[str], optional): List of fields to include: `["documents", "metadatas", "embeddings"]`
|
|
636
|
+
|
|
637
|
+
**Returns:**
|
|
638
|
+
- If single ID provided: `QueryResult` object containing get results for that ID
|
|
639
|
+
- If multiple IDs provided: `List[QueryResult]` objects, one for each ID
|
|
640
|
+
- If filters provided (no IDs): `QueryResult` object containing all matching results
|
|
641
|
+
|
|
642
|
+
**Note:** If no parameters are provided, `get()` returns all data (up to `limit`).
|
|
643
|
+
|
|
644
|
+
### 5.3 Hybrid Search
|
|
645
|
+
|
|
646
|
+
The `hybrid_search()` method combines full-text search and vector similarity search with ranking.
|
|
647
|
+
|
|
648
|
+
```python
|
|
649
|
+
# Hybrid search with both full-text and vector search
|
|
650
|
+
results = collection.hybrid_search(
|
|
651
|
+
query={
|
|
652
|
+
"where_document": {"$contains": "machine learning"},
|
|
653
|
+
"where": {"category": {"$eq": "science"}},
|
|
654
|
+
"n_results": 10
|
|
655
|
+
},
|
|
656
|
+
knn={
|
|
657
|
+
"query_texts": ["AI research"],
|
|
658
|
+
"where": {"year": {"$gte": 2020}},
|
|
659
|
+
"n_results": 10
|
|
660
|
+
},
|
|
661
|
+
rank={"rrf": {}}, # Reciprocal Rank Fusion
|
|
662
|
+
n_results=5,
|
|
663
|
+
include=["documents", "metadatas", "embeddings"]
|
|
664
|
+
)
|
|
665
|
+
```
|
|
666
|
+
|
|
667
|
+
**Parameters:**
|
|
668
|
+
- `query` (dict, optional): Full-text search configuration with:
|
|
669
|
+
- `where_document`: Document filter conditions
|
|
670
|
+
- `where`: Metadata filter conditions
|
|
671
|
+
- `n_results`: Number of results for full-text search
|
|
672
|
+
- `knn` (dict, optional): Vector search configuration with:
|
|
673
|
+
- `query_texts` or `query_embeddings`: Query vectors/texts
|
|
674
|
+
- `where`: Metadata filter conditions
|
|
675
|
+
- `n_results`: Number of results for vector search
|
|
676
|
+
- `rank` (dict, optional): Ranking configuration (e.g., `{"rrf": {"rank_window_size": 60, "rank_constant": 60}}`)
|
|
677
|
+
- `n_results` (int): Final number of results to return after ranking (default: 10)
|
|
678
|
+
- `include` (List[str], optional): Fields to include in results
|
|
679
|
+
|
|
680
|
+
**Returns:**
|
|
681
|
+
Search results dictionary containing ids, distances, metadatas, documents, embeddings, etc.
|
|
682
|
+
|
|
683
|
+
### 5.4 Filter Operators
|
|
684
|
+
|
|
685
|
+
#### Metadata Filters (`where` parameter)
|
|
686
|
+
|
|
687
|
+
- `$eq`: Equal to
|
|
688
|
+
```python
|
|
689
|
+
where={"category": {"$eq": "AI"}}
|
|
690
|
+
```
|
|
691
|
+
|
|
692
|
+
- `$ne`: Not equal to
|
|
693
|
+
```python
|
|
694
|
+
where={"status": {"$ne": "deleted"}}
|
|
695
|
+
```
|
|
696
|
+
|
|
697
|
+
- `$gt`: Greater than
|
|
698
|
+
```python
|
|
699
|
+
where={"score": {"$gt": 90}}
|
|
700
|
+
```
|
|
701
|
+
|
|
702
|
+
- `$gte`: Greater than or equal to
|
|
703
|
+
```python
|
|
704
|
+
where={"score": {"$gte": 90}}
|
|
705
|
+
```
|
|
706
|
+
|
|
707
|
+
- `$lt`: Less than
|
|
708
|
+
```python
|
|
709
|
+
where={"score": {"$lt": 50}}
|
|
710
|
+
```
|
|
711
|
+
|
|
712
|
+
- `$lte`: Less than or equal to
|
|
713
|
+
```python
|
|
714
|
+
where={"score": {"$lte": 50}}
|
|
715
|
+
```
|
|
716
|
+
|
|
717
|
+
- `$in`: Value in array
|
|
718
|
+
```python
|
|
719
|
+
where={"tag": {"$in": ["ml", "python", "ai"]}}
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
- `$nin`: Value not in array
|
|
723
|
+
```python
|
|
724
|
+
where={"tag": {"$nin": ["deprecated", "old"]}}
|
|
725
|
+
```
|
|
726
|
+
|
|
727
|
+
- `$or`: Logical OR
|
|
728
|
+
```python
|
|
729
|
+
where={
|
|
730
|
+
"$or": [
|
|
731
|
+
{"category": {"$eq": "AI"}},
|
|
732
|
+
{"tag": {"$eq": "python"}}
|
|
733
|
+
]
|
|
734
|
+
}
|
|
735
|
+
```
|
|
736
|
+
|
|
737
|
+
- `$and`: Logical AND
|
|
738
|
+
```python
|
|
739
|
+
where={
|
|
740
|
+
"$and": [
|
|
741
|
+
{"category": {"$eq": "AI"}},
|
|
742
|
+
{"score": {"$gte": 90}}
|
|
743
|
+
]
|
|
744
|
+
}
|
|
745
|
+
```
|
|
746
|
+
|
|
747
|
+
#### Document Filters (`where_document` parameter)
|
|
748
|
+
|
|
749
|
+
- `$contains`: Full-text search (contains substring)
|
|
750
|
+
```python
|
|
751
|
+
where_document={"$contains": "machine learning"}
|
|
752
|
+
```
|
|
753
|
+
|
|
754
|
+
- `$regex`: Regular expression matching (if supported)
|
|
755
|
+
```python
|
|
756
|
+
where_document={"$regex": "pattern.*"}
|
|
757
|
+
```
|
|
758
|
+
|
|
759
|
+
- `$or`: Logical OR for document filters
|
|
760
|
+
```python
|
|
761
|
+
where_document={
|
|
762
|
+
"$or": [
|
|
763
|
+
{"$contains": "machine learning"},
|
|
764
|
+
{"$contains": "artificial intelligence"}
|
|
765
|
+
]
|
|
766
|
+
}
|
|
767
|
+
```
|
|
768
|
+
|
|
769
|
+
- `$and`: Logical AND for document filters
|
|
770
|
+
```python
|
|
771
|
+
where_document={
|
|
772
|
+
"$and": [
|
|
773
|
+
{"$contains": "machine"},
|
|
774
|
+
{"$contains": "learning"}
|
|
775
|
+
]
|
|
776
|
+
}
|
|
777
|
+
```
|
|
778
|
+
|
|
779
|
+
### 5.5 Collection Information Methods
|
|
780
|
+
|
|
781
|
+
```python
|
|
782
|
+
# Get item count
|
|
783
|
+
count = collection.count()
|
|
784
|
+
print(f"Collection has {count} items")
|
|
785
|
+
|
|
786
|
+
# Get detailed collection information
|
|
787
|
+
info = collection.describe()
|
|
788
|
+
print(f"Name: {info['name']}, Dimension: {info['dimension']}")
|
|
789
|
+
|
|
790
|
+
# Preview first few items in collection
|
|
791
|
+
preview = collection.peek(limit=5)
|
|
792
|
+
for item in preview:
|
|
793
|
+
print(f"ID: {item._id}, Document: {item.document}")
|
|
794
|
+
|
|
795
|
+
# Count collections in database
|
|
796
|
+
collection_count = client.count_collection()
|
|
797
|
+
print(f"Database has {collection_count} collections")
|
|
798
|
+
```
|
|
799
|
+
|
|
800
|
+
**Methods:**
|
|
801
|
+
- `collection.count()` - Get the number of items in the collection
|
|
802
|
+
- `collection.describe()` - Get detailed collection information
|
|
803
|
+
- `collection.peek(limit=10)` - Quickly preview the first few items in the collection
|
|
804
|
+
- `client.count_collection()` - Count the number of collections in the current database
|
|
805
|
+
|
|
806
|
+
## Testing
|
|
807
|
+
|
|
808
|
+
```bash
|
|
809
|
+
# Run all tests
|
|
810
|
+
python3 -m pytest pyseekdb/tests/ -v
|
|
811
|
+
|
|
812
|
+
# Run tests with log output
|
|
813
|
+
python3 -m pytest pyseekdb/tests/ -v -s
|
|
814
|
+
|
|
815
|
+
# Run specific test
|
|
816
|
+
python3 -m pytest pyseekdb/tests/test_client_creation.py::TestClientCreation::test_create_server_client -v
|
|
817
|
+
|
|
818
|
+
# Run specific test file
|
|
819
|
+
python3 -m pytest pyseekdb/tests/test_client_creation.py -v
|
|
820
|
+
```
|
|
821
|
+
|
|
822
|
+
### Environment Variables (Optional)
|
|
823
|
+
|
|
824
|
+
`test_client_creation.py` honors the following overrides:
|
|
825
|
+
|
|
826
|
+
```bash
|
|
827
|
+
export SEEKDB_PATH=/data/seekdb
|
|
828
|
+
export SEEKDB_DATABASE=demo
|
|
829
|
+
export SERVER_HOST=127.0.0.1
|
|
830
|
+
export SERVER_PORT=2881 # SeekDB Server port
|
|
831
|
+
export SERVER_USER=root
|
|
832
|
+
export SERVER_PASSWORD=secret
|
|
833
|
+
export OB_HOST=127.0.0.1
|
|
834
|
+
export OB_PORT=11402 # OceanBase port
|
|
835
|
+
export OB_TENANT=mysql # OceanBase tenant
|
|
836
|
+
export OB_USER=root
|
|
837
|
+
export OB_PASSWORD=
|
|
838
|
+
```
|
|
839
|
+
|
|
840
|
+
## Architecture
|
|
841
|
+
|
|
842
|
+
- **ClientAPI**: Collection operations interface
|
|
843
|
+
- **AdminAPI**: Database operations interface
|
|
844
|
+
- **ServerAPI (BaseClient)**: Implements both interfaces
|
|
845
|
+
- **_ClientProxy**: Exposes only collection operations
|
|
846
|
+
- **_AdminClientProxy**: Exposes only database operations
|
|
847
|
+
|
|
848
|
+
```
|
|
849
|
+
Client() → _ClientProxy → BaseClient (ServerAPI)
|
|
850
|
+
AdminClient() → _AdminClientProxy → BaseClient (ServerAPI)
|
|
851
|
+
```
|
|
852
|
+
|
|
853
|
+
## License
|
|
854
|
+
|
|
855
|
+
This package is licensed under Apache 2.0.
|
|
856
|
+
|