erioon 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- erioon/auth.py +1 -1
- erioon/client.py +73 -78
- erioon/collection.py +215 -127
- erioon/create.py +265 -0
- erioon/database.py +50 -19
- erioon/delete.py +257 -0
- erioon/functions.py +350 -0
- erioon/ping.py +37 -0
- erioon/read.py +241 -0
- erioon/update.py +123 -0
- {erioon-0.0.7.dist-info → erioon-0.0.9.dist-info}/METADATA +1 -1
- erioon-0.0.9.dist-info/RECORD +15 -0
- erioon-0.0.7.dist-info/RECORD +0 -9
- {erioon-0.0.7.dist-info → erioon-0.0.9.dist-info}/LICENSE +0 -0
- {erioon-0.0.7.dist-info → erioon-0.0.9.dist-info}/WHEEL +0 -0
- {erioon-0.0.7.dist-info → erioon-0.0.9.dist-info}/top_level.txt +0 -0
erioon/functions.py
ADDED
@@ -0,0 +1,350 @@
import msgpack
from azure.storage.blob import ContainerClient
import json
import uuid
from io import BytesIO
import datetime
from threading import Thread

def create_container_if_not_exists(container_name, container_url):
    """
    Checks if the Blob Storage container exists; if not, creates it.

    Args:
        container_name: Name of the container to check/create.
        container_url: SAS URL to the blob container.
    """
    container_client = ContainerClient.from_container_url(container_url)
    if not container_client.exists():
        container_client.create_container()

def get_shard_file_name(user_id_cont, database, collection, container_url, next_shard_number=False):
    """
    Determines the filename of the current (or next) shard MessagePack file for writing data.

    The filename format is: {database}/{collection}/{collection}_{shard_number}.msgpack

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        container_url: Blob Storage container SAS URL.
        next_shard_number: If True, returns filename for the next shard (increment shard number).

    Returns:
        Filename string of the shard to be used.
    """
    container_client = ContainerClient.from_container_url(container_url)

    base_shard_name = f"{database}/{collection}/{collection}"

    files = container_client.list_blobs(name_starts_with=base_shard_name)
    existing_shards = [int(blob.name.split('_')[-1].split('.')[0]) for blob in files if blob.name.endswith('.msgpack')]

    if existing_shards:
        next_shard = max(existing_shards) + 1 if next_shard_number else max(existing_shards)
    else:
        next_shard = 1

    return f"{base_shard_name}_{next_shard}.msgpack"

def get_shard_limit(user_id_cont, database, collection, container_url):
    """
    Retrieves the maximum number of records allowed in a single shard from the
    collection_settings.json file, or returns a default limit if the file doesn't exist.

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        container_url: Blob Storage container SAS URL.

    Returns:
        Integer shard limit (default 100000).
    """
    container_client = ContainerClient.from_container_url(container_url)
    config_blob_client = container_client.get_blob_client(blob=f"{database}/{collection}/collection_settings.json")

    if not config_blob_client.exists():
        return 100000

    config_data = json.loads(config_blob_client.download_blob().readall())
    return config_data.get("shard_limit", 100000)

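Taken together, get_shard_file_name and get_shard_limit make the shard size configurable per collection through collection_settings.json. Below is a minimal sketch of setting a custom limit and reading it back; the container URL is an assumed placeholder, not a real endpoint, and "mydb"/"mycoll" are illustrative names.

import json
from azure.storage.blob import ContainerClient
from erioon.functions import get_shard_limit

# Assumed placeholder SAS URL; substitute a container URL with write access.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"
DB, COLL = "mydb", "mycoll"

# Write a per-collection settings file with a smaller shard size.
container_client = ContainerClient.from_container_url(CONTAINER_URL)
settings_blob = container_client.get_blob_client(blob=f"{DB}/{COLL}/collection_settings.json")
settings_blob.upload_blob(json.dumps({"shard_limit": 50000}), overwrite=True)

# get_shard_limit now returns 50000 instead of the 100000 default.
print(get_shard_limit("user-ctx", DB, COLL, CONTAINER_URL))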
def create_msgpack_file(user_id_cont, database, collection, data, container_url):
    """
    Writes the given record data into the appropriate MessagePack shard file.
    Automatically manages shard rollover based on shard size limit.

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        data: The record data dict to store.
        container_url: Blob Storage container SAS URL.

    Returns:
        The filename of the shard where the record was stored.
    """
    container_client = ContainerClient.from_container_url(container_url)

    msgpack_filename = get_shard_file_name(user_id_cont, database, collection, container_url)

    msgpack_blob_client = container_client.get_blob_client(blob=msgpack_filename)

    existing_records = []
    max_records_per_shard = get_shard_limit(user_id_cont, database, collection, container_url)

    if msgpack_blob_client.exists():
        with BytesIO(msgpack_blob_client.download_blob().readall()) as existing_file:
            existing_records = msgpack.unpackb(existing_file.read(), raw=False)

    if len(existing_records) >= max_records_per_shard:
        msgpack_filename = get_shard_file_name(user_id_cont, database, collection, container_url, next_shard_number=True)
        msgpack_blob_client = container_client.get_blob_client(blob=msgpack_filename)
        existing_records = []

    existing_records.append(data)

    with BytesIO() as out_file:
        out_file.write(msgpack.packb(existing_records, use_bin_type=True))
        out_file.seek(0)
        msgpack_blob_client.upload_blob(out_file, overwrite=True)

    return msgpack_filename

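create_msgpack_file is the low-level write path: it appends one record to the newest shard and rolls over to a fresh shard once the limit is hit. A minimal sketch of a single write follows, assuming a valid SAS URL; the URL, database, and collection names are placeholders.

import uuid
from erioon.functions import create_container_if_not_exists, create_msgpack_file

# Assumed placeholder SAS URL and names; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"
DB, COLL = "mydb", "mycoll"

# Make sure the container behind the SAS URL exists before writing.
create_container_if_not_exists(COLL, CONTAINER_URL)

record = {"_id": str(uuid.uuid4()), "name": "Alice", "score": 42}

# Appends the record to the current shard (e.g. mydb/mycoll/mycoll_1.msgpack)
# and returns the shard filename it landed in.
shard_file = create_msgpack_file("user-ctx", DB, COLL, record, CONTAINER_URL)
print(shard_file)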
def save_logs(user_id_cont, database, collection, method, log_type, log_message, count, container_url):
    """
    Saves an individual log entry into logs.json inside the container.
    Each log entry is keyed by a UUID and includes metadata and timestamp.

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        method: HTTP method or operation type (e.g. POST, GET).
        log_type: Log type e.g. SUCCESS, ERROR.
        log_message: Detailed message or data for the log.
        count: Number of records affected or relevant.
        container_url: Blob Storage container SAS URL.
    """
    container_client = ContainerClient.from_container_url(container_url)
    blob_path = f"{database}/{collection}/logs.json"
    index_blob_client = container_client.get_blob_client(blob=blob_path)

    try:
        existing_blob = index_blob_client.download_blob().readall()
        logs_data = json.loads(existing_blob)
    except Exception:
        logs_data = {}

    log_id = str(uuid.uuid4())

    logs_data[log_id] = {
        "timestamp": datetime.datetime.now().isoformat(),
        "method": method.upper(),
        "type": log_type.upper(),
        "log": log_message,
        "count": count
    }

    # Upload updated logs
    index_blob_client.upload_blob(
        data=json.dumps(logs_data, indent=2),
        overwrite=True
    )

def async_log(user_id, db, collection, method, status, message, count, container_url):
    """
    Executes the save_logs function asynchronously in a separate thread,
    allowing non-blocking log operations.

    Args:
        user_id: User identifier/context.
        db: Database name.
        collection: Collection name.
        method: Operation method.
        status: Log status (SUCCESS, ERROR, etc.).
        message: Log message or data.
        count: Number of affected records.
        container_url: Blob Storage container SAS URL.
    """
    Thread(target=save_logs, args=(user_id, db, collection, method, status, message, count, container_url)).start()

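The read and ping helpers in this package report their outcomes through async_log, so logs.json accumulates one UUID-keyed entry per operation. A sketch of logging a custom event, and the rough shape of the entry that save_logs writes, assuming a placeholder SAS URL:

from erioon.functions import async_log

# Assumed placeholder SAS URL; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"

# Fire-and-forget: save_logs runs in a background thread.
async_log("user-ctx", "mydb", "mycoll", "POST", "SUCCESS", "Inserted 1 record", 1, CONTAINER_URL)

# Illustrative shape of the resulting entry in mydb/mycoll/logs.json (keyed by a random UUID):
# {
#   "3f2c0b9e-....": {
#     "timestamp": "2024-01-01T12:00:00.000000",
#     "method": "POST",
#     "type": "SUCCESS",
#     "log": "Inserted 1 record",
#     "count": 1
#   }
# }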
def generate_unique_id(existing_ids):
    """
    Generates a new UUID string that does not collide with any IDs in existing_ids.

    Args:
        existing_ids: Iterable of already existing _id strings.

    Returns:
        Unique UUID string not in existing_ids.
    """
    while True:
        new_id = str(uuid.uuid4())
        if new_id not in existing_ids:
            return new_id

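generate_unique_id mints an _id that is guaranteed not to collide with a set of known ids (for example, ids collected from index.json). A short, locally runnable example:

from erioon.functions import generate_unique_id

existing = {"a1", "b2", "c3"}          # e.g. ids already registered in index.json
new_id = generate_unique_id(existing)  # fresh UUID4 string, guaranteed not in `existing`
print(new_id)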
def update_index_file_insert(user_id_cont, database, collection, record_id, shard_number, container_url):
    """
    Updates index.json to register a newly inserted record_id under the appropriate shard.

    The index.json structure is a list of dicts mapping shard names to lists of record IDs:
    [
        { "collection_1": ["id1", "id2", ...] },
        { "collection_2": ["id3", "id4", ...] }
    ]

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        record_id: The _id of the inserted record.
        shard_number: The shard number where the record was stored.
        container_url: Blob Storage container SAS URL.

    Returns:
        The record_id inserted.
    """
    container_client = ContainerClient.from_container_url(container_url)
    index_blob_client = container_client.get_blob_client(blob=f"{database}/{collection}/index.json")

    index_data = []

    if index_blob_client.exists():
        try:
            index_data = json.loads(index_blob_client.download_blob().readall())
        except Exception:
            index_data = []

    shard_key = f"{collection}_{shard_number}"
    shard_found = False

    for shard in index_data:
        if shard_key in shard:
            shard[shard_key].append(record_id)
            shard_found = True
            break

    if not shard_found:
        index_data.append({shard_key: [record_id]})

    index_blob_client.upload_blob(json.dumps(index_data), overwrite=True)

    return record_id

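After a record lands in a shard, index.json has to be told about it; update_index_file_insert and update_index_file_delete keep that mapping in sync. The sketch below illustrates the bookkeeping around one insert, assuming the same placeholder SAS URL as above; how the packaged insert path in erioon/create.py actually wires these calls together is not shown in this diff excerpt, so treat this only as an illustration.

from erioon.functions import calculate_shard_number, create_msgpack_file, update_index_file_insert

# Assumed placeholder SAS URL; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"
DB, COLL = "mydb", "mycoll"

record = {"_id": "rec-001", "name": "Bob"}

# Write the record, then register its _id under the shard it was stored in.
create_msgpack_file("user-ctx", DB, COLL, record, CONTAINER_URL)
shard_number = calculate_shard_number("user-ctx", DB, COLL, CONTAINER_URL)
update_index_file_insert("user-ctx", DB, COLL, record["_id"], shard_number, CONTAINER_URL)

# index.json now contains something like:
# [ { "mycoll_1": ["rec-001"] } ]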
def update_index_file_delete(user_id_cont, database, collection, record_id, shard_number, container_url):
    """
    Removes a record_id from the index.json under the correct shard upon deletion.

    Cleans up empty shard entries after removal.

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        record_id: The _id of the deleted record.
        shard_number: The shard number from which to remove the record.
        container_url: Blob Storage container SAS URL.

    Returns:
        The record_id deleted.
    """
    container_client = ContainerClient.from_container_url(container_url)
    index_blob_client = container_client.get_blob_client(blob=f"{database}/{collection}/index.json")

    index_data = []

    if index_blob_client.exists():
        try:
            index_data = json.loads(index_blob_client.download_blob().readall())
        except Exception:
            index_data = []

    shard_key = f"{collection}_{shard_number}"

    for shard in index_data:
        if shard_key in shard:
            if record_id in shard[shard_key]:
                shard[shard_key].remove(record_id)
                if not shard[shard_key]:
                    index_data.remove(shard)
            break

    index_blob_client.upload_blob(json.dumps(index_data), overwrite=True)

    return record_id

def calculate_shard_number(user_id_cont, database, collection, container_url):
    """
    Determines the shard number for storing a new record.

    Logic:
    - Lists existing shard files in the collection directory.
    - Extracts shard numbers from filenames.
    - Returns the highest shard number found, or 1 if none found.

    Args:
        user_id_cont: User identifier/context.
        database: Database name.
        collection: Collection name.
        container_url: Blob Storage container SAS URL.

    Returns:
        Integer shard number to use.
    """
    container_client = ContainerClient.from_container_url(container_url)

    directory_path = f"{database}/{collection}/"
    blob_list = container_client.list_blobs(name_starts_with=directory_path)

    shard_numbers = []
    for blob in blob_list:
        try:
            parts = blob.name.split("_")
            if blob.name.endswith(".msgpack"):
                # Take the segment after the last underscore, matching how
                # get_shard_file_name builds and parses shard filenames.
                num = int(parts[-1].split(".")[0])
                shard_numbers.append(num)
        except Exception:
            continue
    if shard_numbers:
        next_shard = max(shard_numbers)
    else:
        next_shard = 1
    return next_shard

def check_nested_key(data, key_path, value):
    """
    Recursively checks whether a nested key in a dictionary or list of dictionaries
    matches the specified value.

    Args:
        data (dict or list): The data structure (dict or list of dicts) to search.
        key_path (str): Dot-separated path to the nested key (e.g. "a.b.c").
        value: The value to compare against.

    Returns:
        bool: True if the key exists at the nested path and equals the value, else False.
    """
    keys = key_path.split('.')

    if not keys:
        return False

    current_key = keys[0]
    remaining_keys = keys[1:]

    if isinstance(data, dict):
        if current_key in data:
            if not remaining_keys:
                if data[current_key] == value:
                    return True
            else:
                return check_nested_key(data[current_key], '.'.join(remaining_keys), value)
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                # A list does not consume a path segment, so each element is
                # checked against the full remaining key path.
                if check_nested_key(item, key_path, value):
                    return True
    return False
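check_nested_key evaluates a dot-notation condition against plain dicts, so it can be exercised locally without any storage access. A short example:

from erioon.functions import check_nested_key

record = {"user": {"name": "Alice", "address": {"city": "Tirana"}}}

print(check_nested_key(record, "user.name", "Alice"))           # True
print(check_nested_key(record, "user.address.city", "Tirana"))  # True
print(check_nested_key(record, "user.address.city", "Rome"))    # False
print(check_nested_key(record, "user.missing", "anything"))     # False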
erioon/ping.py
ADDED
@@ -0,0 +1,37 @@
from erioon.functions import async_log
from azure.storage.blob import ContainerClient

def handle_connection_ping(user_id, db_id, coll_id, container_url):
    """
    Checks if a specific collection exists within a Blob Storage container
    and logs the status of the connection attempt asynchronously.

    Parameters:
    - user_id (str): Identifier of the user making the request.
    - db_id (str): Database identifier (used as a folder prefix).
    - coll_id (str): Collection identifier (used as a folder prefix).
    - container_url (str): URL of the Blob Storage container.

    Returns:
    - tuple(dict, int): A tuple containing a status dictionary and an HTTP status code.
        - If the collection is found, returns status "OK" and HTTP 200.
        - If the collection is missing, returns status "KO" with HTTP 404.
        - On any exception, returns status "KO" with HTTP 500.
    """
    try:
        container_client = ContainerClient.from_container_url(container_url)
        directory_path = f"{db_id}/{coll_id}/"

        blobs = container_client.list_blobs(name_starts_with=directory_path)
        blob_names = [blob.name for blob in blobs]

        if not blob_names:
            async_log(user_id, db_id, coll_id, "PING", "ERROR", f"No collection {coll_id} found.", 1, container_url)
            return {"status": "KO", "error": f"No collection {coll_id} found."}, 404

        async_log(user_id, db_id, coll_id, "PING", "SUCCESS", "Connection successful", 1, container_url)
        return {"status": "OK", "message": "Connection successful"}, 200

    except Exception as e:
        async_log(user_id, db_id, coll_id, "PING", "ERROR", f"Connection failed: {str(e)}", 1, container_url)
        return {"status": "KO", "error": "Connection failed", "message": str(e)}, 500
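A minimal connectivity check, assuming a valid container SAS URL; the URL and names below are placeholders only.

from erioon.ping import handle_connection_ping

# Assumed placeholder SAS URL; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"

payload, status_code = handle_connection_ping("user-ctx", "mydb", "mycoll", CONTAINER_URL)
print(status_code, payload)  # e.g. 200 {'status': 'OK', 'message': 'Connection successful'}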
erioon/read.py
ADDED
@@ -0,0 +1,241 @@
import io
import msgpack
from azure.storage.blob import ContainerClient
from erioon.functions import async_log
from sklearn.neighbors import KNeighborsClassifier

def handle_get_all(user_id, db_id, coll_id, limit, container_url):
    """
    Retrieves up to a specified number of records from a collection stored in Blob Storage
    and logs the operation status asynchronously.

    Parameters:
    - user_id (str): Identifier of the user making the request.
    - db_id (str): Database identifier (used as the directory prefix).
    - coll_id (str): Collection identifier (subdirectory under the database).
    - limit (int): Maximum number of records to retrieve (must not exceed 1,000,000).
    - container_url (str): URL to the Blob Storage container.

    Behavior:
    - Scans all blobs in the specified collection path (`db_id/coll_id/`).
    - Reads shard files, each containing a list of records.
    - Skips duplicate records by checking their `_id`.
    - Stops reading once the record limit is reached.
    - Skips empty or non-conforming blobs gracefully.

    Returns:
    - tuple(dict, int): A tuple containing:
        - A status dictionary with:
            - "status": "OK" or "KO"
            - "count": number of records returned (0 if none)
            - "results": list of records (only for successful responses)
            - "error": error message (on failure)
        - HTTP status code:
            - 200 if data is successfully returned.
            - 404 if collection is missing or no data found.
            - 500 on unexpected errors.
    """
    if limit > 1_000_000:
        async_log(user_id, db_id, coll_id, "GET", "ERROR", "Limit of 1,000,000 exceeded", 1, container_url)
        return {"status": "KO", "count": 0, "error": "Limit of 1,000,000 exceeded"}, 404

    directory_path = f"{db_id}/{coll_id}/"
    container_client = ContainerClient.from_container_url(container_url)

    blob_list = container_client.list_blobs(name_starts_with=directory_path)
    blob_names = [blob.name for blob in blob_list]

    if not blob_names:
        async_log(user_id, db_id, coll_id, "GET", "ERROR", f"No collection {coll_id} found.", 1, container_url)
        return {"status": "KO", "count": 0, "error": f"No collection {coll_id} found."}, 404

    results = []
    seen_ids = set()

    for blob in blob_names:
        try:
            if blob.endswith(".msgpack"):
                blob_client = container_client.get_blob_client(blob)
                msgpack_data = blob_client.download_blob().readall()

                if not msgpack_data:
                    continue

                with io.BytesIO(msgpack_data) as buffer:
                    unpacked_data = msgpack.unpackb(buffer.read(), raw=False)
                    if isinstance(unpacked_data, list):
                        for record in unpacked_data:
                            if record["_id"] in seen_ids:
                                continue

                            results.append(record)
                            seen_ids.add(record["_id"])

                            if len(results) >= limit:
                                async_log(user_id, db_id, coll_id, "GET", "SUCCESS", "OK", len(results), container_url)
                                return {"status": "OK", "count": len(results), "results": results}, 200

        except Exception:
            continue

    if results:
        async_log(user_id, db_id, coll_id, "GET", "SUCCESS", "OK", len(results), container_url)
        return {"status": "OK", "count": len(results), "results": results}, 200

    async_log(user_id, db_id, coll_id, "GET", "ERROR", "No data found", 1, container_url)
    return {"status": "KO", "count": 0, "error": "No data found"}, 404

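A sketch of fetching the first few records from a collection, again with an assumed placeholder SAS URL and illustrative names:

from erioon.read import handle_get_all

# Assumed placeholder SAS URL; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"

payload, status_code = handle_get_all("user-ctx", "mydb", "mycoll", limit=10, container_url=CONTAINER_URL)
if status_code == 200:
    for record in payload["results"]:
        print(record["_id"], record)
else:
    print("read failed:", payload.get("error"))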
def handle_get_data(user_id, db_id, coll_id, search_criteria, limit, container_url):
    """
    Searches for records within a collection in Blob Storage that match specified search criteria,
    and logs the query attempt asynchronously.

    Parameters:
    - user_id (str): Identifier of the user making the request.
    - db_id (str): Database identifier (used as the directory prefix).
    - coll_id (str): Collection identifier (subdirectory under the database).
    - search_criteria (list[dict]): A list of key-value conditions to match (supports dot notation for nested keys).
    - limit (int): Maximum number of matching records to return.
    - container_url (str): URL to the Blob Storage container.

    Behavior:
    - Iterates over blobs in the collection path (`db_id/coll_id/`).
    - Filters shard blobs containing lists of records.
    - Each record is checked against all `search_criteria`.
    - Supports nested key matching using dot notation (e.g., "user.name").
    - Skips duplicates based on `_id`.
    - Stops when enough matching records are found or blobs are exhausted.
    - Handles and skips corrupted or unreadable blobs gracefully.

    Returns:
    - tuple(dict, int): A tuple containing:
        - A status dictionary with:
            - "status": "OK" or "KO"
            - "count": number of records returned (0 if none)
            - "results": list of records (only for successful responses)
            - "error": error message (on failure)
        - HTTP status code:
            - 200 if matching records are found.
            - 404 if the collection or matching data is not found.
            - 500 on unexpected errors.
    """
    directory_path = f"{db_id}/{coll_id}/"
    container_client = ContainerClient.from_container_url(container_url)

    blob_list = container_client.list_blobs(name_starts_with=directory_path)
    blob_names = [blob.name for blob in blob_list]

    if not blob_names:
        async_log(user_id, db_id, coll_id, "GET", "ERROR", f"No collection {coll_id} found.", 1, container_url)
        return {"status": "KO", "count": 0, "error": f"No collection {coll_id} found."}, 404

    results = []
    seen_ids = set()

    for blob in blob_names:
        try:
            if blob.endswith(".msgpack"):
                blob_client = container_client.get_blob_client(blob)
                msgpack_data = blob_client.download_blob().readall()

                if not msgpack_data:
                    continue

                with io.BytesIO(msgpack_data) as buffer:
                    unpacked_data = msgpack.unpackb(buffer.read(), raw=False)
                    if isinstance(unpacked_data, list):
                        for record in unpacked_data:
                            if record["_id"] in seen_ids:
                                continue

                            match_found = False
                            for criteria in search_criteria:
                                print(criteria)
                                key, value = list(criteria.items())[0]

                                if key == "_id" and record.get("_id") == value:
                                    match_found = True
                                else:
                                    keys = key.split(".")
                                    nested_value = record
                                    for k in keys:
                                        if isinstance(nested_value, dict) and k in nested_value:
                                            nested_value = nested_value[k]
                                        else:
                                            nested_value = None
                                            break
                                    if nested_value == value:
                                        match_found = True

                            if match_found:
                                results.append(record)
                                seen_ids.add(record["_id"])
                                if len(results) >= limit:
                                    async_log(user_id, db_id, coll_id, "GET", "SUCCESS", "OK", len(results), container_url)
                                    return {"status": "OK", "count": len(results), "results": results}, 200
                                    break

        except Exception:
            continue

    async_log(user_id, db_id, coll_id, "GET", "ERROR", "No matching record found", 1, container_url)
    return {"status": "KO", "count": 0, "error": "No matching record found"}, 404

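search_criteria is a list of single-key dicts, and a record is returned when at least one condition matches; dot notation reaches nested fields. A usage sketch, again with a placeholder SAS URL and illustrative names:

from erioon.read import handle_get_data

# Assumed placeholder SAS URL; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"

criteria = [
    {"_id": "rec-001"},               # direct _id lookup
    {"user.address.city": "Tirana"},  # nested field via dot notation
]

payload, status_code = handle_get_data("user-ctx", "mydb", "mycoll", criteria, limit=50, container_url=CONTAINER_URL)
print(status_code, payload.get("count"))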
def handle_classify_vector(user_id, db_id, coll_id, container_url, k=3):
    """
    Classifies unlabeled vector records in a collection using a k-NN model trained
    on the labeled ones.

    Each record is expected to carry a "vector" field and, for labeled records, a
    "metadata.class" value; records without a class are treated as unknown and
    receive a predicted class keyed by their _id.
    """
    # 1. Retrieve all data from collection
    response, status = handle_get_all(user_id, db_id, coll_id, limit=10000, container_url=container_url)
    if status != 200:
        return {"status": "KO", "message": "Failed to fetch data for classification", "error": response.get("error", "")}, status

    records = response.get("results", [])
    if not records:
        return {"status": "KO", "message": "No data found for classification"}, 404

    # 2. Prepare dataset for classification
    vectors = []
    labels = []
    ids = []
    unknown_vectors = []
    unknown_ids = []

    for rec in records:
        vec = rec.get("vector")
        meta = rec.get("metadata", {})
        if not vec:
            continue
        vectors.append(vec)
        labels.append(meta.get("class", "unknown"))
        ids.append(rec.get("_id"))

    # Records without a class are separated out and classified later
    known_vectors = []
    known_labels = []
    for v, l, rec_id in zip(vectors, labels, ids):
        if l != "unknown" and l is not None:
            known_vectors.append(v)
            known_labels.append(l)
        else:
            unknown_vectors.append(v)
            unknown_ids.append(rec_id)

    if not known_vectors:
        return {"status": "KO", "message": "No labeled data for training classification"}, 404

    # 3. Train k-NN classifier on known labeled vectors
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(known_vectors, known_labels)

    # 4. Predict class for unknown vectors (if any)
    predictions = {}
    if unknown_vectors:
        predicted_labels = knn.predict(unknown_vectors)
        for _id, pred in zip(unknown_ids, predicted_labels):
            predictions[_id] = pred

    # 5. Return predictions (or full classification result)
    return {
        "status": "OK",
        "message": f"Classification done on {len(unknown_vectors)} unknown vectors",
        "predictions": predictions,
    }, 200
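A sketch of running the classifier over a collection whose records carry a "vector" field and, for the training subset, a "metadata.class" label. The SAS URL is a placeholder and k=5 is chosen arbitrarily for illustration.

from erioon.read import handle_classify_vector

# Assumed placeholder SAS URL; not a real endpoint.
CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"

# Records in the collection might look like:
#   {"_id": "rec-001", "vector": [0.1, 0.9, 0.3], "metadata": {"class": "spam"}}  # labeled
#   {"_id": "rec-002", "vector": [0.2, 0.8, 0.4]}                                 # unlabeled -> predicted
payload, status_code = handle_classify_vector("user-ctx", "mydb", "mycoll", CONTAINER_URL, k=5)
if status_code == 200:
    print(payload["predictions"])  # e.g. {"rec-002": "spam"}
else:
    print("classification failed:", payload.get("message"))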