erioon 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
erioon/functions.py DELETED
@@ -1,350 +0,0 @@
- import msgpack
- from azure.storage.blob import ContainerClient
- import json
- import uuid
- from io import BytesIO
- import datetime
- from threading import Thread
-
- def create_container_if_not_exists(container_name, container_url):
-     """
-     Checks if the Blob Storage container exists; if not, creates it.
-
-     Args:
-         container_name: Name of the container to check/create.
-         container_url: SAS URL to the blob container.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-     if not container_client.exists():
-         container_client.create_container()
-
- def get_shard_file_name(user_id_cont, database, collection, container_url, next_shard_number=False):
-     """
-     Determines the filename of the current (or next) shard MessagePack file for writing data.
-
-     The filename format is: {database}/{collection}/{collection}_{shard_number}.msgpack
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         container_url: Blob Storage container SAS URL.
-         next_shard_number: If True, returns filename for the next shard (increment shard number).
-
-     Returns:
-         Filename string of the shard to be used.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-
-     base_shard_name = f"{database}/{collection}/{collection}"
-
-     files = container_client.list_blobs(name_starts_with=base_shard_name)
-     existing_shards = [int(blob.name.split('_')[-1].split('.')[0]) for blob in files if blob.name.endswith('.msgpack')]
-
-     if existing_shards:
-         next_shard = max(existing_shards) + 1 if next_shard_number else max(existing_shards)
-     else:
-         next_shard = 1
-
-     return f"{base_shard_name}_{next_shard}.msgpack"
-
- def get_shard_limit(user_id_cont, database, collection, container_url):
-     """
-     Retrieves the maximum number of records allowed in a single shard from the
-     collection_settings.json file, or returns a default limit if file doesn't exist.
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         container_url: Blob Storage container SAS URL.
-
-     Returns:
-         Integer shard limit (default 100000).
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-     config_blob_client = container_client.get_blob_client(blob=f"{database}/{collection}/collection_settings.json")
-
-     if not config_blob_client.exists():
-         return 100000
-
-     config_data = json.loads(config_blob_client.download_blob().readall())
-     return config_data.get("shard_limit", 100000)
-
- def create_msgpack_file(user_id_cont, database, collection, data, container_url):
-     """
-     Writes the given record data into the appropriate MessagePack shard file.
-     Automatically manages shard rollover based on shard size limit.
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         data: The record data dict to store.
-         container_url: Blob Storage container SAS URL.
-
-     Returns:
-         The filename of the shard where the record was stored.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-
-     msgpack_filename = get_shard_file_name(user_id_cont, database, collection, container_url)
-
-     msgpack_blob_client = container_client.get_blob_client(blob=msgpack_filename)
-
-     existing_records = []
-     max_records_per_shard = get_shard_limit(user_id_cont, database, collection, container_url)
-
-     if msgpack_blob_client.exists():
-         with BytesIO(msgpack_blob_client.download_blob().readall()) as existing_file:
-             existing_records = msgpack.unpackb(existing_file.read(), raw=False)
-
-     if len(existing_records) >= max_records_per_shard:
-         msgpack_filename = get_shard_file_name(user_id_cont, database, collection, container_url, next_shard_number=True)
-         msgpack_blob_client = container_client.get_blob_client(blob=msgpack_filename)
-         existing_records = []
-
-     existing_records.append(data)
-
-     with BytesIO() as out_file:
-         out_file.write(msgpack.packb(existing_records, use_bin_type=True))
-         out_file.seek(0)
-         msgpack_blob_client.upload_blob(out_file, overwrite=True)
-
-     return msgpack_filename
-
- def save_logs(user_id_cont, database, collection, method, log_type, log_message, count, container_url):
-     """
-     Saves an individual log entry into logs.json inside the container.
-     Each log entry is keyed by a UUID and includes metadata and timestamp.
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         method: HTTP method or operation type (e.g. POST, GET).
-         log_type: Log type e.g. SUCCESS, ERROR.
-         log_message: Detailed message or data for the log.
-         count: Number of records affected or relevant.
-         container_url: Blob Storage container SAS URL.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-     blob_path = f"{database}/{collection}/logs.json"
-     index_blob_client = container_client.get_blob_client(blob=blob_path)
-
-     try:
-         existing_blob = index_blob_client.download_blob().readall()
-         logs_data = json.loads(existing_blob)
-     except Exception:
-         logs_data = {}
-
-     log_id = str(uuid.uuid4())
-
-     logs_data[log_id] = {
-         "timestamp": datetime.datetime.now().isoformat(),
-         "method": method.upper(),
-         "type": log_type.upper(),
-         "log": log_message,
-         "count": count
-     }
-
-     # Upload updated logs
-     index_blob_client.upload_blob(
-         data=json.dumps(logs_data, indent=2),
-         overwrite=True
-     )
-
- def async_log(user_id, db, collection, method, status, message, count, container_url):
-     """
-     Executes the save_logs function asynchronously in a separate thread,
-     allowing non-blocking log operations.
-
-     Args:
-         user_id: User identifier/context.
-         db: Database name.
-         collection: Collection name.
-         method: Operation method.
-         status: Log status (SUCCESS, ERROR, etc.).
-         message: Log message or data.
-         count: Number of affected records.
-         container_url: Blob Storage container SAS URL.
-     """
-     Thread(target=save_logs, args=(user_id, db, collection, method, status, message, count, container_url)).start()
-
- def generate_unique_id(existing_ids):
-     """
-     Generates a new UUID string that does not collide with any IDs in existing_ids.
-
-     Args:
-         existing_ids: Iterable of already existing _id strings.
-
-     Returns:
-         Unique UUID string not in existing_ids.
-     """
-     while True:
-         new_id = str(uuid.uuid4())
-         if new_id not in existing_ids:
-             return new_id
-
- def update_index_file_insert(user_id_cont, database, collection, record_id, shard_number, container_url):
-     """
-     Updates index.json to register a newly inserted record_id under the appropriate shard.
-
-     The index.json structure is a list of dicts mapping shard names to list of record IDs:
-     [
-         { "collection_1": ["id1", "id2", ...] },
-         { "collection_2": ["id3", "id4", ...] }
-     ]
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         record_id: The _id of the inserted record.
-         shard_number: The shard number where the record was stored.
-         container_url: Blob Storage container SAS URL.
-
-     Returns:
-         The record_id inserted.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-     index_blob_client = container_client.get_blob_client(blob=f"{database}/{collection}/index.json")
-
-     index_data = []
-
-     if index_blob_client.exists():
-         try:
-             index_data = json.loads(index_blob_client.download_blob().readall())
-         except Exception:
-             index_data = []
-
-     shard_key = f"{collection}_{shard_number}"
-     shard_found = False
-
-     for shard in index_data:
-         if shard_key in shard:
-             shard[shard_key].append(record_id)
-             shard_found = True
-             break
-
-     if not shard_found:
-         index_data.append({shard_key: [record_id]})
-
-     index_blob_client.upload_blob(json.dumps(index_data), overwrite=True)
-
-     return record_id
-
- def update_index_file_delete(user_id_cont, database, collection, record_id, shard_number, container_url):
-     """
-     Removes a record_id from the index.json under the correct shard upon deletion.
-
-     Cleans up empty shard entries after removal.
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         record_id: The _id of the deleted record.
-         shard_number: The shard number from which to remove the record.
-         container_url: Blob Storage container SAS URL.
-
-     Returns:
-         The record_id deleted.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-     index_blob_client = container_client.get_blob_client(blob=f"{database}/{collection}/index.json")
-
-     index_data = []
-
-     if index_blob_client.exists():
-         try:
-             index_data = json.loads(index_blob_client.download_blob().readall())
-         except Exception:
-             index_data = []
-
-     shard_key = f"{collection}_{shard_number}"
-
-     for shard in index_data:
-         if shard_key in shard:
-             if record_id in shard[shard_key]:
-                 shard[shard_key].remove(record_id)
-                 if not shard[shard_key]:
-                     index_data.remove(shard)
-             break
-
-     index_blob_client.upload_blob(json.dumps(index_data), overwrite=True)
-
-     return record_id
-
- def calculate_shard_number(user_id_cont, database, collection, container_url):
-     """
-     Determines the shard number for storing a new record.
-
-     Logic:
-     - Lists existing shard files in the collection directory.
-     - Extracts shard numbers from filenames.
-     - Returns the highest shard number found, or 1 if none found.
-
-     Args:
-         user_id_cont: User identifier/context.
-         database: Database name.
-         collection: Collection name.
-         container_url: Blob Storage container SAS URL.
-
-     Returns:
-         Integer shard number to use.
-     """
-     container_client = ContainerClient.from_container_url(container_url)
-
-     directory_path = f"{database}/{collection}/"
-     blob_list = container_client.list_blobs(name_starts_with=directory_path)
-
-     shard_numbers = []
-     for blob in blob_list:
-         try:
-             parts = blob.name.split("_")
-             if blob.name.endswith(".msgpack"):
-                 num = int(parts[1].split(".")[0])
-                 shard_numbers.append(num)
-         except Exception:
-             continue
-     if shard_numbers:
-         next_shard = max(shard_numbers)
-     else:
-         next_shard = 1
-     return next_shard
-
- def check_nested_key(data, key_path, value):
-     """
-     Recursively checks whether a nested key in a dictionary or list of dictionaries
-     matches the specified value.
-
-     Args:
-         data (dict or list): The data structure (dict or list of dicts) to search.
-         key_path (str): Dot-separated path to the nested key (e.g. "a.b.c").
-         value: The value to compare against.
-
-     Returns:
-         bool: True if the key exists at the nested path and equals the value, else False.
-     """
-     keys = key_path.split('.')
-
-     if not keys:
-         return False
-
-     current_key = keys[0]
-     remaining_keys = keys[1:]
-
-     if isinstance(data, dict):
-         if current_key in data:
-             if not remaining_keys:
-                 if data[current_key] == value:
-                     return True
-             else:
-                 return check_nested_key(data[current_key], '.'.join(remaining_keys), value)
-     elif isinstance(data, list):
-         for item in data:
-             if isinstance(item, dict):
-                 if check_nested_key(item, '.'.join(remaining_keys), value):
-                     return True
-     return False
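Note: the removed helpers above appear to form the insert path of a collection: pick or roll over a shard, append the record as MessagePack, register its _id in index.json, and log the operation. The snippet below is a minimal, hypothetical sketch of how they would be combined against the 0.1.1 package; the SAS URL, identifiers, and record contents are placeholders, not values from the package.

# Hypothetical usage of the removed helpers (erioon.functions as of 0.1.1); all values are placeholders.
from erioon.functions import (
    async_log,
    calculate_shard_number,
    create_msgpack_file,
    generate_unique_id,
    update_index_file_insert,
)

CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"  # placeholder SAS URL
USER, DB, COLL = "user_1", "mydb", "events"  # placeholder identifiers

record = {"_id": generate_unique_id(existing_ids=set()), "payload": {"value": 42}}

# Append the record to the current shard (create_msgpack_file rolls over to a new
# shard once the shard_limit from collection_settings.json is reached) ...
create_msgpack_file(USER, DB, COLL, record, CONTAINER_URL)

# ... then register the _id under that shard in index.json and log the insert.
shard_number = calculate_shard_number(USER, DB, COLL, CONTAINER_URL)
update_index_file_insert(USER, DB, COLL, record["_id"], shard_number, CONTAINER_URL)
async_log(USER, DB, COLL, "POST", "SUCCESS", "1 record inserted", 1, CONTAINER_URL)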
erioon/ping.py DELETED
@@ -1,37 +0,0 @@
- from erioon.functions import async_log
- from azure.storage.blob import ContainerClient
-
- def handle_connection_ping(user_id, db_id, coll_id, container_url):
-     """
-     Checks if a specific collection exists within a Blob Storage container
-     and logs the status of the connection attempt asynchronously.
-
-     Parameters:
-     - user_id (str): Identifier of the user making the request.
-     - db_id (str): Database identifier (used as a folder prefix).
-     - coll_id (str): Collection identifier (used as a folder prefix).
-     - container_url (str): URL of the Blob Storage container.
-
-     Returns:
-     - tuple(dict, int): A tuple containing a status dictionary and an HTTP status code.
-         - If the collection is found, returns status "OK" and HTTP 200.
-         - If the collection is missing, returns status "KO" with HTTP 404.
-         - On any exception, returns status "KO" with HTTP 500.
-     """
-     try:
-         container_client = ContainerClient.from_container_url(container_url)
-         directory_path = f"{db_id}/{coll_id}/"
-
-         blobs = container_client.list_blobs(name_starts_with=directory_path)
-         blob_names = [blob.name for blob in blobs]
-
-         if not blob_names:
-             async_log(user_id, db_id, coll_id, "PING", "ERROR", f"No collection {coll_id} found.", 1, container_url)
-             return {"status": "KO", "error": f"No collection {coll_id} found."}, 404
-
-         async_log(user_id, db_id, coll_id, "PING", "SUCCESS", "Connection successful", 1, container_url)
-         return {"status": "OK", "message": "Connection successful"}, 200
-
-     except Exception as e:
-         async_log(user_id, db_id, coll_id, "PING", "ERROR", f"Connection failed: {str(e)}", 1, container_url)
-         return {"status": "KO", "error": "Connection failed", "message": str(e)}, 500
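Note: the removed ping handler returned a (body, status_code) pair rather than raising. A minimal, hypothetical call against the 0.1.1 module could look like the sketch below; the SAS URL and identifiers are placeholders.

# Hypothetical connectivity check using the removed handler (erioon.ping as of 0.1.1).
from erioon.ping import handle_connection_ping

CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"  # placeholder

body, status_code = handle_connection_ping("user_1", "mydb", "events", CONTAINER_URL)
if status_code == 200:
    print(body["message"])   # "Connection successful"
else:
    print(body["error"])     # e.g. "No collection events found." or "Connection failed"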
erioon/read.py DELETED
@@ -1,241 +0,0 @@
- import io
- import msgpack
- from azure.storage.blob import ContainerClient
- from erioon.functions import async_log
- from sklearn.neighbors import KNeighborsClassifier
-
-
- def handle_get_all(user_id, db_id, coll_id, limit, container_url):
-     """
-     Retrieves up to a specified number of records from a collection stored in Blob Storage
-     and logs the operation status asynchronously.
-
-     Parameters:
-     - user_id (str): Identifier of the user making the request.
-     - db_id (str): Database identifier (used as the directory prefix).
-     - coll_id (str): Collection identifier (subdirectory under the database).
-     - limit (int): Maximum number of records to retrieve (must not exceed 1,000,000).
-     - container_url (str): URL to the Blob Storage container.
-
-     Behavior:
-     - Scans all blobs in the specified collection path (`db_id/coll_id/`).
-     - Reads shard files, each containing a list of records.
-     - Skips duplicate records by checking their `_id`.
-     - Stops reading once the record limit is reached.
-     - Skips empty or non-conforming blobs gracefully.
-
-     Returns:
-     - tuple(dict, int): A tuple containing:
-         - A status dictionary with:
-             - "status": "OK" or "KO"
-             - "count": number of records returned (0 if none)
-             - "results": list of records (only for successful responses)
-             - "error": error message (on failure)
-         - HTTP status code:
-             - 200 if data is successfully returned.
-             - 404 if collection is missing or no data found.
-             - 500 on unexpected errors.
-     """
-     if limit > 1_000_000:
-         async_log(user_id, db_id, coll_id, "GET", "ERROR", "Limit of 1,000,000 exceeded", 1, container_url)
-         return {"status": "KO", "count": 0, "error": "Limit of 1,000,000 exceeded"}, 404
-
-     directory_path = f"{db_id}/{coll_id}/"
-     container_client = ContainerClient.from_container_url(container_url)
-
-     blob_list = container_client.list_blobs(name_starts_with=directory_path)
-     blob_names = [blob.name for blob in blob_list]
-
-     if not blob_names:
-         async_log(user_id, db_id, coll_id, "GET", "ERROR", f"No collection {coll_id} found.", 1, container_url)
-         return {"status": "KO", "count": 0, "error": f"No collection {coll_id} found."}, 404
-
-     results = []
-     seen_ids = set()
-
-     for blob in blob_names:
-         try:
-             if blob.endswith(".msgpack"):
-                 blob_client = container_client.get_blob_client(blob)
-                 msgpack_data = blob_client.download_blob().readall()
-
-                 if not msgpack_data:
-                     continue
-
-                 with io.BytesIO(msgpack_data) as buffer:
-                     unpacked_data = msgpack.unpackb(buffer.read(), raw=False)
-                     if isinstance(unpacked_data, list):
-                         for record in unpacked_data:
-                             if record["_id"] in seen_ids:
-                                 continue
-
-                             results.append(record)
-                             seen_ids.add(record["_id"])
-
-                             if len(results) >= limit:
-                                 async_log(user_id, db_id, coll_id, "GET", "SUCCESS", f"OK", len(results), container_url)
-                                 return {"status": "OK", "count": len(results), "results": results}, 200
-
-         except Exception:
-             continue
-
-     if results:
-         async_log(user_id, db_id, coll_id, "GET", "SUCCESS", f"OK", len(results), container_url)
-         return {"status": "OK", "count": len(results), "results": results}, 200
-
-     async_log(user_id, db_id, coll_id, "GET", "ERROR", "No data found", 1, container_url)
-     return {"status": "KO", "count": 0, "error": "No data found"}, 404
-
-
- def handle_get_data(user_id, db_id, coll_id, search_criteria, limit, container_url):
-     """
-     Searches for records within a collection in Blob Storage that match specified search criteria,
-     and logs the query attempt asynchronously.
-
-     Parameters:
-     - user_id (str): Identifier of the user making the request.
-     - db_id (str): Database identifier (used as the directory prefix).
-     - coll_id (str): Collection identifier (subdirectory under the database).
-     - search_criteria (list[dict]): A list of key-value conditions to match (supports dot notation for nested keys).
-     - limit (int): Maximum number of matching records to return.
-     - container_url (str): URL to the Blob Storage container.
-
-     Behavior:
-     - Iterates over blobs in the collection path (`db_id/coll_id/`).
-     - Filters shard blobs containing lists of records.
-     - Each record is checked against all `search_criteria`.
-     - Supports nested key matching using dot notation (e.g., "user.name").
-     - Skips duplicates based on `_id`.
-     - Stops when enough matching records are found or blobs are exhausted.
-     - Handles and skips corrupted or unreadable blobs gracefully.
-
-     Returns:
-     - tuple(dict, int): A tuple containing:
-         - A status dictionary with:
-             - "status": "OK" or "KO"
-             - "count": number of records returned (0 if none)
-             - "results": list of records (only for successful responses)
-             - "error": error message (on failure)
-         - HTTP status code:
-             - 200 if matching records are found.
-             - 404 if the collection or matching data is not found.
-             - 500 on unexpected errors.
-     """
-     directory_path = f"{db_id}/{coll_id}/"
-     container_client = ContainerClient.from_container_url(container_url)
-
-     blob_list = container_client.list_blobs(name_starts_with=directory_path)
-     blob_names = [blob.name for blob in blob_list]
-
-     if not blob_names:
-         async_log(user_id, db_id, coll_id, "GET", "ERROR", f"No collection {coll_id} found.", 1, container_url)
-         return {"status": "KO", "count": 0, "error": f"No collection {coll_id} found."}, 404
-
-     results = []
-     seen_ids = set()
-
-     for blob in blob_names:
-         try:
-             if blob.endswith(".msgpack"):
-                 blob_client = container_client.get_blob_client(blob)
-                 msgpack_data = blob_client.download_blob().readall()
-
-                 if not msgpack_data:
-                     continue
-
-                 with io.BytesIO(msgpack_data) as buffer:
-                     unpacked_data = msgpack.unpackb(buffer.read(), raw=False)
-                     if isinstance(unpacked_data, list):
-                         for record in unpacked_data:
-                             if record["_id"] in seen_ids:
-                                 continue
-
-                             match_found = False
-                             for criteria in search_criteria:
-                                 print(criteria)
-                                 key, value = list(criteria.items())[0]
-
-                                 if key == "_id" and record.get("_id") == value:
-                                     match_found = True
-                                 else:
-                                     keys = key.split(".")
-                                     nested_value = record
-                                     for k in keys:
-                                         if isinstance(nested_value, dict) and k in nested_value:
-                                             nested_value = nested_value[k]
-                                         else:
-                                             nested_value = None
-                                             break
-                                     if nested_value == value:
-                                         match_found = True
-
-                                 if match_found:
-                                     results.append(record)
-                                     seen_ids.add(record["_id"])
-                                     if len(results) >= limit:
-                                         async_log(user_id, db_id, coll_id, "GET", "SUCCESS", "OK", len(results), container_url)
-                                         return {"status": "OK", "count": len(results), "results": results}, 200
-                                     break
-
-         except Exception:
-             continue
-
-     async_log(user_id, db_id, coll_id, "GET", "ERROR", "No matching record found", 1, container_url)
-     return {"status": "KO", "count": 0, "error": "No matching record found"}, 404
-
-
- def handle_classify_vector(user_id, db_id, coll_id, container_url, k=3):
-     # 1. Retrieve all data from collection
-     response, status = handle_get_all(user_id, db_id, coll_id, limit=10000, container_url=container_url)
-     if status != 200:
-         return {"status": "KO", "message": "Failed to fetch data for classification", "error": response.get("error", "")}, status
-
-     records = response.get("results", [])
-     if not records:
-         return {"status": "KO", "message": "No data found for classification"}, 404
-
-     # 2. Prepare dataset for classification
-     vectors = []
-     labels = []
-     unknown_vectors = []
-     unknown_ids = []
-
-     for rec in records:
-         vec = rec.get("vector")
-         meta = rec.get("metadata", {})
-         if not vec:
-             continue
-         vectors.append(vec)
-         labels.append(meta.get("class", "unknown"))
-
-     # If some records don’t have classes, you might want to separate them or remove them
-     known_vectors = []
-     known_labels = []
-     for v, l in zip(vectors, labels):
-         if l != "unknown" and l is not None:
-             known_vectors.append(v)
-             known_labels.append(l)
-         else:
-             unknown_vectors.append(v)
-             unknown_ids.append(rec.get("_id"))
-
-     if not known_vectors:
-         return {"status": "KO", "message": "No labeled data for training classification"}, 404
-
-     # 3. Train k-NN classifier on known labeled vectors
-     knn = KNeighborsClassifier(n_neighbors=k)
-     knn.fit(known_vectors, known_labels)
-
-     # 4. Predict class for unknown vectors (if any)
-     predictions = {}
-     if unknown_vectors:
-         predicted_labels = knn.predict(unknown_vectors)
-         for _id, pred in zip(unknown_ids, predicted_labels):
-             predictions[_id] = pred
-
-     # 5. Return predictions (or full classification result)
-     return {
-         "status": "OK",
-         "message": f"Classification done on {len(unknown_vectors)} unknown vectors",
-         "predictions": predictions,
-     }, 200
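Note: like the other removed modules, the read handlers returned (body, status_code) pairs. A minimal, hypothetical sketch of the 0.1.1 call patterns follows; the identifiers, search criteria, and record layout (a "vector" field plus a "metadata.class" label for classification) are placeholders inferred from the code above, not documented API.

# Hypothetical calls to the removed read handlers (erioon.read as of 0.1.1); all values are placeholders.
from erioon.read import handle_classify_vector, handle_get_all, handle_get_data

CONTAINER_URL = "https://<account>.blob.core.windows.net/<container>?<sas-token>"  # placeholder

# Fetch up to 100 records from mydb/events (deduplicated by _id across shards).
body, code = handle_get_all("user_1", "mydb", "events", limit=100, container_url=CONTAINER_URL)

# Return records where any single-key criterion matches; dot notation reaches nested keys.
body, code = handle_get_data(
    "user_1", "mydb", "events",
    search_criteria=[{"metadata.class": "spam"}, {"user.name": "alice"}],
    limit=10,
    container_url=CONTAINER_URL,
)

# Classify records whose metadata lacks a "class" label using k-NN over their "vector" fields.
body, code = handle_classify_vector("user_1", "mydb", "events", CONTAINER_URL, k=3)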