matrice-compute 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +9 -0
- matrice_compute/action_instance.py +1508 -0
- matrice_compute/actions_manager.py +226 -0
- matrice_compute/actions_scaledown_manager.py +57 -0
- matrice_compute/instance_manager.py +270 -0
- matrice_compute/instance_utils.py +707 -0
- matrice_compute/prechecks.py +538 -0
- matrice_compute/py.typed +0 -0
- matrice_compute/resources_tracker.py +478 -0
- matrice_compute/scaling.py +880 -0
- matrice_compute/shutdown_manager.py +314 -0
- matrice_compute/task_utils.py +77 -0
- matrice_compute-0.1.1.dist-info/METADATA +28 -0
- matrice_compute-0.1.1.dist-info/RECORD +17 -0
- matrice_compute-0.1.1.dist-info/WHEEL +5 -0
- matrice_compute-0.1.1.dist-info/licenses/LICENSE.txt +21 -0
- matrice_compute-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,880 @@
|
|
|
1
|
+
"""Module providing scaling functionality."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import logging
|
|
5
|
+
from matrice_common.utils import log_errors
|
|
6
|
+
from kafka import KafkaProducer, KafkaConsumer
|
|
7
|
+
import uuid
|
|
8
|
+
import json
|
|
9
|
+
import time
|
|
10
|
+
import base64
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Scaling:
|
|
14
|
+
|
|
15
|
+
"""Class providing scaling functionality for compute instances."""
|
|
16
|
+
|
|
17
|
+
def __init__(self, session, instance_id=None):
    """Initialize Scaling instance.

    Args:
        session: Session object for making RPC calls (must expose a ``rpc``
            attribute with get/put/post/delete methods).
        instance_id: ID of the compute instance.

    Raises:
        ValueError: If instance_id is not provided.
    """
    # Fail fast: every endpoint in this class embeds the instance id.
    if not instance_id:
        msg = "Instance id not set for this instance. Cannot perform the operation for job-scheduler without instance id"
        logging.error(msg)
        raise ValueError(msg)
    self.instance_id = instance_id
    self.session = session
    self.rpc = session.rpc
    # Ports already handed out are persisted in the USED_PORTS env var as a
    # comma-separated list; get_open_port() reads and rewrites it.
    used_ports_str = os.environ.get("USED_PORTS", "")
    self.used_ports = set(int(p) for p in used_ports_str.split(",") if p.strip())
    logging.info(
        "Initialized Scaling with instance_id: %s",
        instance_id,
    )
    # NOTE(review): get_kafka_bootstrap_servers() is decorated with
    # log_errors(default_return=(None, ...)); on failure the tuple default
    # would end up as "bootstrap_servers" here -- confirm the decorator's
    # failure semantics with matrice_common.utils.log_errors.
    self.kafka_config = {
        "bootstrap_servers": self.get_kafka_bootstrap_servers(),
        "api_request_topic": "action_requests",
        "api_response_topic": "action_responses",
        "scaling_request_topic": "compute_requests",
        "scaling_response_topic": "compute_responses"
    }
    # Producer is created eagerly at construction time; messages are
    # JSON-serialized before publishing.
    self.kafka_producer = KafkaProducer(
        bootstrap_servers=self.kafka_config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode("utf-8"),)
|
50
|
+
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@log_errors(default_return=(None, "Error creating Kafka producer", "Kafka producer creation failed"), log_error=True)
def get_kafka_bootstrap_servers(self):
    """Get Kafka bootstrap servers from API and decode base64 fields.

    Returns:
        The ``"ip:port"`` bootstrap string decoded from the base64-encoded
        fields in the API response.

    Raises:
        ValueError: If the API call fails or reports no success (handled by
            the ``log_errors`` decorator, which substitutes its default).
    """
    path = "/v1/actions/get_kafka_info"
    response = self.rpc.get(path=path)
    if not response or not response.get("success"):
        # Fix: the original called response.get(...) unconditionally, which
        # raised AttributeError (not the intended ValueError) when the RPC
        # returned None.
        detail = response.get("message", "No response") if response else "No response"
        raise ValueError(f"Failed to fetch Kafka config: {detail}")
    encoded_ip = response["data"]["ip"]
    encoded_port = response["data"]["port"]
    ip = base64.b64decode(encoded_ip).decode("utf-8")
    port = base64.b64decode(encoded_port).decode("utf-8")
    bootstrap_servers = f"{ip}:{port}"
    return bootstrap_servers
|
|
66
|
+
|
|
67
|
+
@log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
def handle_response(self, resp, success_message, error_message):
    """Normalize an API response into a (data, error, message) triple.

    Args:
        resp: Response dict from an API call.
        success_message: Message to log and return when ``resp["success"]``.
        error_message: Message to log and return otherwise.

    Returns:
        Tuple of (data, error, message).
    """
    data = resp.get("data")
    if resp.get("success"):
        error = None
        message = success_message
        logging.info(message)
    else:
        error = resp.get("message")
        message = error_message
        logging.error("%s: %s", message, error)
    return data, error, message
|
|
90
|
+
|
|
91
|
+
@log_errors(log_error=True)
def get_downscaled_ids(self):
    """Fetch the IDs of downscaled instances for this compute instance.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    logging.info("Getting downscaled ids for instance %s", self.instance_id)
    resp = self.rpc.get(path=f"/v1/scaling/down_scaled_ids/{self.instance_id}")
    return self.handle_response(
        resp,
        "Downscaled ids info fetched successfully",
        "Could not fetch the Downscaled ids info",
    )
|
|
109
|
+
|
|
110
|
+
@log_errors(default_return=(None, "API call failed", "Failed to stop instance"), log_error=True)
def stop_instance(self):
    """Request a non-forced stop of this compute instance.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    logging.info("Stopping instance %s", self.instance_id)
    resp = self.rpc.put(
        path="/v1/compute/compute_instance/stop",
        payload={"_idInstance": self.instance_id, "isForcedStop": False},
    )
    return self.handle_response(
        resp,
        "Instance stopped successfully",
        "Could not stop the instance",
    )
|
|
134
|
+
|
|
135
|
+
@log_errors(log_error=True)
def update_jupyter_token(
    self,
    token="",
):
    """Update the Jupyter notebook token registered for this instance.

    Args:
        token: New Jupyter notebook token; empty string presumably clears
            it server-side -- TODO confirm with the scaling service.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    path = f"/v1/scaling/update_jupyter_notebook_token/{self.instance_id}"
    payload = {
        "token": token,
    }
    resp = self.rpc.put(path=path, payload=payload)
    # Fix: the original log messages were copy-pasted from a resource-update
    # method; this endpoint updates the Jupyter token.
    return self.handle_response(
        resp,
        "Jupyter notebook token updated successfully",
        "Could not update the jupyter notebook token",
    )
|
|
150
|
+
|
|
151
|
+
@log_errors(log_error=True)
def update_action_status(
    self,
    service_provider="",
    action_record_id="",
    isRunning=True,
    status="",
    docker_start_time=None,
    action_duration=0,
    cpuUtilisation=0.0,
    gpuUtilisation=0.0,
    memoryUtilisation=0.0,
    gpuMemoryUsed=0,
    createdAt=None,
    updatedAt=None,
):
    """Report the runtime status and resource usage of an action.

    Args:
        service_provider: Provider of the service.
        action_record_id: ID of the action record being updated.
        isRunning: Whether the action is currently running.
        status: Current status string of the action.
        docker_start_time: Start time of the docker container.
        action_duration: How long the action has run.
        cpuUtilisation: CPU utilization percentage.
        gpuUtilisation: GPU utilization percentage.
        memoryUtilisation: Memory utilization percentage.
        gpuMemoryUsed: GPU memory in use.
        createdAt: Creation timestamp.
        updatedAt: Last update timestamp.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    logging.info("Updating action status for action %s", action_record_id)
    body = {
        "instanceID": self.instance_id,
        "serviceProvider": service_provider,
        "actionRecordId": action_record_id,
        "isRunning": isRunning,
        "status": status,
        "dockerContainerStartTime": docker_start_time,
        "cpuUtilisation": cpuUtilisation,
        "gpuUtilisation": gpuUtilisation,
        "memoryUtilisation": memoryUtilisation,
        "gpuMemoryUsed": gpuMemoryUsed,
        "actionDuration": action_duration,
        "createdAt": createdAt,
        "updatedAt": updatedAt,
    }
    resp = self.rpc.put(path="/v1/compute/update_action_status", payload=body)
    return self.handle_response(
        resp,
        "Action status details updated successfully",
        "Could not update the action status details ",
    )
|
|
212
|
+
|
|
213
|
+
@log_errors(log_error=True)
def update_status(
    self,
    action_record_id,
    action_type,
    service_name,
    stepCode,
    status,
    status_description,
):
    """Push a status update for an action record.

    Args:
        action_record_id: ID of the action record.
        action_type: Type of action.
        service_name: Name of the service.
        stepCode: Code indicating the step in the process.
        status: Status to set.
        status_description: Human-readable description of the status.
    """
    logging.info("Updating status for action %s", action_record_id)
    body = {
        "_id": action_record_id,
        "action": action_type,
        "serviceName": service_name,
        "stepCode": stepCode,
        "status": status,
        "statusDescription": status_description,
    }
    # Fire-and-forget: the response is intentionally not inspected here.
    self.rpc.put(path="/v1/actions", payload=body)
|
|
247
|
+
|
|
248
|
+
@log_errors(log_error=True)
def get_shutdown_details(self):
    """Fetch the shutdown configuration for this instance.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    logging.info("Getting shutdown details for instance %s", self.instance_id)
    resp = self.rpc.get(path=f"/v1/compute/get_shutdown_details/{self.instance_id}")
    return self.handle_response(
        resp,
        "Shutdown info fetched successfully",
        "Could not fetch the shutdown details",
    )
|
|
266
|
+
|
|
267
|
+
@log_errors(log_error=True)
def get_tasks_details(self):
    """Fetch the action/task details attached to this instance.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    logging.info("Getting tasks details for instance %s", self.instance_id)
    resp = self.rpc.get(
        path=f"/v1/actions/fetch_instance_action_details/{self.instance_id}/action_details"
    )
    return self.handle_response(
        resp,
        "Task details fetched successfully",
        "Could not fetch the task details",
    )
|
|
285
|
+
|
|
286
|
+
@log_errors(log_error=True)
def get_action_details(self, action_status_id):
    """Get details for a specific action: Kafka first, REST fallback, then cache.

    Args:
        action_status_id: ID of the action whose details are requested.

    Returns:
        Tuple of (data, error, message); on total failure the request is
        cached for retry.
    """
    logging.info("Getting action details for action %s", action_status_id)
    api = "get_action_details"
    payload = {"actionRecordId": action_status_id}
    # Attempt delivery over Kafka before touching the REST API.
    data, error, message, via_kafka = self._send_kafka_request(
        api=api,
        payload=payload,
        request_topic=self.kafka_config["api_request_topic"],
        response_topic=self.kafka_config["api_response_topic"],
        timeout=600
    )
    if via_kafka:
        return data, error, message

    # Kafka transport failed or timed out -- fall back to REST.
    try:
        resp = self.rpc.get(path=f"/v1/actions/action/{action_status_id}/details")
        return self.handle_response(
            resp,
            "Task details fetched successfully (REST fallback)",
            "Could not fetch the task details (REST fallback)",
        )
    except Exception as e:
        logging.error("REST fallback failed: %s", e)
        self._cache_failed_request(api, payload)
        return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
@log_errors(log_error=True)
def update_action(
    self,
    id="",
    step_code="",
    action_type="",
    status="",
    sub_action="",
    status_description="",
    service="",
    job_params=None,
):
    """Update an action record: Kafka first, REST fallback, then cache.

    Args:
        id: ID of the action record (parameter name kept for backward
            compatibility even though it shadows the builtin).
        step_code: Code indicating the step in the process.
        action_type: Type of action.
        status: Status to set.
        sub_action: Sub-action name.
        status_description: Description of the status.
        service: Service name.
        job_params: Optional dict of job parameters (defaults to empty).

    Returns:
        Tuple of (data, error, message); on total failure the request is
        cached for retry.
    """
    if job_params is None:
        job_params = {}
    logging.info("Updating action %s", id)
    api = "update_action"
    payload = {
        "_id": id,
        "stepCode": step_code,
        "action": action_type,
        "status": status,
        "subAction": sub_action,
        "statusDescription": status_description,
        "serviceName": service,
        "jobParams": job_params,
    }
    data, error, message, kafka_response_received = self._send_kafka_request(
        api=api,
        payload=payload,
        request_topic=self.kafka_config["api_request_topic"],
        response_topic=self.kafka_config["api_response_topic"],
        timeout=600
    )
    if kafka_response_received:
        return data, error, message
    try:
        path = "/v1/actions"
        resp = self.rpc.put(path=path, payload=payload)
        # Fix: the original messages ("Error logged successfully") were
        # copy-pasted from an error-logging method; this call updates an
        # action record.
        return self.handle_response(
            resp,
            "Action updated successfully (REST fallback)",
            "Could not update the action (REST fallback)",
        )
    except Exception as e:
        logging.error("REST fallback failed (update_action): %s", e)
        self._cache_failed_request(api, payload)
        return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
@log_errors(log_error=True)
def assign_jobs(self, is_gpu):
    """Request job assignment for this instance: Kafka first, REST fallback.

    Args:
        is_gpu: Whether this instance is a GPU instance.

    Returns:
        Tuple of (data, error, message); on total failure the request is
        cached for retry.
    """
    logging.info("Assigning jobs for instance %s (GPU: %s)", self.instance_id, is_gpu)
    api = "assign_jobs"
    payload = {"instanceID": self.instance_id, "isGPUInstance": is_gpu}

    data, error, message, via_kafka = self._send_kafka_request(
        api=api,
        payload=payload,
        request_topic=self.kafka_config["api_request_topic"],
        response_topic=self.kafka_config["api_response_topic"],
        timeout=600
    )
    if via_kafka:
        return data, error, message

    # Kafka transport failed or timed out -- fall back to the REST endpoint.
    try:
        resp = self.rpc.get(path=f"/v1/actions/assign_jobs/{str(is_gpu)}/{self.instance_id}")
        return self.handle_response(
            resp,
            "Pinged successfully (REST fallback)",
            "Could not ping the scaling jobs (REST fallback)",
        )
    except Exception as e:
        logging.error("REST fallback failed (assign_jobs): %s", e)
        self._cache_failed_request(api, payload)
        return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
@log_errors(log_error=True)
def update_available_resources(
    self,
    availableCPU=0,
    availableGPU=0,
    availableMemory=0,
    availableGPUMemory=0,
):
    """Publish this instance's free resources: Kafka first, REST fallback, then cache.

    Args:
        availableCPU: Free CPU capacity.
        availableGPU: Free GPU capacity.
        availableMemory: Free system memory.
        availableGPUMemory: Free GPU memory.

    Returns:
        Tuple of (data, error, message); on total failure the request is
        cached for retry.
    """
    logging.info("Updating available resources for instance %s", self.instance_id)
    payload = {
        "instance_id": self.instance_id,
        "availableMemory": availableMemory,
        "availableCPU": availableCPU,
        "availableGPUMemory": availableGPUMemory,
        "availableGPU": availableGPU,
    }
    api = "update_available_resources"
    # Fix: removed a dead local (`correlation_id = str(uuid.uuid4())`);
    # _send_kafka_request generates its own correlation id.
    data, error, message, kafka_response_received = self._send_kafka_request(
        api=api,
        payload=payload,
        request_topic=self.kafka_config["scaling_request_topic"],
        response_topic=self.kafka_config["scaling_response_topic"],
        timeout=600
    )

    if kafka_response_received:
        return data, error, message
    try:
        path = f"/v1/scaling/update_available_resources/{self.instance_id}"
        resp = self.rpc.put(path=path, payload=payload)
        return self.handle_response(
            resp,
            "Resources updated successfully (REST fallback)",
            "Could not update the resources (REST fallback)",
        )
    except Exception as e:
        logging.error("REST fallback failed (update_available_resources): %s", e)
        self._cache_failed_request(api, payload)
        return None, f"Failed to update available resources via Kafka and REST: {e}", "Cached for retry"
|
|
447
|
+
|
|
448
|
+
@log_errors(log_error=True)
def update_action_docker_logs(self, action_record_id, log_content):
    """Upload docker logs for an action: Kafka first, REST fallback, then cache.

    Args:
        action_record_id: ID of the action record the logs belong to.
        log_content: Raw log text to attach to the action.

    Returns:
        Tuple of (data, error, message); on total failure the request is
        cached for retry.
    """
    logging.info("Updating docker logs for action %s", action_record_id)
    api = "update_action_docker_logs"
    payload = {
        "actionRecordId": action_record_id,
        "logContent": log_content,
    }
    data, error, message, kafka_response_received = self._send_kafka_request(
        api=api,
        payload=payload,
        request_topic=self.kafka_config["api_request_topic"],
        response_topic=self.kafka_config["api_response_topic"],
        timeout=600
    )

    if kafka_response_received:
        return data, error, message
    try:
        path = "/v1/actions/update_action_docker_logs"
        resp = self.rpc.put(path=path, payload=payload)
        return self.handle_response(
            resp,
            "Docker logs updated successfully (REST fallback)",
            "Could not update the docker logs (REST fallback)",
        )
    except Exception as e:
        logging.error("REST fallback failed (update_action_docker_logs): %s", e)
        # Fix: the original except branch neither cached the request nor
        # returned a value (implicitly returning None), unlike every other
        # Kafka/REST method in this class.
        self._cache_failed_request(api, payload)
        return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
@log_errors(log_error=True)
def get_docker_hub_credentials(self):
    """Fetch Docker Hub credentials from the compute service.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    logging.info("Getting docker credentials")
    resp = self.rpc.get(path="/v1/compute/get_docker_hub_credentials")
    return self.handle_response(
        resp,
        "Docker credentials fetched successfully",
        "Could not fetch the docker credentials",
    )
|
|
495
|
+
|
|
496
|
+
@log_errors(log_error=True)
def get_open_ports_config(self):
    """Fetch the open-ports configuration for this instance.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(path=f"/v1/scaling/get_open_ports/{self.instance_id}")
    return self.handle_response(
        resp,
        "Open ports config fetched successfully",
        "Could not fetch the open ports config",
    )
|
|
510
|
+
|
|
511
|
+
@log_errors(default_return=None, log_error=True)
def get_open_port(self):
    """Pick an unused port from the configured range and mark it used.

    The chosen port is recorded in ``self.used_ports`` and persisted to the
    USED_PORTS environment variable so sibling objects see it.

    Returns:
        Port number if one is available, None otherwise.
    """
    # Default range, used whenever the remote config cannot be fetched.
    port_range = {"from": 8200, "to": 9000}
    try:
        data, err, _msg = self.get_open_ports_config()
        if not err and data and data[0]:
            port_range = data[0]
        else:
            logging.warning("Using default port range 8200-9000 due to config fetch error")
    except Exception as err:
        logging.warning(
            "Using default port range 8200-9000. Config fetch failed: %s",
            str(err),
        )
    min_port = port_range["from"]
    max_port = port_range["to"]
    # NOTE: upper bound is exclusive, matching the original behavior.
    free_port = next(
        (p for p in range(min_port, max_port) if p not in self.used_ports),
        None,
    )
    if free_port is None:
        logging.error(
            "No available ports found in range %s-%s",
            min_port,
            max_port,
        )
        return None
    self.used_ports.add(free_port)
    os.environ["USED_PORTS"] = ",".join(str(p) for p in self.used_ports)
    logging.info("Found available port: %s", free_port)
    return free_port
|
|
545
|
+
|
|
546
|
+
@log_errors(default_return="", log_error=False)
def get_data_processing_image(self):
    """Build the full ECR image name for the data-processing container.

    The environment segment comes from the ENV variable (default "prod").

    Returns:
        Full image name including repository and tag.
    """
    logging.info("Getting data processing image")
    env_name = os.environ.get('ENV', 'prod')
    return f"285699223019.dkr.ecr.us-west-2.amazonaws.com/{env_name}-data-processing:latest"
|
|
555
|
+
|
|
556
|
+
@log_errors(log_error=True)
def get_model_secret_keys(self, secret_name):
    """Fetch model secret keys by secret name.

    Args:
        secret_name: Name of the secret to look up.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(path=f"/v1/scaling/get_models_secret_keys?secret_name={secret_name}")
    return self.handle_response(
        resp,
        "Secret keys fetched successfully",
        "Could not fetch the secret keys",
    )
|
|
573
|
+
|
|
574
|
+
@log_errors(log_error=True)
def get_model_codebase(self, model_family_id):
    """Fetch the codebase download path for a model family.

    Args:
        model_family_id: ID of the model family.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(path=f"/v1/model_store/get_user_code_download_path/{model_family_id}")
    return self.handle_response(
        resp,
        "Codebase fetched successfully",
        "Could not fetch the codebase",
    )
|
|
591
|
+
|
|
592
|
+
@log_errors(log_error=True)
def get_model_codebase_requirements(self, model_family_id):
    """Fetch the requirements-file download path for a model family.

    Args:
        model_family_id: ID of the model family.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(
        path=f"/v1/model_store/get_user_requirements_download_path/{model_family_id}"
    )
    return self.handle_response(
        resp,
        "Codebase requirements fetched successfully",
        "Could not fetch the codebase requirements",
    )
|
|
609
|
+
|
|
610
|
+
@log_errors(log_error=True)
def get_model_codebase_script(self, model_family_id):
    """Fetch the user-script download path for a model family.

    Args:
        model_family_id: ID of the model family.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    # Fix: the original URL contained a stray ":" before the id
    # (".../get_user_script_download_path/:{model_family_id}"), which looks
    # like a leaked router placeholder; every sibling endpoint in this
    # class uses a plain "/{id}" segment.
    path = f"/v1/model_store/get_user_script_download_path/{model_family_id}"
    resp = self.rpc.get(path=path)
    return self.handle_response(
        resp,
        "Codebase script fetched successfully",
        "Could not fetch the codebase script",
    )
|
|
627
|
+
|
|
628
|
+
@log_errors(log_error=True)
def add_account_compute_instance(
    self,
    account_number,
    alias,
    service_provider,
    instance_type,
    shut_down_time,
    lease_type,
    launch_duration,
):
    """Register a new compute instance under an account.

    Args:
        account_number: Account number.
        alias: Instance alias.
        service_provider: Cloud service provider.
        instance_type: Type of instance.
        shut_down_time: Time at which to shut down.
        lease_type: Type of lease.
        launch_duration: Duration the instance should run.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    body = {
        "accountNumber": account_number,
        "alias": alias,
        "serviceProvider": service_provider,
        "instanceType": instance_type,
        "shutDownTime": shut_down_time,
        "leaseType": lease_type,
        "launchDuration": launch_duration,
    }
    resp = self.rpc.post(path="/v1/scaling/add_account_compute_instance", payload=body)
    return self.handle_response(
        resp,
        "Compute instance added successfully",
        "Could not add the compute instance",
    )
|
|
669
|
+
|
|
670
|
+
@log_errors(log_error=True)
def stop_account_compute(self, account_number, alias):
    """Stop the compute instance identified by account number and alias.

    Args:
        account_number: Account number.
        alias: Instance alias.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.put(path=f"/v1/scaling/stop_account_compute/{account_number}/{alias}")
    return self.handle_response(
        resp,
        "Compute instance stopped successfully",
        "Could not stop the compute instance",
    )
|
|
688
|
+
|
|
689
|
+
@log_errors(log_error=True)
def restart_account_compute(self, account_number, alias):
    """Restart the compute instance identified by account number and alias.

    Args:
        account_number: Account number.
        alias: Instance alias.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.put(path=f"/v1/scaling/restart_account_compute/{account_number}/{alias}")
    return self.handle_response(
        resp,
        "Compute instance restarted successfully",
        "Could not restart the compute instance",
    )
|
|
707
|
+
|
|
708
|
+
@log_errors(log_error=True)
def delete_account_compute(self, account_number, alias):
    """Delete the compute instance identified by account number and alias.

    Args:
        account_number: Account number.
        alias: Instance alias.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.delete(path=f"/v1/scaling/delete_account_compute/{account_number}/{alias}")
    return self.handle_response(
        resp,
        "Compute instance deleted successfully",
        "Could not delete the compute instance",
    )
|
|
726
|
+
|
|
727
|
+
@log_errors(log_error=True)
def get_all_instances_type(self):
    """Fetch every known instance type.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(path="/v1/scaling/get_all_instances_type")
    return self.handle_response(
        resp,
        "All instance types fetched successfully",
        "Could not fetch the instance types",
    )
|
|
741
|
+
|
|
742
|
+
@log_errors(log_error=True)
def get_compute_details(self):
    """Fetch the details of this compute instance.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(path=f"/v1/scaling/get_compute_details/{self.instance_id}")
    return self.handle_response(
        resp,
        "Compute details fetched successfully",
        "Could not fetch the compute details",
    )
|
|
756
|
+
|
|
757
|
+
@log_errors(log_error=True)
def get_user_access_key_pair(self, user_id):
    """Fetch a user's access key pair for this instance.

    Args:
        user_id: ID of the user.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(
        path=f"/v1/compute/get_user_access_key_pair/{user_id}/{self.instance_id}"
    )
    return self.handle_response(
        resp,
        "User access key pair fetched successfully",
        "Could not fetch the user access key pair",
    )
|
|
774
|
+
|
|
775
|
+
@log_errors(log_error=True)
def get_internal_api_key(self, action_id):
    """Fetch the internal API key for an action on this instance.

    Args:
        action_id: ID of the action.

    Returns:
        Tuple of (data, error, message) from the API response.
    """
    resp = self.rpc.get(
        path=f"/v1/scaling/get_internal_api_key/{action_id}/{self.instance_id}"
    )
    return self.handle_response(
        resp,
        "internal keys fetched successfully",
        "Could not fetch internal keys",
    )
|
|
792
|
+
|
|
793
|
+
@log_errors(log_error=True)
def handle_kafka_response(self, msg, success_message, error_message):
    """Normalize a Kafka response message into a (data, error, message) triple.

    Args:
        msg: Decoded Kafka message dict (expects "status"/"data"/"error" keys).
        success_message: Message to log and return when status is "success".
        error_message: Message to log and return otherwise.

    Returns:
        Tuple of (data, error, message).
    """
    data = msg.get("data")
    if msg.get("status") == "success":
        error = None
        message = success_message
        logging.info(message)
    else:
        error = msg.get("error", "Unknown error")
        message = error_message
        logging.error("%s: %s", message, error)
    return data, error, message
|
|
809
|
+
|
|
810
|
+
def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=600):
    """
    Helper to send a request to Kafka and wait for a response.
    Returns (data, error, message, kafka_response_received) where kafka_response_received is True if a response was received (even if error), False if transport error/timeout.
    """
    # Correlation id ties our request to the matching message on the
    # response topic; anything else on that topic is ignored.
    correlation_id = str(uuid.uuid4())
    request_message = {
        "correlationId": correlation_id,
        "api": api,
        "payload": payload,
    }

    # The consumer is created BEFORE the request is sent so that a fast
    # response published at offset "latest" is not missed.
    consumer = KafkaConsumer(
        response_topic,
        bootstrap_servers=self.kafka_config["bootstrap_servers"],
        group_id=None,
        value_deserializer=lambda m: json.loads(m.decode("utf-8")),
        auto_offset_reset='latest',
        enable_auto_commit=True,
    )

    try:
        # Forward the session's bearer token (if any) as a Kafka header so
        # the consumer side can authenticate the request.
        if hasattr(self.session.rpc, 'AUTH_TOKEN'):
            self.session.rpc.AUTH_TOKEN.set_bearer_token()
            auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
            auth_token = auth_token.replace("Bearer ", "")
            headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
        else:
            headers = None
        self.kafka_producer.send(request_topic, request_message, headers=headers)
        # self.kafka_producer.flush()
        logging.info("Sent %s request to Kafka topic %s", api, request_topic)
    except Exception as e:
        # Transport failure on the send path: report kafka_response_received
        # = False so the caller tries its REST fallback.
        # NOTE(review): the consumer created above is not closed on this
        # early return -- confirm whether this leaks a connection.
        logging.error("Kafka producer error: %s", e)
        return None, f"Kafka producer error: {e}", "Kafka send failed", False
    try:
        start = time.time()
        # NOTE(review): the timeout is only checked after each received
        # message; on a silent topic this loop can block past `timeout`
        # (KafkaConsumer has no consumer_timeout_ms set) -- confirm intended.
        for message in consumer:
            msg = message.value
            if msg.get("correlationId") == correlation_id:
                consumer.close()
                # Always treat a received response as final, even if error
                return self.handle_kafka_response(
                    msg,
                    f"Fetched via Kafka for {api}",
                    f"Kafka error response for {api}"
                ) + (True,)
            if time.time() - start > timeout:
                break
        consumer.close()
        logging.warning("Kafka response timeout for %s", api)
        return None, "Kafka response timeout", "Kafka response timeout", False
    except Exception as e:
        logging.error("Kafka consumer error: %s", e)
        return None, f"Kafka consumer error: {e}", "Kafka consumer error", False
|
|
865
|
+
|
|
866
|
+
def _cache_failed_request(self, api, payload):
    """Persist a failed request so it can be retried later.

    Appends a {"api", "payload", "ts"} record to a JSON list stored in
    ``request_cache.json`` next to this module. Best-effort: any failure is
    logged and swallowed so callers are never interrupted.
    """
    try:
        cache_file = os.path.join(os.path.dirname(__file__), 'request_cache.json')
        entries = []
        if os.path.exists(cache_file):
            with open(cache_file, 'r') as fh:
                entries = json.load(fh)
        entries.append({"api": api, "payload": payload, "ts": time.time()})
        with open(cache_file, 'w') as fh:
            json.dump(entries, fh)
        logging.info("Cached failed request for api %s", api)
    except Exception as exc:
        logging.error("Failed to cache request: %s", exc)
|