matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +104 -19
- matrice_compute/instance_utils.py +520 -111
- matrice_compute/resources_tracker.py +125 -53
- matrice_compute/scaling.py +658 -406
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/RECORD +9 -9
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/top_level.txt +0 -0
matrice_compute/scaling.py
CHANGED
|
@@ -3,11 +3,12 @@
|
|
|
3
3
|
import os
|
|
4
4
|
import logging
|
|
5
5
|
from matrice_common.utils import log_errors
|
|
6
|
-
|
|
6
|
+
from kafka import KafkaProducer, KafkaConsumer
|
|
7
7
|
import uuid
|
|
8
8
|
import json
|
|
9
9
|
import time
|
|
10
10
|
import base64
|
|
11
|
+
import threading
|
|
11
12
|
|
|
12
13
|
# TODO: update /scaling to /compute
|
|
13
14
|
|
|
@@ -15,12 +16,13 @@ class Scaling:
|
|
|
15
16
|
|
|
16
17
|
"""Class providing scaling functionality for compute instances."""
|
|
17
18
|
|
|
18
|
-
def __init__(self, session, instance_id=None):
|
|
19
|
+
def __init__(self, session, instance_id=None, enable_kafka=True):
|
|
19
20
|
"""Initialize Scaling instance.
|
|
20
21
|
|
|
21
22
|
Args:
|
|
22
23
|
session: Session object for making RPC calls
|
|
23
24
|
instance_id: ID of the compute instance
|
|
25
|
+
enable_kafka: Enable Kafka communication (default True)
|
|
24
26
|
|
|
25
27
|
Raises:
|
|
26
28
|
Exception: If instance_id is not provided
|
|
@@ -34,38 +36,93 @@ class Scaling:
|
|
|
34
36
|
self.rpc = session.rpc
|
|
35
37
|
used_ports_str = os.environ.get("USED_PORTS", "")
|
|
36
38
|
self.used_ports = set(int(p) for p in used_ports_str.split(",") if p.strip())
|
|
39
|
+
|
|
40
|
+
# Kafka configuration and initialization
|
|
41
|
+
self.enable_kafka = enable_kafka
|
|
42
|
+
self.kafka_producer = None
|
|
43
|
+
self.kafka_consumer = None
|
|
44
|
+
self.kafka_thread = None
|
|
45
|
+
self.kafka_running = False
|
|
46
|
+
|
|
47
|
+
# Maps correlation_id to threading.Event for request/response matching
|
|
48
|
+
self.pending_requests = {}
|
|
49
|
+
# Maps correlation_id to response data
|
|
50
|
+
self.response_map = {}
|
|
51
|
+
self.response_lock = threading.Lock()
|
|
52
|
+
|
|
53
|
+
if self.enable_kafka:
|
|
54
|
+
try:
|
|
55
|
+
self.kafka_config = {
|
|
56
|
+
"bootstrap_servers": self.get_kafka_bootstrap_servers(),
|
|
57
|
+
"action_request_topic": "action_requests",
|
|
58
|
+
"action_response_topic": "action_responses",
|
|
59
|
+
"compute_request_topic": "compute_requests",
|
|
60
|
+
"compute_response_topic": "compute_responses"
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
# Initialize single producer
|
|
64
|
+
self.kafka_producer = KafkaProducer(
|
|
65
|
+
bootstrap_servers=self.kafka_config["bootstrap_servers"],
|
|
66
|
+
value_serializer=lambda v: json.dumps(v).encode("utf-8"),
|
|
67
|
+
max_block_ms=5000 # Timeout if Kafka is down
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Initialize single consumer for both response topics
|
|
71
|
+
self.kafka_consumer = KafkaConsumer(
|
|
72
|
+
self.kafka_config["action_response_topic"],
|
|
73
|
+
self.kafka_config["compute_response_topic"],
|
|
74
|
+
bootstrap_servers=self.kafka_config["bootstrap_servers"],
|
|
75
|
+
group_id=f"py_compute_{instance_id}",
|
|
76
|
+
value_deserializer=lambda m: json.loads(m.decode("utf-8")),
|
|
77
|
+
auto_offset_reset='latest',
|
|
78
|
+
enable_auto_commit=True,
|
|
79
|
+
consumer_timeout_ms=1000, # Poll timeout
|
|
80
|
+
session_timeout_ms=60000, # Increase session timeout to 60s (default 30s)
|
|
81
|
+
heartbeat_interval_ms=3000, # Send heartbeat every 3s
|
|
82
|
+
max_poll_interval_ms=300000 # Max time between polls: 5 minutes
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Start background thread to handle responses
|
|
86
|
+
self.kafka_running = True
|
|
87
|
+
self.kafka_thread = threading.Thread(target=self._kafka_response_listener, daemon=True)
|
|
88
|
+
self.kafka_thread.start()
|
|
89
|
+
|
|
90
|
+
logging.info(f"Kafka enabled with bootstrap servers: {self.kafka_config['bootstrap_servers']}")
|
|
91
|
+
except Exception as e:
|
|
92
|
+
logging.warning(f"Failed to initialize Kafka, will use REST API only: {e}")
|
|
93
|
+
self.enable_kafka = False
|
|
94
|
+
self.kafka_producer = None
|
|
95
|
+
self.kafka_consumer = None
|
|
96
|
+
|
|
37
97
|
logging.info(
|
|
38
|
-
"Initialized Scaling with instance_id: %s",
|
|
98
|
+
"Initialized Scaling with instance_id: %s, Kafka enabled: %s",
|
|
39
99
|
instance_id,
|
|
100
|
+
self.enable_kafka
|
|
40
101
|
)
|
|
41
|
-
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
42
|
-
# self.kafka_config = {
|
|
43
|
-
# "bootstrap_servers": self.get_kafka_bootstrap_servers(),
|
|
44
|
-
# "api_request_topic": "action_requests",
|
|
45
|
-
# "api_response_topic": "action_responses",
|
|
46
|
-
# "scaling_request_topic": "compute_requests",
|
|
47
|
-
# "scaling_response_topic": "compute_responses"
|
|
48
|
-
# }
|
|
49
|
-
# self.kafka_producer = KafkaProducer(
|
|
50
|
-
# bootstrap_servers=self.kafka_config["bootstrap_servers"],
|
|
51
|
-
# value_serializer=lambda v: json.dumps(v).encode("utf-8"),)
|
|
52
102
|
|
|
53
103
|
|
|
54
104
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
105
|
+
@log_errors(default_return=None, log_error=True)
|
|
106
|
+
def get_kafka_bootstrap_servers(self):
|
|
107
|
+
"""Get Kafka bootstrap servers from API and decode base64 fields.
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
str: Kafka bootstrap servers in format "ip:port"
|
|
111
|
+
|
|
112
|
+
Raises:
|
|
113
|
+
ValueError: If unable to fetch Kafka configuration
|
|
114
|
+
"""
|
|
115
|
+
path = "/v1/actions/get_kafka_info"
|
|
116
|
+
response = self.rpc.get(path=path)
|
|
117
|
+
if not response or not response.get("success"):
|
|
118
|
+
raise ValueError(f"Failed to fetch Kafka config: {response.get('message', 'No response')}")
|
|
119
|
+
encoded_ip = response["data"]["ip"]
|
|
120
|
+
encoded_port = response["data"]["port"]
|
|
121
|
+
ip = base64.b64decode(encoded_ip).decode("utf-8")
|
|
122
|
+
port = base64.b64decode(encoded_port).decode("utf-8")
|
|
123
|
+
bootstrap_servers = f"{ip}:{port}"
|
|
124
|
+
# logging.info(f"Retrieved Kafka bootstrap servers: {bootstrap_servers}")
|
|
125
|
+
return bootstrap_servers
|
|
69
126
|
|
|
70
127
|
@log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
|
|
71
128
|
def handle_response(self, resp, success_message, error_message):
|
|
@@ -90,65 +147,267 @@ class Scaling:
|
|
|
90
147
|
message = error_message
|
|
91
148
|
logging.error("%s: %s", message, error)
|
|
92
149
|
return data, error, message
|
|
93
|
-
|
|
150
|
+
|
|
151
|
+
def _kafka_response_listener(self):
|
|
152
|
+
"""
|
|
153
|
+
Background thread that continuously polls for Kafka responses.
|
|
154
|
+
|
|
155
|
+
This thread runs in the background and listens for responses from both
|
|
156
|
+
action_responses and compute_responses topics. When a response is received,
|
|
157
|
+
it matches the correlation ID to pending requests and wakes up the waiting thread.
|
|
158
|
+
"""
|
|
159
|
+
logging.info("Kafka response listener thread started")
|
|
160
|
+
|
|
161
|
+
while self.kafka_running:
|
|
162
|
+
try:
|
|
163
|
+
# Poll for messages with 1 second timeout
|
|
164
|
+
message_batch = self.kafka_consumer.poll(timeout_ms=1000)
|
|
165
|
+
|
|
166
|
+
if message_batch:
|
|
167
|
+
for topic_partition, messages in message_batch.items():
|
|
168
|
+
for message in messages:
|
|
169
|
+
try:
|
|
170
|
+
msg = message.value
|
|
171
|
+
correlation_id = msg.get("correlationId")
|
|
172
|
+
|
|
173
|
+
if correlation_id:
|
|
174
|
+
with self.response_lock:
|
|
175
|
+
if correlation_id in self.pending_requests:
|
|
176
|
+
# Store response and signal waiting thread
|
|
177
|
+
self.response_map[correlation_id] = msg
|
|
178
|
+
self.pending_requests[correlation_id].set()
|
|
179
|
+
logging.debug(f"Received Kafka response for correlation_id: {correlation_id}")
|
|
180
|
+
else:
|
|
181
|
+
logging.warning(f"Received Kafka message without correlationId: {msg}")
|
|
182
|
+
except Exception as e:
|
|
183
|
+
logging.error(f"Error processing Kafka message: {e}")
|
|
184
|
+
|
|
185
|
+
except Exception as e:
|
|
186
|
+
if self.kafka_running: # Only log if not shutting down
|
|
187
|
+
logging.error(f"Error in Kafka response listener: {e}")
|
|
188
|
+
time.sleep(1) # Avoid tight loop on persistent errors
|
|
189
|
+
|
|
190
|
+
logging.info("Kafka response listener thread stopped")
|
|
191
|
+
|
|
192
|
+
def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=5):
|
|
193
|
+
"""
|
|
194
|
+
Send a request via Kafka and wait for response using the persistent consumer.
|
|
195
|
+
|
|
196
|
+
Args:
|
|
197
|
+
api: API name to call
|
|
198
|
+
payload: Request payload dictionary
|
|
199
|
+
request_topic: Kafka topic to send request to
|
|
200
|
+
response_topic: Kafka topic to receive response from (not used, kept for signature)
|
|
201
|
+
timeout: Timeout in seconds to wait for response
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
Tuple of (data, error, message, kafka_success)
|
|
205
|
+
kafka_success is True if response received, False if timeout/error
|
|
206
|
+
"""
|
|
207
|
+
if not self.enable_kafka or not self.kafka_producer:
|
|
208
|
+
return None, "Kafka not enabled", "Kafka not available", False
|
|
209
|
+
|
|
210
|
+
correlation_id = str(uuid.uuid4())
|
|
211
|
+
request_message = {
|
|
212
|
+
"correlationId": correlation_id,
|
|
213
|
+
"api": api,
|
|
214
|
+
"payload": payload,
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
# Create event for this request
|
|
218
|
+
event = threading.Event()
|
|
219
|
+
|
|
220
|
+
with self.response_lock:
|
|
221
|
+
self.pending_requests[correlation_id] = event
|
|
222
|
+
|
|
223
|
+
try:
|
|
224
|
+
# Add auth token if available
|
|
225
|
+
headers = None
|
|
226
|
+
if hasattr(self.session.rpc, 'AUTH_TOKEN'):
|
|
227
|
+
self.session.rpc.AUTH_TOKEN.set_bearer_token()
|
|
228
|
+
auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
|
|
229
|
+
auth_token = auth_token.replace("Bearer ", "")
|
|
230
|
+
headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
|
|
231
|
+
|
|
232
|
+
# Send request
|
|
233
|
+
self.kafka_producer.send(request_topic, request_message, headers=headers)
|
|
234
|
+
logging.info(f"Sent Kafka request for {api} with correlation_id: {correlation_id}")
|
|
235
|
+
|
|
236
|
+
# Wait for response with timeout
|
|
237
|
+
if event.wait(timeout=timeout):
|
|
238
|
+
# Response received
|
|
239
|
+
with self.response_lock:
|
|
240
|
+
response = self.response_map.pop(correlation_id, None)
|
|
241
|
+
self.pending_requests.pop(correlation_id, None)
|
|
242
|
+
|
|
243
|
+
if response:
|
|
244
|
+
if response.get("status") == "success":
|
|
245
|
+
data = response.get("data")
|
|
246
|
+
logging.info(f"Kafka success for {api}")
|
|
247
|
+
return data, None, f"Fetched via Kafka for {api}", True
|
|
248
|
+
else:
|
|
249
|
+
error = response.get("error", "Unknown error")
|
|
250
|
+
logging.error(f"Kafka error response for {api}: {error}")
|
|
251
|
+
return None, error, f"Kafka error response for {api}", True
|
|
252
|
+
else:
|
|
253
|
+
logging.warning(f"Kafka response received but missing data for {api}")
|
|
254
|
+
return None, "Response missing data", "Kafka response error", False
|
|
255
|
+
else:
|
|
256
|
+
# Timeout
|
|
257
|
+
with self.response_lock:
|
|
258
|
+
self.pending_requests.pop(correlation_id, None)
|
|
259
|
+
logging.warning(f"Kafka response timeout for {api} after {timeout} seconds")
|
|
260
|
+
return None, "Kafka response timeout", "Kafka response timeout", False
|
|
261
|
+
|
|
262
|
+
except Exception as e:
|
|
263
|
+
# Cleanup on error
|
|
264
|
+
with self.response_lock:
|
|
265
|
+
self.pending_requests.pop(correlation_id, None)
|
|
266
|
+
logging.error(f"Kafka send error for {api}: {e}")
|
|
267
|
+
return None, f"Kafka error: {e}", "Kafka send failed", False
|
|
268
|
+
|
|
269
|
+
def _hybrid_request(self, api, payload, request_topic, response_topic, rest_fallback_func):
|
|
270
|
+
"""
|
|
271
|
+
Hybrid request method: try Kafka first, fallback to REST, cache if both fail.
|
|
272
|
+
|
|
273
|
+
Args:
|
|
274
|
+
api: API name
|
|
275
|
+
payload: Request payload
|
|
276
|
+
request_topic: Kafka request topic
|
|
277
|
+
response_topic: Kafka response topic
|
|
278
|
+
rest_fallback_func: Function to call for REST fallback (should return same format as handle_response)
|
|
279
|
+
|
|
280
|
+
Returns:
|
|
281
|
+
Tuple of (data, error, message) matching the API response pattern
|
|
282
|
+
"""
|
|
283
|
+
# Try Kafka first
|
|
284
|
+
if self.enable_kafka:
|
|
285
|
+
data, error, message, kafka_success = self._send_kafka_request(
|
|
286
|
+
api, payload, request_topic, response_topic, timeout=5
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
if kafka_success and error is None:
|
|
290
|
+
# Kafka succeeded
|
|
291
|
+
return data, error, message
|
|
292
|
+
|
|
293
|
+
# Kafka returned an error response (not transport error)
|
|
294
|
+
if kafka_success and error is not None:
|
|
295
|
+
logging.warning(f"Kafka returned error for {api}, falling back to REST")
|
|
296
|
+
|
|
297
|
+
# Kafka failed or disabled, try REST
|
|
298
|
+
logging.info(f"Using REST API for {api}")
|
|
299
|
+
try:
|
|
300
|
+
rest_response = rest_fallback_func()
|
|
301
|
+
|
|
302
|
+
# Return REST response (success or failure)
|
|
303
|
+
if rest_response and len(rest_response) == 3:
|
|
304
|
+
return rest_response
|
|
305
|
+
else:
|
|
306
|
+
# Unexpected REST response format
|
|
307
|
+
logging.error(f"REST API returned unexpected format for {api}")
|
|
308
|
+
return None, "Unexpected REST response format", "REST API error"
|
|
309
|
+
|
|
310
|
+
except Exception as e:
|
|
311
|
+
# REST failed
|
|
312
|
+
logging.error(f"REST API failed for {api}: {e}")
|
|
313
|
+
return None, str(e), "REST API failed"
|
|
314
|
+
|
|
315
|
+
def shutdown(self):
|
|
316
|
+
"""Gracefully shutdown Kafka connections."""
|
|
317
|
+
if self.kafka_running:
|
|
318
|
+
logging.info("Shutting down Kafka connections...")
|
|
319
|
+
self.kafka_running = False
|
|
320
|
+
|
|
321
|
+
if self.kafka_thread:
|
|
322
|
+
self.kafka_thread.join(timeout=5)
|
|
323
|
+
|
|
324
|
+
if self.kafka_consumer:
|
|
325
|
+
self.kafka_consumer.close()
|
|
326
|
+
|
|
327
|
+
if self.kafka_producer:
|
|
328
|
+
self.kafka_producer.close()
|
|
329
|
+
|
|
330
|
+
logging.info("Kafka connections closed")
|
|
331
|
+
|
|
94
332
|
@log_errors(log_error=True)
|
|
95
333
|
def get_downscaled_ids(self):
|
|
96
|
-
"""Get IDs of downscaled instances.
|
|
334
|
+
"""Get IDs of downscaled instances using Kafka (with REST fallback).
|
|
97
335
|
|
|
98
336
|
Returns:
|
|
99
337
|
Tuple of (data, error, message) from API response
|
|
100
338
|
"""
|
|
101
|
-
logging.info(
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
339
|
+
logging.info("Getting downscaled ids for instance %s", self.instance_id)
|
|
340
|
+
|
|
341
|
+
payload = {"instance_id": self.instance_id}
|
|
342
|
+
|
|
343
|
+
def rest_fallback():
|
|
344
|
+
path = f"/v1/compute/down_scaled_ids/{self.instance_id}"
|
|
345
|
+
resp = self.rpc.get(path=path)
|
|
346
|
+
return self.handle_response(
|
|
347
|
+
resp,
|
|
348
|
+
"Downscaled ids info fetched successfully",
|
|
349
|
+
"Could not fetch the Downscaled ids info",
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
return self._hybrid_request(
|
|
353
|
+
api="get_downscaled_ids",
|
|
354
|
+
payload=payload,
|
|
355
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
356
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
357
|
+
rest_fallback_func=rest_fallback
|
|
111
358
|
)
|
|
112
359
|
|
|
113
360
|
@log_errors(default_return=(None, "API call failed", "Failed to stop instance"), log_error=True)
|
|
114
361
|
def stop_instance(self):
|
|
115
|
-
"""Stop the compute instance.
|
|
362
|
+
"""Stop the compute instance using Kafka (with REST fallback).
|
|
116
363
|
|
|
117
364
|
Returns:
|
|
118
365
|
Tuple of (data, error, message) from API response
|
|
119
366
|
"""
|
|
120
|
-
logging.info(
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
367
|
+
logging.info("Stopping instance %s", self.instance_id)
|
|
368
|
+
|
|
369
|
+
payload = {
|
|
370
|
+
"_idInstance": self.instance_id,
|
|
371
|
+
"isForcedStop": False,
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
def rest_fallback():
|
|
375
|
+
path = "/v1/compute/compute_instance/stop"
|
|
376
|
+
resp = self.rpc.put(path=path, payload=payload)
|
|
377
|
+
return self.handle_response(
|
|
378
|
+
resp,
|
|
379
|
+
"Instance stopped successfully",
|
|
380
|
+
"Could not stop the instance",
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
return self._hybrid_request(
|
|
384
|
+
api="stop_instance",
|
|
385
|
+
payload=payload,
|
|
386
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
387
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
388
|
+
rest_fallback_func=rest_fallback
|
|
136
389
|
)
|
|
137
390
|
|
|
138
391
|
@log_errors(log_error=True)
|
|
139
|
-
def update_jupyter_token(
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
"token": token
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
392
|
+
def update_jupyter_token(self, token=""):
|
|
393
|
+
"""Update Jupyter notebook token using Kafka (with REST fallback)."""
|
|
394
|
+
payload = {"token": token, "instance_id": self.instance_id}
|
|
395
|
+
|
|
396
|
+
def rest_fallback():
|
|
397
|
+
path = f"/v1/scaling/update_jupyter_notebook_token/{self.instance_id}"
|
|
398
|
+
resp = self.rpc.put(path=path, payload={"token": token})
|
|
399
|
+
return self.handle_response(
|
|
400
|
+
resp,
|
|
401
|
+
"Resources updated successfully",
|
|
402
|
+
"Could not update the resources",
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
return self._hybrid_request(
|
|
406
|
+
api="update_jupyter_token",
|
|
407
|
+
payload=payload,
|
|
408
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
409
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
410
|
+
rest_fallback_func=rest_fallback
|
|
152
411
|
)
|
|
153
412
|
|
|
154
413
|
@log_errors(log_error=True)
|
|
@@ -167,7 +426,7 @@ class Scaling:
|
|
|
167
426
|
createdAt=None,
|
|
168
427
|
updatedAt=None,
|
|
169
428
|
):
|
|
170
|
-
"""Update status of an action.
|
|
429
|
+
"""Update status of an action using Kafka (with REST fallback).
|
|
171
430
|
|
|
172
431
|
Args:
|
|
173
432
|
service_provider: Provider of the service
|
|
@@ -188,12 +447,10 @@ class Scaling:
|
|
|
188
447
|
"""
|
|
189
448
|
if not action_record_id:
|
|
190
449
|
return None, "Action record id is required", "Action record id is required"
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
path = "/v1/compute/update_action_status"
|
|
196
|
-
payload_scaling = {
|
|
450
|
+
|
|
451
|
+
logging.info("Updating action status for action %s", action_record_id)
|
|
452
|
+
|
|
453
|
+
payload = {
|
|
197
454
|
"instanceID": self.instance_id,
|
|
198
455
|
"serviceProvider": service_provider,
|
|
199
456
|
"actionRecordId": action_record_id,
|
|
@@ -208,11 +465,22 @@ class Scaling:
|
|
|
208
465
|
"createdAt": createdAt,
|
|
209
466
|
"updatedAt": updatedAt,
|
|
210
467
|
}
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
468
|
+
|
|
469
|
+
def rest_fallback():
|
|
470
|
+
path = "/v1/compute/update_action_status"
|
|
471
|
+
resp = self.rpc.put(path=path, payload=payload)
|
|
472
|
+
return self.handle_response(
|
|
473
|
+
resp,
|
|
474
|
+
"Action status details updated successfully",
|
|
475
|
+
"Could not update the action status details ",
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
return self._hybrid_request(
|
|
479
|
+
api="update_action_status",
|
|
480
|
+
payload=payload,
|
|
481
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
482
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
483
|
+
rest_fallback_func=rest_fallback
|
|
216
484
|
)
|
|
217
485
|
|
|
218
486
|
@log_errors(log_error=True)
|
|
@@ -225,7 +493,7 @@ class Scaling:
|
|
|
225
493
|
status,
|
|
226
494
|
status_description,
|
|
227
495
|
):
|
|
228
|
-
"""Update status of an action.
|
|
496
|
+
"""Update status of an action using Kafka (with REST fallback).
|
|
229
497
|
|
|
230
498
|
Args:
|
|
231
499
|
action_record_id: ID of the action record
|
|
@@ -235,11 +503,8 @@ class Scaling:
|
|
|
235
503
|
status: Status to update
|
|
236
504
|
status_description: Description of the status
|
|
237
505
|
"""
|
|
238
|
-
logging.info(
|
|
239
|
-
|
|
240
|
-
action_record_id,
|
|
241
|
-
)
|
|
242
|
-
url = "/v1/actions"
|
|
506
|
+
logging.info("Updating status for action %s", action_record_id)
|
|
507
|
+
|
|
243
508
|
payload = {
|
|
244
509
|
"_id": action_record_id,
|
|
245
510
|
"action": action_type,
|
|
@@ -248,76 +513,91 @@ class Scaling:
|
|
|
248
513
|
"status": status,
|
|
249
514
|
"statusDescription": status_description,
|
|
250
515
|
}
|
|
251
|
-
|
|
516
|
+
|
|
517
|
+
def rest_fallback():
|
|
518
|
+
url = "/v1/actions"
|
|
519
|
+
self.rpc.put(path=url, payload=payload)
|
|
520
|
+
return None, None, "Status updated"
|
|
521
|
+
|
|
522
|
+
return self._hybrid_request(
|
|
523
|
+
api="update_action",
|
|
524
|
+
payload=payload,
|
|
525
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
526
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
527
|
+
rest_fallback_func=rest_fallback
|
|
528
|
+
)
|
|
252
529
|
|
|
253
530
|
@log_errors(log_error=True)
|
|
254
531
|
def get_shutdown_details(self):
|
|
255
|
-
"""Get shutdown details for the instance.
|
|
532
|
+
"""Get shutdown details for the instance using Kafka (with REST fallback).
|
|
256
533
|
|
|
257
534
|
Returns:
|
|
258
535
|
Tuple of (data, error, message) from API response
|
|
259
536
|
"""
|
|
260
|
-
logging.info(
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
537
|
+
logging.info("Getting shutdown details for instance %s", self.instance_id)
|
|
538
|
+
|
|
539
|
+
payload = {"instance_id": self.instance_id}
|
|
540
|
+
|
|
541
|
+
def rest_fallback():
|
|
542
|
+
path = f"/v1/compute/get_shutdown_details/{self.instance_id}"
|
|
543
|
+
resp = self.rpc.get(path=path)
|
|
544
|
+
return self.handle_response(
|
|
545
|
+
resp,
|
|
546
|
+
"Shutdown info fetched successfully",
|
|
547
|
+
"Could not fetch the shutdown details",
|
|
548
|
+
)
|
|
549
|
+
|
|
550
|
+
return self._hybrid_request(
|
|
551
|
+
api="get_shutdown_details",
|
|
552
|
+
payload=payload,
|
|
553
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
554
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
555
|
+
rest_fallback_func=rest_fallback
|
|
270
556
|
)
|
|
271
557
|
|
|
272
558
|
@log_errors(log_error=True)
|
|
273
559
|
def get_tasks_details(self):
|
|
274
|
-
"""Get task details for the instance.
|
|
560
|
+
"""Get task details for the instance using Kafka (with REST fallback).
|
|
275
561
|
|
|
276
562
|
Returns:
|
|
277
563
|
Tuple of (data, error, message) from API response
|
|
278
564
|
"""
|
|
279
|
-
logging.info(
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
565
|
+
logging.info("Getting tasks details for instance %s", self.instance_id)
|
|
566
|
+
|
|
567
|
+
payload = {"instance_id": self.instance_id}
|
|
568
|
+
|
|
569
|
+
def rest_fallback():
|
|
570
|
+
path = f"/v1/actions/fetch_instance_action_details/{self.instance_id}/action_details"
|
|
571
|
+
resp = self.rpc.get(path=path)
|
|
572
|
+
return self.handle_response(
|
|
573
|
+
resp,
|
|
574
|
+
"Task details fetched successfully",
|
|
575
|
+
"Could not fetch the task details",
|
|
576
|
+
)
|
|
577
|
+
|
|
578
|
+
return self._hybrid_request(
|
|
579
|
+
api="get_tasks_details",
|
|
580
|
+
payload=payload,
|
|
581
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
582
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
583
|
+
rest_fallback_func=rest_fallback
|
|
289
584
|
)
|
|
290
585
|
|
|
291
586
|
@log_errors(log_error=True)
|
|
292
587
|
def get_action_details(self, action_status_id):
|
|
293
|
-
"""Get details for a specific action using REST
|
|
294
|
-
|
|
588
|
+
"""Get details for a specific action using Kafka (with REST fallback).
|
|
589
|
+
|
|
295
590
|
Args:
|
|
296
591
|
action_status_id: ID of the action status to fetch
|
|
297
|
-
|
|
592
|
+
|
|
298
593
|
Returns:
|
|
299
594
|
Tuple of (data, error, message) from API response
|
|
300
595
|
"""
|
|
301
596
|
logging.info("Getting action details for action %s", action_status_id)
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
# api=api,
|
|
307
|
-
# payload=payload,
|
|
308
|
-
# request_topic=self.kafka_config["api_request_topic"],
|
|
309
|
-
# response_topic=self.kafka_config["api_response_topic"],
|
|
310
|
-
# timeout=60
|
|
311
|
-
# )
|
|
312
|
-
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
313
|
-
# if kafka_response_received:
|
|
314
|
-
# if error:
|
|
315
|
-
# logging.warning("Kafka returned error for get_action_details: %s. Falling back to REST API.", error)
|
|
316
|
-
# else:
|
|
317
|
-
# return data, error, message
|
|
318
|
-
|
|
319
|
-
# Using REST API directly
|
|
320
|
-
try:
|
|
597
|
+
|
|
598
|
+
payload = {"actionRecordId": action_status_id}
|
|
599
|
+
|
|
600
|
+
def rest_fallback():
|
|
321
601
|
path = f"/v1/actions/action/{action_status_id}/details"
|
|
322
602
|
resp = self.rpc.get(path=path)
|
|
323
603
|
return self.handle_response(
|
|
@@ -325,10 +605,14 @@ class Scaling:
|
|
|
325
605
|
"Task details fetched successfully",
|
|
326
606
|
"Could not fetch the task details",
|
|
327
607
|
)
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
608
|
+
|
|
609
|
+
return self._hybrid_request(
|
|
610
|
+
api="get_action_details",
|
|
611
|
+
payload=payload,
|
|
612
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
613
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
614
|
+
rest_fallback_func=rest_fallback
|
|
615
|
+
)
|
|
332
616
|
|
|
333
617
|
@log_errors(log_error=True)
|
|
334
618
|
def update_action(
|
|
@@ -342,8 +626,8 @@ class Scaling:
|
|
|
342
626
|
service="",
|
|
343
627
|
job_params=None,
|
|
344
628
|
):
|
|
345
|
-
"""Update an action using REST
|
|
346
|
-
|
|
629
|
+
"""Update an action using Kafka (with REST fallback).
|
|
630
|
+
|
|
347
631
|
Args:
|
|
348
632
|
id: Action ID
|
|
349
633
|
step_code: Step code
|
|
@@ -353,15 +637,15 @@ class Scaling:
|
|
|
353
637
|
status_description: Description of the status
|
|
354
638
|
service: Service name
|
|
355
639
|
job_params: Job parameters dictionary
|
|
356
|
-
|
|
640
|
+
|
|
357
641
|
Returns:
|
|
358
642
|
Tuple of (data, error, message) from API response
|
|
359
643
|
"""
|
|
360
644
|
if job_params is None:
|
|
361
645
|
job_params = {}
|
|
646
|
+
|
|
362
647
|
logging.info("Updating action %s", id)
|
|
363
|
-
|
|
364
|
-
# api = "update_action"
|
|
648
|
+
|
|
365
649
|
payload = {
|
|
366
650
|
"_id": id,
|
|
367
651
|
"stepCode": step_code,
|
|
@@ -372,22 +656,8 @@ class Scaling:
|
|
|
372
656
|
"serviceName": service,
|
|
373
657
|
"jobParams": job_params,
|
|
374
658
|
}
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
# payload=payload,
|
|
378
|
-
# request_topic=self.kafka_config["api_request_topic"],
|
|
379
|
-
# response_topic=self.kafka_config["api_response_topic"],
|
|
380
|
-
# timeout=60
|
|
381
|
-
# )
|
|
382
|
-
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
383
|
-
# if kafka_response_received:
|
|
384
|
-
# if error:
|
|
385
|
-
# logging.warning("Kafka returned error for update_action: %s. Falling back to REST API.", error)
|
|
386
|
-
# else:
|
|
387
|
-
# return data, error, message
|
|
388
|
-
|
|
389
|
-
# Using REST API directly
|
|
390
|
-
try:
|
|
659
|
+
|
|
660
|
+
def rest_fallback():
|
|
391
661
|
path = "/v1/actions"
|
|
392
662
|
resp = self.rpc.put(path=path, payload=payload)
|
|
393
663
|
return self.handle_response(
|
|
@@ -395,51 +665,38 @@ class Scaling:
|
|
|
395
665
|
"Error logged successfully",
|
|
396
666
|
"Could not log the errors",
|
|
397
667
|
)
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
668
|
+
|
|
669
|
+
return self._hybrid_request(
|
|
670
|
+
api="update_action",
|
|
671
|
+
payload=payload,
|
|
672
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
673
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
674
|
+
rest_fallback_func=rest_fallback
|
|
675
|
+
)
|
|
401
676
|
|
|
402
677
|
|
|
403
678
|
@log_errors(log_error=True)
|
|
404
679
|
def assign_jobs(self, is_gpu):
|
|
405
|
-
"""Assign jobs to the instance using REST
|
|
406
|
-
|
|
680
|
+
"""Assign jobs to the instance using Kafka (with REST fallback).
|
|
681
|
+
|
|
407
682
|
Args:
|
|
408
683
|
is_gpu: Boolean or any value indicating if this is a GPU instance.
|
|
409
684
|
Will be converted to proper boolean.
|
|
410
|
-
|
|
685
|
+
|
|
411
686
|
Returns:
|
|
412
687
|
Tuple of (data, error, message) from API response
|
|
413
688
|
"""
|
|
414
689
|
# Convert is_gpu to proper boolean
|
|
415
690
|
is_gpu_bool = bool(is_gpu)
|
|
416
691
|
logging.info("Assigning jobs for instance %s (GPU: %s)", self.instance_id, is_gpu_bool)
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
#
|
|
424
|
-
|
|
425
|
-
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
426
|
-
# api=api,
|
|
427
|
-
# payload=payload,
|
|
428
|
-
# request_topic=self.kafka_config["api_request_topic"],
|
|
429
|
-
# response_topic=self.kafka_config["api_response_topic"],
|
|
430
|
-
# timeout=60
|
|
431
|
-
# )
|
|
432
|
-
|
|
433
|
-
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
434
|
-
# if kafka_response_received:
|
|
435
|
-
# if error:
|
|
436
|
-
# logging.warning("Kafka returned error for assign_jobs: %s. Falling back to REST API.", error)
|
|
437
|
-
# else:
|
|
438
|
-
# return data, error, message
|
|
439
|
-
|
|
440
|
-
# Using REST API directly
|
|
441
|
-
try:
|
|
442
|
-
# Convert boolean to lowercase string for API endpoint
|
|
692
|
+
|
|
693
|
+
payload = {
|
|
694
|
+
"instanceID": self.instance_id,
|
|
695
|
+
"isGPUInstance": is_gpu_bool,
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
# Define REST fallback function
|
|
699
|
+
def rest_fallback():
|
|
443
700
|
is_gpu_str = str(is_gpu_bool).lower()
|
|
444
701
|
path = f"/v1/actions/assign_jobs/{is_gpu_str}/{self.instance_id}"
|
|
445
702
|
resp = self.rpc.get(path=path)
|
|
@@ -448,9 +705,15 @@ class Scaling:
|
|
|
448
705
|
"Pinged successfully",
|
|
449
706
|
"Could not ping the scaling jobs",
|
|
450
707
|
)
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
708
|
+
|
|
709
|
+
# Use hybrid approach: Kafka first, REST fallback, cache if both fail
|
|
710
|
+
return self._hybrid_request(
|
|
711
|
+
api="assign_jobs",
|
|
712
|
+
payload=payload,
|
|
713
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
714
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
715
|
+
rest_fallback_func=rest_fallback
|
|
716
|
+
)
|
|
454
717
|
|
|
455
718
|
|
|
456
719
|
@log_errors(log_error=True)
|
|
@@ -461,14 +724,14 @@ class Scaling:
|
|
|
461
724
|
availableMemory=0,
|
|
462
725
|
availableGPUMemory=0,
|
|
463
726
|
):
|
|
464
|
-
"""Update available resources for the instance using REST
|
|
465
|
-
|
|
727
|
+
"""Update available resources for the instance using Kafka (with REST fallback).
|
|
728
|
+
|
|
466
729
|
Args:
|
|
467
730
|
availableCPU: Available CPU resources
|
|
468
731
|
availableGPU: Available GPU resources
|
|
469
732
|
availableMemory: Available memory
|
|
470
733
|
availableGPUMemory: Available GPU memory
|
|
471
|
-
|
|
734
|
+
|
|
472
735
|
Returns:
|
|
473
736
|
Tuple of (data, error, message) from API response
|
|
474
737
|
"""
|
|
@@ -480,28 +743,9 @@ class Scaling:
|
|
|
480
743
|
"availableGPUMemory": availableGPUMemory,
|
|
481
744
|
"availableGPU": availableGPU,
|
|
482
745
|
}
|
|
483
|
-
|
|
484
|
-
#
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
488
|
-
# api=api,
|
|
489
|
-
# payload=payload,
|
|
490
|
-
# request_topic=self.kafka_config["scaling_request_topic"],
|
|
491
|
-
# response_topic=self.kafka_config["scaling_response_topic"],
|
|
492
|
-
# timeout=60
|
|
493
|
-
# )
|
|
494
|
-
|
|
495
|
-
# # Check if Kafka response was received
|
|
496
|
-
# # Response format: {'correlationId': 'id', 'status': 'success'/'error', 'data': ..., 'error': 'error message'}
|
|
497
|
-
# if kafka_response_received:
|
|
498
|
-
# if error:
|
|
499
|
-
# logging.warning("Kafka returned error for update_available_resources: %s. Falling back to REST API.", error)
|
|
500
|
-
# else:
|
|
501
|
-
# return data, error, message
|
|
502
|
-
|
|
503
|
-
# Using REST API directly
|
|
504
|
-
try:
|
|
746
|
+
|
|
747
|
+
# Define REST fallback function
|
|
748
|
+
def rest_fallback():
|
|
505
749
|
path = f"/v1/compute/update_available_resources/{self.instance_id}"
|
|
506
750
|
resp = self.rpc.put(path=path, payload=payload)
|
|
507
751
|
return self.handle_response(
|
|
@@ -509,45 +753,35 @@ class Scaling:
|
|
|
509
753
|
"Resources updated successfully",
|
|
510
754
|
"Could not update the resources",
|
|
511
755
|
)
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
756
|
+
|
|
757
|
+
# Use hybrid approach: Kafka first, REST fallback, cache if both fail
|
|
758
|
+
return self._hybrid_request(
|
|
759
|
+
api="update_available_resources",
|
|
760
|
+
payload=payload,
|
|
761
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
762
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
763
|
+
rest_fallback_func=rest_fallback
|
|
764
|
+
)
|
|
515
765
|
|
|
516
766
|
@log_errors(log_error=True)
|
|
517
767
|
def update_action_docker_logs(self, action_record_id, log_content):
|
|
518
|
-
"""Update docker logs for an action using REST
|
|
519
|
-
|
|
768
|
+
"""Update docker logs for an action using Kafka (with REST fallback).
|
|
769
|
+
|
|
520
770
|
Args:
|
|
521
771
|
action_record_id: ID of the action record
|
|
522
772
|
log_content: Content of the logs to update
|
|
523
|
-
|
|
773
|
+
|
|
524
774
|
Returns:
|
|
525
775
|
Tuple of (data, error, message) from API response
|
|
526
776
|
"""
|
|
527
777
|
logging.info("Updating docker logs for action %s", action_record_id)
|
|
528
|
-
|
|
529
|
-
# api = "update_action_docker_logs"
|
|
778
|
+
|
|
530
779
|
payload = {
|
|
531
780
|
"actionRecordId": action_record_id,
|
|
532
781
|
"logContent": log_content,
|
|
533
782
|
}
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
# payload=payload,
|
|
537
|
-
# request_topic=self.kafka_config["api_request_topic"],
|
|
538
|
-
# response_topic=self.kafka_config["api_response_topic"],
|
|
539
|
-
# timeout=60
|
|
540
|
-
# )
|
|
541
|
-
|
|
542
|
-
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
543
|
-
# if kafka_response_received:
|
|
544
|
-
# if error:
|
|
545
|
-
# logging.warning("Kafka returned error for update_action_docker_logs: %s. Falling back to REST API.", error)
|
|
546
|
-
# else:
|
|
547
|
-
# return data, error, message
|
|
548
|
-
|
|
549
|
-
# Using REST API directly
|
|
550
|
-
try:
|
|
783
|
+
|
|
784
|
+
def rest_fallback():
|
|
551
785
|
path = "/v1/actions/update_action_docker_logs"
|
|
552
786
|
resp = self.rpc.put(path=path, payload=payload)
|
|
553
787
|
return self.handle_response(
|
|
@@ -555,40 +789,67 @@ class Scaling:
|
|
|
555
789
|
"Docker logs updated successfully",
|
|
556
790
|
"Could not update the docker logs",
|
|
557
791
|
)
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
792
|
+
|
|
793
|
+
return self._hybrid_request(
|
|
794
|
+
api="update_action_docker_logs",
|
|
795
|
+
payload=payload,
|
|
796
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
797
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
798
|
+
rest_fallback_func=rest_fallback
|
|
799
|
+
)
|
|
562
800
|
|
|
563
801
|
@log_errors(log_error=True)
|
|
564
802
|
def get_docker_hub_credentials(self):
|
|
565
|
-
"""Get Docker Hub credentials.
|
|
803
|
+
"""Get Docker Hub credentials using Kafka (with REST fallback).
|
|
566
804
|
|
|
567
805
|
Returns:
|
|
568
806
|
Tuple of (data, error, message) from API response
|
|
569
807
|
"""
|
|
570
808
|
logging.info("Getting docker credentials")
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
809
|
+
|
|
810
|
+
payload = {}
|
|
811
|
+
|
|
812
|
+
def rest_fallback():
|
|
813
|
+
path = "/v1/compute/get_docker_hub_credentials"
|
|
814
|
+
resp = self.rpc.get(path=path)
|
|
815
|
+
return self.handle_response(
|
|
816
|
+
resp,
|
|
817
|
+
"Docker credentials fetched successfully",
|
|
818
|
+
"Could not fetch the docker credentials",
|
|
819
|
+
)
|
|
820
|
+
|
|
821
|
+
return self._hybrid_request(
|
|
822
|
+
api="get_docker_hub_credentials",
|
|
823
|
+
payload=payload,
|
|
824
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
825
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
826
|
+
rest_fallback_func=rest_fallback
|
|
577
827
|
)
|
|
578
828
|
|
|
579
829
|
@log_errors(log_error=True)
|
|
580
830
|
def get_open_ports_config(self):
|
|
581
|
-
"""Get open ports configuration.
|
|
831
|
+
"""Get open ports configuration using Kafka (with REST fallback).
|
|
582
832
|
|
|
583
833
|
Returns:
|
|
584
834
|
Tuple of (data, error, message) from API response
|
|
585
835
|
"""
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
836
|
+
payload = {"instance_id": self.instance_id}
|
|
837
|
+
|
|
838
|
+
def rest_fallback():
|
|
839
|
+
path = f"/v1/compute/get_open_ports/{self.instance_id}"
|
|
840
|
+
resp = self.rpc.get(path=path)
|
|
841
|
+
return self.handle_response(
|
|
842
|
+
resp,
|
|
843
|
+
"Open ports config fetched successfully",
|
|
844
|
+
"Could not fetch the open ports config",
|
|
845
|
+
)
|
|
846
|
+
|
|
847
|
+
return self._hybrid_request(
|
|
848
|
+
api="get_open_ports_config",
|
|
849
|
+
payload=payload,
|
|
850
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
851
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
852
|
+
rest_fallback_func=rest_fallback
|
|
592
853
|
)
|
|
593
854
|
|
|
594
855
|
@log_errors(default_return=None, log_error=True)
|
|
@@ -639,7 +900,7 @@ class Scaling:
|
|
|
639
900
|
|
|
640
901
|
@log_errors(log_error=True)
|
|
641
902
|
def get_model_secret_keys(self, secret_name):
|
|
642
|
-
"""Get model secret keys.
|
|
903
|
+
"""Get model secret keys using Kafka (with REST fallback).
|
|
643
904
|
|
|
644
905
|
Args:
|
|
645
906
|
secret_name: Name of the secret
|
|
@@ -647,12 +908,23 @@ class Scaling:
|
|
|
647
908
|
Returns:
|
|
648
909
|
Tuple of (data, error, message) from API response
|
|
649
910
|
"""
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
911
|
+
payload = {"secret_name": secret_name}
|
|
912
|
+
|
|
913
|
+
def rest_fallback():
|
|
914
|
+
path = f"/v1/compute/get_models_secret_keys?secret_name={secret_name}"
|
|
915
|
+
resp = self.rpc.get(path=path)
|
|
916
|
+
return self.handle_response(
|
|
917
|
+
resp,
|
|
918
|
+
"Secret keys fetched successfully",
|
|
919
|
+
"Could not fetch the secret keys",
|
|
920
|
+
)
|
|
921
|
+
|
|
922
|
+
return self._hybrid_request(
|
|
923
|
+
api="get_model_secret_keys",
|
|
924
|
+
payload=payload,
|
|
925
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
926
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
927
|
+
rest_fallback_func=rest_fallback
|
|
656
928
|
)
|
|
657
929
|
|
|
658
930
|
@log_errors(log_error=True)
|
|
@@ -753,7 +1025,7 @@ class Scaling:
|
|
|
753
1025
|
|
|
754
1026
|
@log_errors(log_error=True)
|
|
755
1027
|
def stop_account_compute(self, account_number, alias):
|
|
756
|
-
"""Stop a compute instance for an account.
|
|
1028
|
+
"""Stop a compute instance for an account using Kafka (with REST fallback).
|
|
757
1029
|
|
|
758
1030
|
Args:
|
|
759
1031
|
account_number: Account number
|
|
@@ -762,17 +1034,33 @@ class Scaling:
|
|
|
762
1034
|
Returns:
|
|
763
1035
|
Tuple of (data, error, message) from API response
|
|
764
1036
|
"""
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
"
|
|
770
|
-
|
|
1037
|
+
logging.info("Stopping account compute for %s/%s", account_number, alias)
|
|
1038
|
+
|
|
1039
|
+
payload = {
|
|
1040
|
+
"account_number": account_number,
|
|
1041
|
+
"alias": alias,
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
def rest_fallback():
|
|
1045
|
+
path = f"/v1/compute/stop_account_compute/{account_number}/{alias}"
|
|
1046
|
+
resp = self.rpc.put(path=path)
|
|
1047
|
+
return self.handle_response(
|
|
1048
|
+
resp,
|
|
1049
|
+
"Compute instance stopped successfully",
|
|
1050
|
+
"Could not stop the compute instance",
|
|
1051
|
+
)
|
|
1052
|
+
|
|
1053
|
+
return self._hybrid_request(
|
|
1054
|
+
api="stop_account_compute",
|
|
1055
|
+
payload=payload,
|
|
1056
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
1057
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
1058
|
+
rest_fallback_func=rest_fallback
|
|
771
1059
|
)
|
|
772
1060
|
|
|
773
1061
|
@log_errors(log_error=True)
|
|
774
1062
|
def restart_account_compute(self, account_number, alias):
|
|
775
|
-
"""Restart a compute instance for an account.
|
|
1063
|
+
"""Restart a compute instance for an account using Kafka (with REST fallback).
|
|
776
1064
|
|
|
777
1065
|
Args:
|
|
778
1066
|
account_number: Account number
|
|
@@ -781,12 +1069,28 @@ class Scaling:
|
|
|
781
1069
|
Returns:
|
|
782
1070
|
Tuple of (data, error, message) from API response
|
|
783
1071
|
"""
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
"
|
|
789
|
-
|
|
1072
|
+
logging.info("Restarting account compute for %s/%s", account_number, alias)
|
|
1073
|
+
|
|
1074
|
+
payload = {
|
|
1075
|
+
"account_number": account_number,
|
|
1076
|
+
"alias": alias,
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
def rest_fallback():
|
|
1080
|
+
path = f"/v1/compute/restart_account_compute/{account_number}/{alias}"
|
|
1081
|
+
resp = self.rpc.put(path=path)
|
|
1082
|
+
return self.handle_response(
|
|
1083
|
+
resp,
|
|
1084
|
+
"Compute instance restarted successfully",
|
|
1085
|
+
"Could not restart the compute instance",
|
|
1086
|
+
)
|
|
1087
|
+
|
|
1088
|
+
return self._hybrid_request(
|
|
1089
|
+
api="restart_account_compute",
|
|
1090
|
+
payload=payload,
|
|
1091
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
1092
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
1093
|
+
rest_fallback_func=rest_fallback
|
|
790
1094
|
)
|
|
791
1095
|
|
|
792
1096
|
@log_errors(log_error=True)
|
|
@@ -810,37 +1114,59 @@ class Scaling:
|
|
|
810
1114
|
|
|
811
1115
|
@log_errors(log_error=True)
|
|
812
1116
|
def get_all_instances_type(self):
|
|
813
|
-
"""Get all instance types.
|
|
1117
|
+
"""Get all instance types using Kafka (with REST fallback).
|
|
814
1118
|
|
|
815
1119
|
Returns:
|
|
816
1120
|
Tuple of (data, error, message) from API response
|
|
817
1121
|
"""
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
1122
|
+
payload = {}
|
|
1123
|
+
|
|
1124
|
+
def rest_fallback():
|
|
1125
|
+
path = "/v1/compute/get_all_instances_type"
|
|
1126
|
+
resp = self.rpc.get(path=path)
|
|
1127
|
+
return self.handle_response(
|
|
1128
|
+
resp,
|
|
1129
|
+
"All instance types fetched successfully",
|
|
1130
|
+
"Could not fetch the instance types",
|
|
1131
|
+
)
|
|
1132
|
+
|
|
1133
|
+
return self._hybrid_request(
|
|
1134
|
+
api="get_all_instances_type",
|
|
1135
|
+
payload=payload,
|
|
1136
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
1137
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
1138
|
+
rest_fallback_func=rest_fallback
|
|
824
1139
|
)
|
|
825
1140
|
|
|
826
1141
|
@log_errors(log_error=True)
|
|
827
1142
|
def get_compute_details(self):
|
|
828
|
-
"""Get compute instance details.
|
|
1143
|
+
"""Get compute instance details using Kafka (with REST fallback).
|
|
829
1144
|
|
|
830
1145
|
Returns:
|
|
831
1146
|
Tuple of (data, error, message) from API response
|
|
832
1147
|
"""
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
1148
|
+
payload = {"instance_id": self.instance_id}
|
|
1149
|
+
|
|
1150
|
+
def rest_fallback():
|
|
1151
|
+
path = f"/v1/scaling/get_compute_details/{self.instance_id}"
|
|
1152
|
+
resp = self.rpc.get(path=path)
|
|
1153
|
+
return self.handle_response(
|
|
1154
|
+
resp,
|
|
1155
|
+
"Compute details fetched successfully",
|
|
1156
|
+
"Could not fetch the compute details",
|
|
1157
|
+
)
|
|
1158
|
+
|
|
1159
|
+
return self._hybrid_request(
|
|
1160
|
+
api="get_compute_details",
|
|
1161
|
+
payload=payload,
|
|
1162
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
1163
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
1164
|
+
rest_fallback_func=rest_fallback
|
|
839
1165
|
)
|
|
840
1166
|
|
|
841
1167
|
@log_errors(log_error=True)
|
|
842
1168
|
def get_user_access_key_pair(self, user_id):
|
|
843
|
-
"""Get user access key pair.
|
|
1169
|
+
"""Get user access key pair using Kafka (with REST fallback).
|
|
844
1170
|
|
|
845
1171
|
Args:
|
|
846
1172
|
user_id: ID of the user
|
|
@@ -848,17 +1174,28 @@ class Scaling:
|
|
|
848
1174
|
Returns:
|
|
849
1175
|
Tuple of (data, error, message) from API response
|
|
850
1176
|
"""
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
1177
|
+
payload = {"user_id": user_id, "instance_id": self.instance_id}
|
|
1178
|
+
|
|
1179
|
+
def rest_fallback():
|
|
1180
|
+
path = f"/v1/compute/get_user_access_key_pair/{user_id}/{self.instance_id}"
|
|
1181
|
+
resp = self.rpc.get(path=path)
|
|
1182
|
+
return self.handle_response(
|
|
1183
|
+
resp,
|
|
1184
|
+
"User access key pair fetched successfully",
|
|
1185
|
+
"Could not fetch the user access key pair",
|
|
1186
|
+
)
|
|
1187
|
+
|
|
1188
|
+
return self._hybrid_request(
|
|
1189
|
+
api="get_user_access_key_pair",
|
|
1190
|
+
payload=payload,
|
|
1191
|
+
request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
|
|
1192
|
+
response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
|
|
1193
|
+
rest_fallback_func=rest_fallback
|
|
857
1194
|
)
|
|
858
1195
|
|
|
859
1196
|
@log_errors(log_error=True)
|
|
860
1197
|
def get_internal_api_key(self, action_id):
|
|
861
|
-
"""Get internal API key.
|
|
1198
|
+
"""Get internal API key using Kafka (with REST fallback).
|
|
862
1199
|
|
|
863
1200
|
Args:
|
|
864
1201
|
action_id: ID of the action
|
|
@@ -866,107 +1203,22 @@ class Scaling:
|
|
|
866
1203
|
Returns:
|
|
867
1204
|
Tuple of (data, error, message) from API response
|
|
868
1205
|
"""
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
1206
|
+
payload = {"action_id": action_id, "instance_id": self.instance_id}
|
|
1207
|
+
|
|
1208
|
+
def rest_fallback():
|
|
1209
|
+
path = f"/v1/actions/get_internal_api_key/{action_id}/{self.instance_id}"
|
|
1210
|
+
resp = self.rpc.get(path=path)
|
|
1211
|
+
return self.handle_response(
|
|
1212
|
+
resp,
|
|
1213
|
+
"internal keys fetched successfully",
|
|
1214
|
+
"Could not fetch internal keys",
|
|
1215
|
+
)
|
|
1216
|
+
|
|
1217
|
+
return self._hybrid_request(
|
|
1218
|
+
api="get_internal_api_key",
|
|
1219
|
+
payload=payload,
|
|
1220
|
+
request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
|
|
1221
|
+
response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
|
|
1222
|
+
rest_fallback_func=rest_fallback
|
|
875
1223
|
)
|
|
876
1224
|
|
|
877
|
-
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
878
|
-
# @log_errors(log_error=True)
|
|
879
|
-
# def handle_kafka_response(self, msg, success_message, error_message):
|
|
880
|
-
# """
|
|
881
|
-
# Helper to process Kafka response messages in a consistent way.
|
|
882
|
-
# """
|
|
883
|
-
# if msg.get("status") == "success":
|
|
884
|
-
# data = msg.get("data")
|
|
885
|
-
# error = None
|
|
886
|
-
# message = success_message
|
|
887
|
-
# logging.info(message)
|
|
888
|
-
# else:
|
|
889
|
-
# data = msg.get("data")
|
|
890
|
-
# error = msg.get("error", "Unknown error")
|
|
891
|
-
# message = error_message
|
|
892
|
-
# logging.error("%s: %s", message, error)
|
|
893
|
-
# return data, error, message
|
|
894
|
-
|
|
895
|
-
# def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=60):
|
|
896
|
-
# """
|
|
897
|
-
# Helper to send a request to Kafka and wait for a response.
|
|
898
|
-
# Returns (data, error, message, kafka_response_received) where kafka_response_received is True if a response was received (even if error), False if transport error/timeout.
|
|
899
|
-
# """
|
|
900
|
-
# correlation_id = str(uuid.uuid4())
|
|
901
|
-
# request_message = {
|
|
902
|
-
# "correlationId": correlation_id,
|
|
903
|
-
# "api": api,
|
|
904
|
-
# "payload": payload,
|
|
905
|
-
# }
|
|
906
|
-
|
|
907
|
-
# consumer = KafkaConsumer(
|
|
908
|
-
# response_topic,
|
|
909
|
-
# bootstrap_servers=self.kafka_config["bootstrap_servers"],
|
|
910
|
-
# group_id=None,
|
|
911
|
-
# value_deserializer=lambda m: json.loads(m.decode("utf-8")),
|
|
912
|
-
# auto_offset_reset='latest',
|
|
913
|
-
# enable_auto_commit=True,
|
|
914
|
-
# )
|
|
915
|
-
|
|
916
|
-
# try:
|
|
917
|
-
# if hasattr(self.session.rpc, 'AUTH_TOKEN'):
|
|
918
|
-
# self.session.rpc.AUTH_TOKEN.set_bearer_token()
|
|
919
|
-
# auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
|
|
920
|
-
# auth_token = auth_token.replace("Bearer ", "")
|
|
921
|
-
# headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
|
|
922
|
-
# else:
|
|
923
|
-
# headers = None
|
|
924
|
-
# self.kafka_producer.send(request_topic, request_message, headers=headers)
|
|
925
|
-
# # self.kafka_producer.flush()
|
|
926
|
-
# logging.info("Sent %s request to Kafka topic %s", api, request_topic)
|
|
927
|
-
# except Exception as e:
|
|
928
|
-
# logging.error("Kafka producer error: %s", e)
|
|
929
|
-
# return None, f"Kafka producer error: {e}", "Kafka send failed", False
|
|
930
|
-
# try:
|
|
931
|
-
# start = time.time()
|
|
932
|
-
# while time.time() - start < timeout:
|
|
933
|
-
# # Poll for messages with a short timeout to avoid blocking forever
|
|
934
|
-
# message_batch = consumer.poll(timeout_ms=1000)
|
|
935
|
-
# if message_batch:
|
|
936
|
-
# for topic_partition, messages in message_batch.items():
|
|
937
|
-
# for message in messages:
|
|
938
|
-
# print("trying to fetch message")
|
|
939
|
-
# msg = message.value
|
|
940
|
-
# if msg.get("correlationId") == correlation_id:
|
|
941
|
-
# consumer.close()
|
|
942
|
-
# # Always treat a received response as final, even if error
|
|
943
|
-
# return self.handle_kafka_response(
|
|
944
|
-
# msg,
|
|
945
|
-
# f"Fetched via Kafka for {api}",
|
|
946
|
-
# f"Kafka error response for {api}"
|
|
947
|
-
# ) + (True,)
|
|
948
|
-
# else:
|
|
949
|
-
# print(f"No messages received, waiting... ({time.time() - start:.1f}s/{timeout}s)")
|
|
950
|
-
#
|
|
951
|
-
# consumer.close()
|
|
952
|
-
# logging.warning("Kafka response timeout for %s after %d seconds", api, timeout)
|
|
953
|
-
# return None, "Kafka response timeout", "Kafka response timeout", False
|
|
954
|
-
# except Exception as e:
|
|
955
|
-
# logging.error("Kafka consumer error: %s", e)
|
|
956
|
-
# return None, f"Kafka consumer error: {e}", "Kafka consumer error", False
|
|
957
|
-
|
|
958
|
-
# def _cache_failed_request(self, api, payload):
|
|
959
|
-
# """Cache the failed request for retry. Here, we use a simple file cache as a placeholder."""
|
|
960
|
-
# try:
|
|
961
|
-
# cache_file = os.path.join(os.path.dirname(__file__), 'request_cache.json')
|
|
962
|
-
# if os.path.exists(cache_file):
|
|
963
|
-
# with open(cache_file, 'r') as f:
|
|
964
|
-
# cache = json.load(f)
|
|
965
|
-
# else:
|
|
966
|
-
# cache = []
|
|
967
|
-
# cache.append({"api": api, "payload": payload, "ts": time.time()})
|
|
968
|
-
# with open(cache_file, 'w') as f:
|
|
969
|
-
# json.dump(cache, f)
|
|
970
|
-
# logging.info("Cached failed request for api %s", api)
|
|
971
|
-
# except Exception as e:
|
|
972
|
-
# logging.error("Failed to cache request: %s", e)
|