matrice-compute 0.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1395 @@
1
+
2
+
3
+ import os
4
+ import logging
5
+ import json
6
+ import psutil
7
+ from matrice_common.utils import log_errors
8
+ from kafka import KafkaProducer, KafkaConsumer
9
+ import uuid
10
+ import time
11
+ import base64
12
+ import threading
13
+ import platform
14
+ import subprocess
15
+
16
+
17
+ class Scaling:
18
+
19
+ """Class providing scaling functionality for compute instances."""
20
+
21
def __init__(self, session, instance_id=None, enable_kafka=False):
    """Initialize Scaling instance.

    Args:
        session: Session object for making RPC calls; ``session.rpc`` is
            kept as ``self.rpc``.
        instance_id: ID of the compute instance (required).
        enable_kafka: Enable Kafka communication (default False). If the
            Kafka clients cannot be created, a warning is logged and the
            instance continues in REST-only mode.

    Raises:
        ValueError: If instance_id is not provided, or if the USED_PORTS
            environment variable contains a non-integer entry.
    """
    if not instance_id:
        msg = "Instance id not set for this instance. Cannot perform the operation for job-scheduler without instance id"
        logging.error(msg)
        raise ValueError(msg)
    self.instance_id = instance_id
    self.session = session
    self.rpc = session.rpc
    # USED_PORTS is a comma-separated list of ports already handed out.
    used_ports_str = os.environ.get("USED_PORTS", "")
    self.used_ports = {int(p) for p in used_ports_str.split(",") if p.strip()}

    # Kafka configuration and initialization
    self.enable_kafka = enable_kafka
    self.kafka_producer = None
    self.kafka_consumer = None
    self.kafka_thread = None
    self.kafka_running = False

    # Maps correlation_id to threading.Event for request/response matching
    self.pending_requests = {}
    # Maps correlation_id to response data
    self.response_map = {}
    self.response_lock = threading.Lock()

    if self.enable_kafka:
        try:
            self.kafka_config = {
                "bootstrap_servers": self.get_kafka_bootstrap_servers(),
                "action_request_topic": "action_requests",
                "action_response_topic": "action_responses",
                "compute_request_topic": "compute_requests",
                "compute_response_topic": "compute_responses"
            }

            # NOTE(review): get_kafka_bootstrap_servers is wrapped with
            # log_errors(default_return=None), so a failed lookup presumably
            # surfaces here as None -- fail fast with a clear message
            # instead of handing None to the Kafka clients.
            if not self.kafka_config["bootstrap_servers"]:
                raise ValueError("Kafka bootstrap servers could not be resolved")

            # Initialize single producer
            self.kafka_producer = KafkaProducer(
                bootstrap_servers=self.kafka_config["bootstrap_servers"],
                value_serializer=lambda v: json.dumps(v).encode("utf-8"),
                max_block_ms=5000  # Timeout if Kafka is down
            )

            # Initialize single consumer for both response topics
            self.kafka_consumer = KafkaConsumer(
                self.kafka_config["action_response_topic"],
                self.kafka_config["compute_response_topic"],
                bootstrap_servers=self.kafka_config["bootstrap_servers"],
                group_id=f"py_compute_{instance_id}",
                value_deserializer=lambda m: json.loads(m.decode("utf-8")),
                auto_offset_reset='latest',
                enable_auto_commit=True,
                consumer_timeout_ms=1000,  # Poll timeout
                session_timeout_ms=60000,  # Increase session timeout to 60s (default 30s)
                heartbeat_interval_ms=3000,  # Send heartbeat every 3s
                max_poll_interval_ms=300000  # Max time between polls: 5 minutes
            )

            # Start background thread to handle responses
            self.kafka_running = True
            self.kafka_thread = threading.Thread(target=self._kafka_response_listener, daemon=True)
            self.kafka_thread.start()

            logging.info(f"Kafka enabled with bootstrap servers: {self.kafka_config['bootstrap_servers']}")
        except Exception as e:
            logging.warning(f"Failed to initialize Kafka, will use REST API only: {e}")
            self.enable_kafka = False
            self.kafka_running = False
            # Close whatever was created before the failure so a partially
            # initialized client does not keep sockets/threads alive.
            for client in (self.kafka_consumer, self.kafka_producer):
                if client is None:
                    continue
                try:
                    client.close()
                except Exception as close_error:
                    logging.debug("Ignoring error while closing Kafka client: %s", close_error)
            self.kafka_producer = None
            self.kafka_consumer = None

    logging.info(
        "Initialized Scaling with instance_id: %s, Kafka enabled: %s",
        instance_id,
        self.enable_kafka
    )
104
+
105
@log_errors(default_return=None, log_error=True)
def get_kafka_bootstrap_servers(self):
    """Get Kafka bootstrap servers from API and decode base64 fields.

    Calls ``/v1/actions/get_kafka_info`` and base64-decodes the ``ip`` and
    ``port`` fields found under ``data`` in the response.

    Returns:
        str: Kafka bootstrap servers in format "ip:port"

    Raises:
        ValueError: If the response is empty or not marked successful.
            NOTE(review): the ``log_errors(default_return=None)`` decorator
            presumably turns this into a ``None`` return for callers --
            confirm against matrice_common.
    """
    path = "/v1/actions/get_kafka_info"
    response = self.rpc.get(path=path)
    if not response or not response.get("success"):
        # `response` can be None/empty here, so only read its message when
        # there is something to read from.
        reason = response.get("message", "No response") if response else "No response"
        raise ValueError(f"Failed to fetch Kafka config: {reason}")
    encoded_ip = response["data"]["ip"]
    encoded_port = response["data"]["port"]
    ip = base64.b64decode(encoded_ip).decode("utf-8")
    port = base64.b64decode(encoded_port).decode("utf-8")
    return f"{ip}:{port}"
126
+
127
@log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
def handle_response(self, resp, success_message, error_message):
    """Turn a raw API response into a ``(data, error, message)`` tuple.

    Args:
        resp: Response mapping from an API call; its "success", "data" and
            "message" entries are read.
        success_message: Message logged and returned when "success" is truthy
        error_message: Message logged and returned otherwise

    Returns:
        Tuple of (data, error, message); ``error`` is None on success and the
        response's "message" entry on failure.
    """
    succeeded = resp.get("success")
    data = resp.get("data")
    if not succeeded:
        error = resp.get("message")
        logging.error("%s: %s", error_message, error)
        return data, error, error_message
    logging.info(success_message)
    return data, None, success_message
150
+
151
def _kafka_response_listener(self):
    """
    Poll the Kafka consumer until ``self.kafka_running`` becomes false.

    Each received message carrying a ``correlationId`` that is registered in
    ``self.pending_requests`` is stored in ``self.response_map`` and the
    matching event is set so the waiting caller wakes up. Messages for
    unknown correlation IDs are ignored; messages without one are logged.
    """
    logging.info("Kafka response listener thread started")

    while self.kafka_running:
        try:
            # Poll for messages with 1 second timeout
            batch = self.kafka_consumer.poll(timeout_ms=1000)
            if not batch:
                continue

            for _partition, records in batch.items():
                for record in records:
                    try:
                        msg = record.value
                        correlation_id = msg.get("correlationId")

                        if not correlation_id:
                            logging.warning(f"Received Kafka message without correlationId: {msg}")
                            continue

                        with self.response_lock:
                            if correlation_id in self.pending_requests:
                                # Store response and signal waiting thread
                                self.response_map[correlation_id] = msg
                                self.pending_requests[correlation_id].set()
                                logging.debug(f"Received Kafka response for correlation_id: {correlation_id}")
                    except Exception as e:
                        # One bad message must not stop the rest of the batch.
                        logging.error(f"Error processing Kafka message: {e}")

        except Exception as e:
            if self.kafka_running:  # Only log if not shutting down
                logging.error(f"Error in Kafka response listener: {e}")
                time.sleep(1)  # Avoid tight loop on persistent errors

    logging.info("Kafka response listener thread stopped")
191
+
192
def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=5):
    """
    Send a request via Kafka and wait for response using the persistent consumer.

    A fresh correlation ID is registered in ``self.pending_requests`` before
    sending; the listener thread stores the matching response in
    ``self.response_map`` and sets the event. Both registrations are always
    removed before returning, whatever the outcome.

    Args:
        api: API name to call
        payload: Request payload dictionary
        request_topic: Kafka topic to send request to
        response_topic: Kafka topic to receive response from (not used, kept for signature)
        timeout: Timeout in seconds to wait for response

    Returns:
        Tuple of (data, error, message, kafka_success)
        kafka_success is True if response received, False if timeout/error
    """
    if not self.enable_kafka or not self.kafka_producer:
        return None, "Kafka not enabled", "Kafka not available", False

    correlation_id = str(uuid.uuid4())
    request_message = {
        "correlationId": correlation_id,
        "api": api,
        "payload": payload,
    }

    # Create event for this request
    event = threading.Event()

    with self.response_lock:
        self.pending_requests[correlation_id] = event

    try:
        # Add auth token if available
        headers = None
        if hasattr(self.session.rpc, 'AUTH_TOKEN'):
            self.session.rpc.AUTH_TOKEN.set_bearer_token()
            auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
            auth_token = auth_token.replace("Bearer ", "")
            headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]

        # Send request
        self.kafka_producer.send(request_topic, request_message, headers=headers)
        logging.info(f"Sent Kafka request for {api} with correlation_id: {correlation_id}")

        # Wait for response with timeout
        if not event.wait(timeout=timeout):
            logging.warning(f"Kafka response timeout for {api} after {timeout} seconds")
            return None, "Kafka response timeout", "Kafka response timeout", False

        with self.response_lock:
            response = self.response_map.pop(correlation_id, None)

        if not response:
            logging.warning(f"Kafka response received but missing data for {api}")
            return None, "Response missing data", "Kafka response error", False

        if response.get("status") == "success":
            data = response.get("data")
            logging.info(f"Kafka success for {api}")
            return data, None, f"Fetched via Kafka for {api}", True

        error = response.get("error", "Unknown error")
        logging.error(f"Kafka error response for {api}: {error}")
        return None, error, f"Kafka error response for {api}", True

    except Exception as e:
        logging.error(f"Kafka send error for {api}: {e}")
        return None, f"Kafka error: {e}", "Kafka send failed", False

    finally:
        # Always drop both registrations. A response that the listener
        # stored just after the wait expired would otherwise stay in
        # response_map forever.
        with self.response_lock:
            self.pending_requests.pop(correlation_id, None)
            self.response_map.pop(correlation_id, None)
268
+
269
def _hybrid_request(self, api, payload, request_topic, response_topic, rest_fallback_func, kafka_timeout=5):
    """
    Hybrid request method: try Kafka first, fall back to REST.

    REST is used when Kafka is disabled, when the Kafka transport fails or
    times out, and when Kafka delivers an error response.

    Args:
        api: API name
        payload: Request payload
        request_topic: Kafka request topic
        response_topic: Kafka response topic
        rest_fallback_func: Function to call for REST fallback (should return same format as handle_response)
        kafka_timeout: Seconds to wait for the Kafka response (default 5)

    Returns:
        Tuple of (data, error, message) matching the API response pattern
    """
    # Try Kafka first
    if self.enable_kafka:
        data, error, message, kafka_success = self._send_kafka_request(
            api, payload, request_topic, response_topic, timeout=kafka_timeout
        )

        if kafka_success and error is None:
            # Kafka succeeded
            return data, error, message

        # Kafka returned an error response (not transport error)
        if kafka_success and error is not None:
            logging.warning(f"Kafka returned error for {api}, falling back to REST")

    # Kafka failed or disabled, try REST
    logging.info(f"Using REST API for {api}")
    try:
        rest_response = rest_fallback_func()
    except Exception as e:
        # REST failed
        logging.error(f"REST API failed for {api}: {e}")
        return None, str(e), "REST API failed"

    # Only a 3-item tuple/list matches the (data, error, message) contract;
    # checking the type first avoids calling len() on unsized results and
    # rejects strings that merely happen to have three characters.
    if isinstance(rest_response, (tuple, list)) and len(rest_response) == 3:
        return rest_response

    logging.error(f"REST API returned unexpected format for {api}")
    return None, "Unexpected REST response format", "REST API error"
314
+
315
def shutdown(self):
    """Gracefully shutdown Kafka connections.

    Stops the listener loop, waits up to 5 seconds for the listener thread,
    then closes the consumer and the producer. A failure while closing one
    client is logged and does not prevent closing the other. Afterwards the
    instance is switched to REST-only mode, so calling this again is a no-op.
    """
    if not self.kafka_running:
        return

    logging.info("Shutting down Kafka connections...")
    self.kafka_running = False
    # Later requests must not be routed to a producer that is being closed.
    self.enable_kafka = False

    if self.kafka_thread:
        self.kafka_thread.join(timeout=5)

    for label, client in (("consumer", self.kafka_consumer), ("producer", self.kafka_producer)):
        if not client:
            continue
        try:
            client.close()
        except Exception as e:
            logging.warning("Error closing Kafka %s: %s", label, e)

    self.kafka_thread = None
    self.kafka_consumer = None
    self.kafka_producer = None

    logging.info("Kafka connections closed")
331
+
332
@log_errors(log_error=True)
def get_downscaled_ids(self):
    """Fetch the IDs of downscaled instances (Kafka first, REST fallback).

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Getting downscaled ids for instance %s", self.instance_id)

    def _fetch_over_rest():
        response = self.rpc.get(path=f"/v1/compute/down_scaled_ids/{self.instance_id}")
        return self.handle_response(
            response,
            "Downscaled ids info fetched successfully",
            "Could not fetch the Downscaled ids info",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_downscaled_ids",
        payload={"instance_id": self.instance_id},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_fetch_over_rest,
    )
359
+
360
@log_errors(default_return=(None, "API call failed", "Failed to stop instance"), log_error=True)
def stop_instance(self):
    """Request a non-forced stop of this instance (Kafka first, REST fallback).

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Stopping instance %s", self.instance_id)

    stop_request = {"_idInstance": self.instance_id, "isForcedStop": False}

    def _stop_over_rest():
        response = self.rpc.put(path="/v1/compute/compute_instance/stop", payload=stop_request)
        return self.handle_response(
            response,
            "Instance stopped successfully",
            "Could not stop the instance",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="stop_instance",
        payload=stop_request,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_stop_over_rest,
    )
390
+
391
@log_errors(log_error=True)
def update_jupyter_token(self, token=""):
    """Update Jupyter notebook token using Kafka (with REST fallback).

    Args:
        token: New Jupyter notebook token for this instance

    Returns:
        Tuple of (data, error, message) from API response
    """
    payload = {"token": token, "instance_id": self.instance_id}

    def rest_fallback():
        # The REST endpoint carries the instance id in the path, so only the
        # token is sent in the body.
        path = f"/v1/compute/update_jupyter_notebook_token/{self.instance_id}"
        resp = self.rpc.put(path=path, payload={"token": token})
        # Messages describe the token update; they previously reported a
        # "resources" update, copied from another endpoint.
        return self.handle_response(
            resp,
            "Jupyter token updated successfully",
            "Could not update the Jupyter token",
        )

    return self._hybrid_request(
        api="update_jupyter_token",
        payload=payload,
        request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
        response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
        rest_fallback_func=rest_fallback
    )
412
+
413
@log_errors(log_error=True)
def update_action_status(
    self,
    service_provider="",
    action_record_id="",
    isRunning=True,
    status="",
    docker_start_time=None,
    action_duration=0,
    cpuUtilisation=0.0,
    gpuUtilisation=0.0,
    memoryUtilisation=0.0,
    gpuMemoryUsed=0,
    createdAt=None,
    updatedAt=None,
):
    """Report the runtime status of an action (Kafka first, REST fallback).

    Args:
        service_provider: Provider of the service
        action_record_id: ID of the action record (required)
        isRunning: Whether action is running
        status: Status of the action
        docker_start_time: Start time of docker container
        action_duration: Duration of the action
        cpuUtilisation: CPU utilization value
        gpuUtilisation: GPU utilization value
        memoryUtilisation: Memory utilization value
        gpuMemoryUsed: GPU memory used
        createdAt: Creation timestamp
        updatedAt: Last update timestamp

    Returns:
        Tuple of (data, error, message) from API response; an error tuple is
        returned immediately when ``action_record_id`` is empty.
    """
    if not action_record_id:
        missing_id = "Action record id is required"
        return None, missing_id, missing_id

    logging.info("Updating action status for action %s", action_record_id)

    status_report = {
        "instanceID": self.instance_id,
        "serviceProvider": service_provider,
        "actionRecordId": action_record_id,
        "isRunning": isRunning,
        "status": status,
        "dockerContainerStartTime": docker_start_time,
        "cpuUtilisation": cpuUtilisation,
        "gpuUtilisation": gpuUtilisation,
        "memoryUtilisation": memoryUtilisation,
        "gpuMemoryUsed": gpuMemoryUsed,
        "actionDuration": action_duration,
        "createdAt": createdAt,
        "updatedAt": updatedAt,
    }

    def _report_over_rest():
        response = self.rpc.put(path="/v1/compute/update_action_status", payload=status_report)
        return self.handle_response(
            response,
            "Action status details updated successfully",
            "Could not update the action status details ",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="update_action_status",
        payload=status_report,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_report_over_rest,
    )
485
+
486
@log_errors(log_error=True)
def update_status(
    self,
    action_record_id,
    action_type,
    service_name,
    stepCode,
    status,
    status_description,
):
    """Update status of an action using Kafka (with REST fallback).

    Args:
        action_record_id: ID of the action record
        action_type: Type of action
        service_name: Name of the service
        stepCode: Code indicating step in process
        status: Status to update
        status_description: Description of the status

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Updating status for action %s", action_record_id)

    payload = {
        "_id": action_record_id,
        "action": action_type,
        "serviceName": service_name,
        "stepCode": stepCode,
        "status": status,
        "statusDescription": status_description,
    }

    def rest_fallback():
        url = "/v1/actions"
        resp = self.rpc.put(path=url, payload=payload)
        # Inspect the response instead of unconditionally reporting success,
        # so a rejected update is surfaced to the caller like in the other
        # REST fallbacks.
        return self.handle_response(
            resp,
            "Status updated",
            "Could not update the status",
        )

    return self._hybrid_request(
        api="update_action",
        payload=payload,
        request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
        response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
        rest_fallback_func=rest_fallback
    )
529
+
530
@log_errors(log_error=True)
def get_shutdown_details(self):
    """Fetch the shutdown details of this instance (Kafka first, REST fallback).

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Getting shutdown details for instance %s", self.instance_id)

    def _fetch_over_rest():
        response = self.rpc.get(path=f"/v1/compute/get_shutdown_details/{self.instance_id}")
        return self.handle_response(
            response,
            "Shutdown info fetched successfully",
            "Could not fetch the shutdown details",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_shutdown_details",
        payload={"instance_id": self.instance_id},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_fetch_over_rest,
    )
557
+
558
@log_errors(log_error=True)
def get_tasks_details(self):
    """Fetch the task details of this instance (Kafka first, REST fallback).

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Getting tasks details for instance %s", self.instance_id)

    def _fetch_over_rest():
        response = self.rpc.get(
            path=f"/v1/actions/fetch_instance_action_details/{self.instance_id}/action_details"
        )
        return self.handle_response(
            response,
            "Task details fetched successfully",
            "Could not fetch the task details",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["action_request_topic"]
        response_topic = self.kafka_config["action_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_tasks_details",
        payload={"instance_id": self.instance_id},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_fetch_over_rest,
    )
585
+
586
@log_errors(log_error=True)
def get_action_details(self, action_status_id):
    """Get details for a specific action using Kafka (with REST fallback).

    Args:
        action_status_id: ID of the action status to fetch

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Getting action details for action %s", action_status_id)

    payload = {"actionRecordId": action_status_id}

    def rest_fallback():
        path = f"/v1/actions/action/{action_status_id}/details"
        resp = self.rpc.get(path=path)
        # Messages name the action lookup; they previously said "Task
        # details", copied from get_tasks_details.
        return self.handle_response(
            resp,
            "Action details fetched successfully",
            "Could not fetch the action details",
        )

    return self._hybrid_request(
        api="get_action_details",
        payload=payload,
        request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
        response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
        rest_fallback_func=rest_fallback
    )
616
+
617
@log_errors(log_error=True)
def update_action(
    self,
    id="",
    step_code="",
    action_type="",
    status="",
    sub_action="",
    status_description="",
    service="",
    job_params=None,
):
    """Update an action using Kafka (with REST fallback).

    Args:
        id: Action ID
        step_code: Step code
        action_type: Type of action
        status: Status of the action
        sub_action: Sub-action details
        status_description: Description of the status
        service: Service name
        job_params: Job parameters dictionary (an empty dict is sent when omitted)

    Returns:
        Tuple of (data, error, message) from API response
    """
    if job_params is None:
        job_params = {}

    logging.info("Updating action %s", id)

    payload = {
        "_id": id,
        "stepCode": step_code,
        "action": action_type,
        "status": status,
        "subAction": sub_action,
        "statusDescription": status_description,
        "serviceName": service,
        "jobParams": job_params,
    }

    def rest_fallback():
        path = "/v1/actions"
        resp = self.rpc.put(path=path, payload=payload)
        # Messages describe the action update; they previously talked about
        # logging errors, which is not what this call does.
        return self.handle_response(
            resp,
            "Action updated successfully",
            "Could not update the action",
        )

    return self._hybrid_request(
        api="update_action",
        payload=payload,
        request_topic=self.kafka_config["action_request_topic"] if self.enable_kafka else None,
        response_topic=self.kafka_config["action_response_topic"] if self.enable_kafka else None,
        rest_fallback_func=rest_fallback
    )
676
+
677
+
678
@log_errors(log_error=True)
def assign_jobs(self, is_gpu):
    """Ask the backend to assign jobs to this instance (REST only).

    Args:
        is_gpu: Any value; its truthiness says whether this is a GPU
            instance and is sent as "true"/"false" in the request path.

    Returns:
        Tuple of (data, error, message) from API response
    """
    is_gpu_bool = bool(is_gpu)
    logging.info("Assigning jobs for instance %s (GPU: %s)", self.instance_id, is_gpu_bool)

    gpu_flag = "true" if is_gpu_bool else "false"
    response = self.rpc.get(path=f"/v1/actions/assign_jobs/{gpu_flag}/{self.instance_id}")
    return self.handle_response(
        response,
        "Pinged successfully",
        "Could not ping the scaling jobs",
    )
702
+
703
+
704
@log_errors(log_error=True)
def update_available_resources(
    self,
    availableCPU=0,
    availableGPU=0,
    availableMemory=0,
    availableGPUMemory=0,
):
    """Report the resources currently available on this instance.

    Kafka is tried first, with the REST endpoint as fallback.

    Args:
        availableCPU: Available CPU resources
        availableGPU: Available GPU resources
        availableMemory: Available memory
        availableGPUMemory: Available GPU memory

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Updating available resources for instance %s", self.instance_id)

    resources = {
        "instance_id": self.instance_id,
        "availableMemory": availableMemory,
        "availableCPU": availableCPU,
        "availableGPUMemory": availableGPUMemory,
        "availableGPU": availableGPU,
    }

    def _report_over_rest():
        response = self.rpc.put(
            path=f"/v1/compute/update_available_resources/{self.instance_id}",
            payload=resources,
        )
        return self.handle_response(
            response,
            "Resources updated successfully",
            "Could not update the resources",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="update_available_resources",
        payload=resources,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_report_over_rest,
    )
750
+
751
@log_errors(log_error=True)
def update_action_docker_logs(self, action_record_id, log_content):
    """Upload docker log content for an action (Kafka first, REST fallback).

    Args:
        action_record_id: ID of the action record
        log_content: Content of the logs to update

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Updating docker logs for action %s", action_record_id)

    log_update = {"actionRecordId": action_record_id, "logContent": log_content}

    def _upload_over_rest():
        response = self.rpc.put(path="/v1/actions/update_action_docker_logs", payload=log_update)
        return self.handle_response(
            response,
            "Docker logs updated successfully",
            "Could not update the docker logs",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["action_request_topic"]
        response_topic = self.kafka_config["action_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="update_action_docker_logs",
        payload=log_update,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_upload_over_rest,
    )
785
+
786
def update_action_container_id(self, action_record_id, container_id):
    """Record the container ID of an action through the REST API.

    This call goes straight to the REST endpoint; no Kafka request is made.

    Args:
        action_record_id: ID of the action record
        container_id: Container ID to update

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Updating container ID for action %s", action_record_id)

    response = self.rpc.put(
        path="/v1/actions/update_action_container_id",
        payload={
            "actionRecordId": action_record_id,
            "containerId": container_id,
        },
    )
    return self.handle_response(
        response,
        "Container ID updated successfully",
        "Could not update the container ID",
    )
810
+
811
@log_errors(log_error=True)
def get_docker_hub_credentials(self):
    """Fetch Docker Hub credentials (Kafka first, REST fallback).

    Returns:
        Tuple of (data, error, message) from API response
    """
    logging.info("Getting docker credentials")

    def _fetch_over_rest():
        response = self.rpc.get(path="/v1/compute/get_docker_hub_credentials")
        return self.handle_response(
            response,
            "Docker credentials fetched successfully",
            "Could not fetch the docker credentials",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    # The request itself carries no parameters.
    return self._hybrid_request(
        api="get_docker_hub_credentials",
        payload={},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_fetch_over_rest,
    )
838
+
839
@log_errors(log_error=True)
def get_open_ports_config(self):
    """Fetch the open-ports configuration of this instance.

    Kafka is tried first, with the REST endpoint as fallback.

    Returns:
        Tuple of (data, error, message) from API response
    """
    def _fetch_over_rest():
        response = self.rpc.get(path=f"/v1/compute/get_open_ports/{self.instance_id}")
        return self.handle_response(
            response,
            "Open ports config fetched successfully",
            "Could not fetch the open ports config",
        )

    # Topics are only looked up when Kafka is active.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_open_ports_config",
        payload={"instance_id": self.instance_id},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=_fetch_over_rest,
    )
864
+
865
@log_errors(default_return=None, log_error=True)
def get_open_port(self):
    """Reserve the first port in the configured range not yet in ``used_ports``.

    The range comes from the first entry of the open-ports config when that
    fetch succeeds; otherwise 8200-9000 is used. The upper bound is not
    itself tried (``range`` excludes it). The chosen port is added to
    ``self.used_ports`` and the USED_PORTS environment variable is rewritten.

    Returns:
        Port number if available, None otherwise
    """
    port_range = {"from": 8200, "to": 9000}
    try:
        config, fetch_error, _message = self.get_open_ports_config()
        if not fetch_error and config and config[0]:
            port_range = config[0]
        else:
            logging.warning("Using default port range 8200-9000 due to config fetch error")
    except Exception as exc:
        logging.warning(
            "Using default port range 8200-9000. Config fetch failed: %s",
            str(exc),
        )

    low, high = port_range["from"], port_range["to"]
    candidate = next((p for p in range(low, high) if p not in self.used_ports), None)
    if candidate is None:
        logging.error(
            "No available ports found in range %s-%s",
            low,
            high,
        )
        return None

    self.used_ports.add(candidate)
    # Persist the reservation in the environment variable as well.
    os.environ["USED_PORTS"] = ",".join(str(p) for p in self.used_ports)
    logging.info("Found available port: %s", candidate)
    return candidate
900
+
901
@log_errors(default_return="", log_error=False)
def get_data_processing_image(self):
    """Build the data processing image reference for the current environment.

    The ``ENV`` environment variable (default "prod") selects the repository
    name; the tag is always ``latest``.

    Returns:
        Full image name including repository and tag
    """
    logging.info("Getting data processing image")
    env_name = os.environ.get('ENV', 'prod')
    return f"285699223019.dkr.ecr.us-west-2.amazonaws.com/{env_name}-data-processing:latest"
910
+
911
@log_errors(log_error=True)
def get_model_secret_keys(self, secret_name):
    """Get model secret keys using Kafka (with REST fallback).

    Args:
        secret_name: Name of the secret

    Returns:
        Tuple of (data, error, message) from API response
    """
    from urllib.parse import quote

    payload = {"secret_name": secret_name}

    def rest_fallback():
        # Percent-encode the name so characters such as "&", "#", "/" or
        # spaces cannot break or alter the query string.
        encoded_name = quote(str(secret_name), safe="")
        path = f"/v1/compute/get_models_secret_keys?secret_name={encoded_name}"
        resp = self.rpc.get(path=path)
        return self.handle_response(
            resp,
            "Secret keys fetched successfully",
            "Could not fetch the secret keys",
        )

    return self._hybrid_request(
        api="get_model_secret_keys",
        payload=payload,
        request_topic=self.kafka_config["compute_request_topic"] if self.enable_kafka else None,
        response_topic=self.kafka_config["compute_response_topic"] if self.enable_kafka else None,
        rest_fallback_func=rest_fallback
    )
939
+
940
@log_errors(log_error=True)
def get_model_codebase(self, model_family_id):
    """Request the user code download path for a model family.

    Args:
        model_family_id: ID of the model family

    Returns:
        Result of ``self.handle_response`` for the GET request (documented
        in this class as a (data, error, message) tuple)
    """
    response = self.rpc.get(
        path=f"/v1/model_store/get_user_code_download_path/{model_family_id}"
    )
    success_msg = "Codebase fetched successfully"
    failure_msg = "Could not fetch the codebase"
    return self.handle_response(response, success_msg, failure_msg)
957
+
958
@log_errors(log_error=True)
def get_model_codebase_requirements(self, dockerId):
    """Request the user requirements download path for a docker ID.

    Args:
        dockerId: ID of the docker

    Returns:
        Result of ``self.handle_response`` for the GET request (documented
        in this class as a (data, error, message) tuple)
    """
    response = self.rpc.get(
        path=f"/v1/model_store/get_user_requirements_download_path/{dockerId}"
    )
    success_msg = "Codebase requirements fetched successfully"
    failure_msg = "Could not fetch the codebase requirements"
    return self.handle_response(response, success_msg, failure_msg)
975
+
976
@log_errors(log_error=True)
def get_model_codebase_script(self, model_family_id):
    """Request the user script download path for a model family.

    Args:
        model_family_id: ID of the model family

    Returns:
        Result of ``self.handle_response`` for the GET request (documented
        in this class as a (data, error, message) tuple)
    """
    # The ID is appended directly, matching the sibling model_store endpoints;
    # a literal ':' before the ID (a route-template leftover) would be sent
    # as part of the path parameter.
    path = f"/v1/model_store/get_user_script_download_path/{model_family_id}"
    resp = self.rpc.get(path=path)
    return self.handle_response(
        resp,
        "Codebase script fetched successfully",
        "Could not fetch the codebase script",
    )
993
+
994
@log_errors(log_error=True)
def add_account_compute_instance(
    self,
    account_number,
    alias,
    service_provider,
    instance_type,
    shut_down_time,
    lease_type,
    launch_duration,
):
    """Register a compute instance for an account via a POST request.

    Args:
        account_number: Account number
        alias: Instance alias
        service_provider: Cloud service provider
        instance_type: Type of instance
        shut_down_time: Time to shutdown
        lease_type: Type of lease
        launch_duration: Duration to launch

    Returns:
        Result of ``self.handle_response`` for the POST request (documented
        in this class as a (data, error, message) tuple)
    """
    # Snake-case arguments are sent under the camelCase keys used by the request body.
    body = {
        "accountNumber": account_number,
        "alias": alias,
        "serviceProvider": service_provider,
        "instanceType": instance_type,
        "shutDownTime": shut_down_time,
        "leaseType": lease_type,
        "launchDuration": launch_duration,
    }
    response = self.rpc.post(
        path="/v1/scaling/add_account_compute_instance",
        payload=body,
    )
    success_msg = "Compute instance added successfully"
    failure_msg = "Could not add the compute instance"
    return self.handle_response(response, success_msg, failure_msg)
1035
+
1036
@log_errors(log_error=True)
def stop_account_compute(self, account_number, alias):
    """Stop an account's compute instance through ``self._hybrid_request``.

    A REST fallback callable (PUT request) is handed to the hybrid request.

    Args:
        account_number: Account number
        alias: Instance alias

    Returns:
        Whatever ``self._hybrid_request`` returns; the REST fallback returns
        the result of ``self.handle_response``
    """
    logging.info("Stopping account compute for %s/%s", account_number, alias)

    def rest_fallback():
        response = self.rpc.put(
            path=f"/v1/compute/stop_account_compute/{account_number}/{alias}"
        )
        return self.handle_response(
            response,
            "Compute instance stopped successfully",
            "Could not stop the compute instance",
        )

    # Topics are only looked up when Kafka is enabled.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="stop_account_compute",
        payload={"account_number": account_number, "alias": alias},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=rest_fallback,
    )
1070
+
1071
@log_errors(log_error=True)
def restart_account_compute(self, account_number, alias):
    """Restart an account's compute instance through ``self._hybrid_request``.

    A REST fallback callable (PUT request) is handed to the hybrid request.

    Args:
        account_number: Account number
        alias: Instance alias

    Returns:
        Whatever ``self._hybrid_request`` returns; the REST fallback returns
        the result of ``self.handle_response``
    """
    logging.info("Restarting account compute for %s/%s", account_number, alias)

    def rest_fallback():
        response = self.rpc.put(
            path=f"/v1/compute/restart_account_compute/{account_number}/{alias}"
        )
        return self.handle_response(
            response,
            "Compute instance restarted successfully",
            "Could not restart the compute instance",
        )

    # Topics are only looked up when Kafka is enabled.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="restart_account_compute",
        payload={"account_number": account_number, "alias": alias},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=rest_fallback,
    )
1105
+
1106
@log_errors(log_error=True)
def delete_account_compute(self, account_number, alias):
    """Delete an account's compute instance via a DELETE request.

    Args:
        account_number: Account number
        alias: Instance alias

    Returns:
        Result of ``self.handle_response`` for the DELETE request
        (documented in this class as a (data, error, message) tuple)
    """
    response = self.rpc.delete(
        path=f"/v1/scaling/delete_account_compute/{account_number}/{alias}"
    )
    success_msg = "Compute instance deleted successfully"
    failure_msg = "Could not delete the compute instance"
    return self.handle_response(response, success_msg, failure_msg)
1124
+
1125
@log_errors(log_error=True)
def get_all_instances_type(self):
    """Fetch all instance types through ``self._hybrid_request``.

    A REST fallback callable (GET request) is handed to the hybrid request.

    Returns:
        Whatever ``self._hybrid_request`` returns; the REST fallback returns
        the result of ``self.handle_response``
    """
    def rest_fallback():
        response = self.rpc.get(path="/v1/compute/get_all_instances_type")
        return self.handle_response(
            response,
            "All instance types fetched successfully",
            "Could not fetch the instance types",
        )

    # Topics are only looked up when Kafka is enabled.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_all_instances_type",
        payload={},
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=rest_fallback,
    )
1150
+
1151
@log_errors(log_error=True)
def get_compute_details(self):
    """Fetch details of this compute instance through ``self._hybrid_request``.

    The request is keyed by ``self.instance_id``; a REST fallback callable
    (GET request) is handed to the hybrid request.

    Returns:
        Whatever ``self._hybrid_request`` returns; the REST fallback returns
        the result of ``self.handle_response``
    """
    def rest_fallback():
        response = self.rpc.get(
            path=f"/v1/compute/get_compute_details/{self.instance_id}"
        )
        return self.handle_response(
            response,
            "Compute details fetched successfully",
            "Could not fetch the compute details",
        )

    request_body = {"instance_id": self.instance_id}

    # Topics are only looked up when Kafka is enabled.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_compute_details",
        payload=request_body,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=rest_fallback,
    )
1176
+
1177
@log_errors(log_error=True)
def get_user_access_key_pair(self, user_id):
    """Fetch a user's access key pair through ``self._hybrid_request``.

    The request carries ``user_id`` and ``self.instance_id``; a REST
    fallback callable (GET request) is handed to the hybrid request.

    Args:
        user_id: ID of the user

    Returns:
        Whatever ``self._hybrid_request`` returns; the REST fallback returns
        the result of ``self.handle_response``
    """
    def rest_fallback():
        response = self.rpc.get(
            path=f"/v1/compute/get_user_access_key_pair/{user_id}/{self.instance_id}"
        )
        return self.handle_response(
            response,
            "User access key pair fetched successfully",
            "Could not fetch the user access key pair",
        )

    request_body = {"user_id": user_id, "instance_id": self.instance_id}

    # Topics are only looked up when Kafka is enabled.
    if self.enable_kafka:
        request_topic = self.kafka_config["compute_request_topic"]
        response_topic = self.kafka_config["compute_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_user_access_key_pair",
        payload=request_body,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=rest_fallback,
    )
1205
+
1206
+
1207
+
1208
def report_architecture_info(self):
    """Collect CPU, memory and GPU details of this host and POST them to the compute service.

    Detection is best-effort: a missing tool, unreadable file or unparsable
    output leaves the corresponding field at its placeholder value instead
    of aborting the report. When no NVIDIA or AMD GPU is detected, the GPU
    fields are reported as "None" (CPU-only instance).

    Returns:
        Result of ``self.handle_response`` for the POST request (documented
        in this class as a (data, error, message) tuple)
    """

    def run_command(cmd):
        """Return stdout of ``cmd``, or None when it is missing or fails."""
        try:
            completed = subprocess.run(cmd, capture_output=True, text=True)
        except (OSError, ValueError, subprocess.SubprocessError):
            # OSError covers a missing or non-executable binary;
            # ValueError covers output that cannot be decoded as text.
            return None
        return completed.stdout if completed.returncode == 0 else None

    def detect_cuda_version():
        """Return the release reported by ``nvcc --version``, or None."""
        output = run_command(["nvcc", "--version"])
        if output:
            for line in output.splitlines():
                if "release" in line:
                    return line.split("release")[-1].split(",")[0].strip()
        return None

    def arch_family_for(compute_cap):
        """Map an NVIDIA compute capability string (e.g. "8.6") to its family."""
        major_text, _, minor_text = compute_cap.partition(".")
        try:
            major = int(major_text)
            minor = int(minor_text) if minor_text else 0
        except ValueError:
            # nvidia-smi can print placeholders such as "[N/A]".
            return "Unknown"
        # Two families share a major version with an older one.
        if (major, minor) == (7, 5):
            return "Turing"
        if (major, minor) == (8, 9):
            return "Ada Lovelace"
        families = {
            5: "Maxwell",
            6: "Pascal",
            7: "Volta",
            8: "Ampere",
            9: "Hopper",
            10: "Blackwell",
            12: "Blackwell",
        }
        return families.get(major, "Unknown")

    machine = platform.machine()
    cpu_arch = {"x86_64": "x86", "aarch64": "arm64"}.get(machine, machine)
    cpu_name = None
    total_memory_gb = None
    gpu_provider = None
    gpu_arch = None
    cuda_version = None
    is_jetson = False
    gpu_arch_family = None

    # CPU name: prefer lscpu, then fall back to /proc/cpuinfo. The fallback
    # must also run when lscpu itself is not installed.
    lscpu_output = run_command(["lscpu"])
    if lscpu_output:
        for line in lscpu_output.splitlines():
            if "Model name:" in line:
                cpu_name = line.split("Model name:")[-1].strip()
                break
    if not cpu_name:
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_name = line.split(":")[-1].strip()
                        break
        except (OSError, ValueError):
            pass

    # Total memory in GB: psutil first, /proc/meminfo as a fallback.
    try:
        total_memory_bytes = psutil.virtual_memory().total
        total_memory_gb = round(total_memory_bytes / (1024 ** 3), 2)
    except Exception:
        # Deliberately broad: any psutil failure should trigger the fallback.
        try:
            with open("/proc/meminfo", "r") as f:
                for line in f:
                    if "MemTotal:" in line:
                        mem_kb = int(line.split()[1])
                        total_memory_gb = round(mem_kb / (1024 ** 2), 2)
                        break
        except (OSError, ValueError, IndexError):
            pass

    # Jetson detection first (avoid nvidia-smi on Jetson).
    try:
        with open("/proc/device-tree/model") as f:
            board_model = f.read().lower()
    except (OSError, ValueError):
        board_model = ""
    if "jetson" in board_model or "tegra" in board_model:
        is_jetson = True
        gpu_provider = "NVIDIA"
        cuda_version = detect_cuda_version()

    # If not Jetson, try NVIDIA through nvidia-smi.
    if not is_jetson:
        smi_output = run_command(
            ["nvidia-smi", "--query-gpu=name,compute_cap", "--format=csv,noheader"]
        )
        if smi_output is not None:
            gpu_provider = "NVIDIA"
            # Only the first GPU listed is reported.
            gpu_info = smi_output.strip().split("\n")[0].split(",")
            gpu_arch = gpu_info[0].strip() if len(gpu_info) > 0 else None
            gpu_compute_cap = gpu_info[1].strip() if len(gpu_info) > 1 else None
            if gpu_compute_cap:
                gpu_arch_family = arch_family_for(gpu_compute_cap)
            cuda_version = detect_cuda_version()

    # Try AMD if NVIDIA was not found. Only display/accelerator class
    # devices count: an AMD CPU also lists host bridges and other
    # "Advanced Micro Devices" entries that are not GPUs.
    if gpu_provider is None:
        lspci_output = run_command(["lspci"])
        if lspci_output:
            gpu_classes = (
                "VGA compatible controller",
                "3D controller",
                "Display controller",
                "Processing accelerators",
            )
            for line in lspci_output.splitlines():
                is_gpu_device = any(device_class in line for device_class in gpu_classes)
                if is_gpu_device and ("AMD" in line or "Advanced Micro Devices" in line):
                    gpu_provider = "AMD"
                    gpu_arch = line.strip()
                    break

    # gpu_provider is either "NVIDIA", "AMD" or None at this point.
    has_gpu = gpu_provider in ("NVIDIA", "AMD")
    payload = {
        "instance_id": self.instance_id,
        "cpu_architecture": cpu_arch,
        "cpu_name": cpu_name if cpu_name else "Unknown",
        "total_memory_gb": total_memory_gb if total_memory_gb else 0,
        "gpu_provider": gpu_provider if has_gpu else "None",
        "gpu_architecture": (gpu_arch_family if gpu_arch_family else "Unknown") if has_gpu else "None",
        "gpu": gpu_arch if has_gpu else "None",
        "cuda_version": (cuda_version if cuda_version else "N/A") if has_gpu else "N/A",
        "is_jetson": is_jetson if has_gpu else False,
    }

    path = "/v1/compute/report_architecture_info"
    resp = self.rpc.post(path=path, payload=payload)
    return self.handle_response(
        resp,
        "Architecture info reported successfully",
        "Could not report architecture info",
    )
1365
+
1366
+
1367
+
1368
@log_errors(log_error=True)
def get_internal_api_key(self, action_id):
    """Fetch the internal API key for an action through ``self._hybrid_request``.

    The request carries ``action_id`` and ``self.instance_id`` and uses the
    action request/response topics; a REST fallback callable (GET request)
    is handed to the hybrid request.

    Args:
        action_id: ID of the action

    Returns:
        Whatever ``self._hybrid_request`` returns; the REST fallback returns
        the result of ``self.handle_response``
    """
    def rest_fallback():
        response = self.rpc.get(
            path=f"/v1/actions/get_internal_api_key/{action_id}/{self.instance_id}"
        )
        return self.handle_response(
            response,
            "internal keys fetched successfully",
            "Could not fetch internal keys",
        )

    request_body = {"action_id": action_id, "instance_id": self.instance_id}

    # Topics are only looked up when Kafka is enabled.
    if self.enable_kafka:
        request_topic = self.kafka_config["action_request_topic"]
        response_topic = self.kafka_config["action_response_topic"]
    else:
        request_topic = response_topic = None

    return self._hybrid_request(
        api="get_internal_api_key",
        payload=request_body,
        request_topic=request_topic,
        response_topic=response_topic,
        rest_fallback_func=rest_fallback,
    )