matrice-compute 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,880 @@
+ """Module providing scaling functionality."""
+
+ import os
+ import logging
+ import uuid
+ import json
+ import time
+ import base64
+
+ from matrice_common.utils import log_errors
+ from kafka import KafkaProducer, KafkaConsumer
+
+
+ class Scaling:
+
+     """Class providing scaling functionality for compute instances."""
+
+     def __init__(self, session, instance_id=None):
+         """Initialize Scaling instance.
+
+         Args:
+             session: Session object for making RPC calls
+             instance_id: ID of the compute instance
+
+         Raises:
+             ValueError: If instance_id is not provided
+         """
+         if not instance_id:
+             msg = "Instance id not set for this instance. Cannot perform the operation for job-scheduler without instance id"
+             logging.error(msg)
+             raise ValueError(msg)
+         self.instance_id = instance_id
+         self.session = session
+         self.rpc = session.rpc
+         used_ports_str = os.environ.get("USED_PORTS", "")
+         self.used_ports = set(int(p) for p in used_ports_str.split(",") if p.strip())
+         logging.info(
+             "Initialized Scaling with instance_id: %s",
+             instance_id,
+         )
+         self.kafka_config = {
+             "bootstrap_servers": self.get_kafka_bootstrap_servers(),
+             "api_request_topic": "action_requests",
+             "api_response_topic": "action_responses",
+             "scaling_request_topic": "compute_requests",
+             "scaling_response_topic": "compute_responses",
+         }
+         self.kafka_producer = KafkaProducer(
+             bootstrap_servers=self.kafka_config["bootstrap_servers"],
+             value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+         )
+
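A minimal usage sketch for the constructor above. The session object is assumed to expose an rpc client with get/put/post/delete methods (as the class relies on below); the import path, helper, and instance id here are placeholders, not part of the package.

    import os
    from matrice_compute.scaling import Scaling  # import path assumed from the package layout

    # Ports already claimed by earlier processes can be seeded through USED_PORTS.
    os.environ["USED_PORTS"] = "8200,8201"

    session = get_platform_session()  # hypothetical helper returning an object with a .rpc attribute
    scaling = Scaling(session, instance_id="inst-0001")
    print(scaling.kafka_config["bootstrap_servers"])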
+     @log_errors(default_return=(None, "Error creating Kafka producer", "Kafka producer creation failed"), log_error=True)
+     def get_kafka_bootstrap_servers(self):
+         """Get Kafka bootstrap servers from the API and decode the base64-encoded fields."""
+         path = "/v1/actions/get_kafka_info"
+         response = self.rpc.get(path=path)
+         if not response or not response.get("success"):
+             message = response.get("message", "No response") if response else "No response"
+             raise ValueError(f"Failed to fetch Kafka config: {message}")
+         encoded_ip = response["data"]["ip"]
+         encoded_port = response["data"]["port"]
+         ip = base64.b64decode(encoded_ip).decode("utf-8")
+         port = base64.b64decode(encoded_port).decode("utf-8")
+         return f"{ip}:{port}"
+
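The payload shape of /v1/actions/get_kafka_info is inferred from the code above: the broker host and port arrive base64-encoded under data.ip and data.port. A small sketch of the decode step with illustrative values:

    import base64

    response = {
        "success": True,
        "data": {
            "ip": base64.b64encode(b"10.0.0.5").decode(),   # illustrative value
            "port": base64.b64encode(b"9092").decode(),
        },
    }
    ip = base64.b64decode(response["data"]["ip"]).decode("utf-8")
    port = base64.b64decode(response["data"]["port"]).decode("utf-8")
    assert f"{ip}:{port}" == "10.0.0.5:9092"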
+     @log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
+     def handle_response(self, resp, success_message, error_message):
+         """Helper function to handle an API response.
+
+         Args:
+             resp: Response from the API call
+             success_message: Message to log on success
+             error_message: Message to log on error
+
+         Returns:
+             Tuple of (data, error, message)
+         """
+         if resp.get("success"):
+             data = resp.get("data")
+             error = None
+             message = success_message
+             logging.info(message)
+         else:
+             data = resp.get("data")
+             error = resp.get("message")
+             message = error_message
+             logging.error("%s: %s", message, error)
+         return data, error, message
+
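Every wrapper below funnels its response through handle_response, so callers consume a uniform (data, error, message) triple. A sketch of the calling convention, assuming scaling is an initialized instance:

    data, error, message = scaling.get_compute_details()
    if error is None:
        print("compute details:", data)
    else:
        # message is the human-readable summary, error the server-side message
        print(f"{message}: {error}")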
+     @log_errors(log_error=True)
+     def get_downscaled_ids(self):
+         """Get IDs of downscaled instances.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         logging.info(
+             "Getting downscaled ids for instance %s",
+             self.instance_id,
+         )
+         path = f"/v1/scaling/down_scaled_ids/{self.instance_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Downscaled ids info fetched successfully",
+             "Could not fetch the downscaled ids info",
+         )
+
+     @log_errors(default_return=(None, "API call failed", "Failed to stop instance"), log_error=True)
+     def stop_instance(self):
+         """Stop the compute instance.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         logging.info(
+             "Stopping instance %s",
+             self.instance_id,
+         )
+         path = "/v1/compute/compute_instance/stop"
+         resp = self.rpc.put(
+             path=path,
+             payload={
+                 "_idInstance": self.instance_id,
+                 "isForcedStop": False,
+             },
+         )
+         return self.handle_response(
+             resp,
+             "Instance stopped successfully",
+             "Could not stop the instance",
+         )
+
+     @log_errors(log_error=True)
+     def update_jupyter_token(
+         self,
+         token="",
+     ):
+         """Update the Jupyter notebook token for the instance.
+
+         Args:
+             token: New Jupyter notebook token
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/update_jupyter_notebook_token/{self.instance_id}"
+         payload = {
+             "token": token,
+         }
+         resp = self.rpc.put(path=path, payload=payload)
+         return self.handle_response(
+             resp,
+             "Jupyter notebook token updated successfully",
+             "Could not update the Jupyter notebook token",
+         )
+
+     @log_errors(log_error=True)
+     def update_action_status(
+         self,
+         service_provider="",
+         action_record_id="",
+         isRunning=True,
+         status="",
+         docker_start_time=None,
+         action_duration=0,
+         cpuUtilisation=0.0,
+         gpuUtilisation=0.0,
+         memoryUtilisation=0.0,
+         gpuMemoryUsed=0,
+         createdAt=None,
+         updatedAt=None,
+     ):
+         """Update status of an action.
+
+         Args:
+             service_provider: Provider of the service
+             action_record_id: ID of the action record
+             isRunning: Whether the action is running
+             status: Status of the action
+             docker_start_time: Start time of the docker container
+             action_duration: Duration of the action
+             cpuUtilisation: CPU utilization percentage
+             gpuUtilisation: GPU utilization percentage
+             memoryUtilisation: Memory utilization percentage
+             gpuMemoryUsed: GPU memory used
+             createdAt: Creation timestamp
+             updatedAt: Last update timestamp
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         logging.info(
+             "Updating action status for action %s",
+             action_record_id,
+         )
+         path = "/v1/compute/update_action_status"
+         payload_scaling = {
+             "instanceID": self.instance_id,
+             "serviceProvider": service_provider,
+             "actionRecordId": action_record_id,
+             "isRunning": isRunning,
+             "status": status,
+             "dockerContainerStartTime": docker_start_time,
+             "cpuUtilisation": cpuUtilisation,
+             "gpuUtilisation": gpuUtilisation,
+             "memoryUtilisation": memoryUtilisation,
+             "gpuMemoryUsed": gpuMemoryUsed,
+             "actionDuration": action_duration,
+             "createdAt": createdAt,
+             "updatedAt": updatedAt,
+         }
+         resp = self.rpc.put(path=path, payload=payload_scaling)
+         return self.handle_response(
+             resp,
+             "Action status details updated successfully",
+             "Could not update the action status details",
+         )
+
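update_action_status is the natural hook for a periodic heartbeat while a container runs. The sketch below gathers host metrics with psutil purely as an illustration; the package itself does not require psutil, and the service provider and GPU figures are placeholders.

    import time
    import psutil  # illustrative dependency, not used by this module

    def report_heartbeat(scaling, action_record_id, started_at):
        scaling.update_action_status(
            service_provider="AWS",                  # placeholder value
            action_record_id=action_record_id,
            isRunning=True,
            status="running",
            docker_start_time=started_at,            # whatever timestamp format the backend expects
            action_duration=int(time.time() - started_at),
            cpuUtilisation=psutil.cpu_percent(),
            memoryUtilisation=psutil.virtual_memory().percent,
            gpuUtilisation=0.0,                      # fill from an NVML-style probe if available
            gpuMemoryUsed=0,
        )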
+     @log_errors(log_error=True)
+     def update_status(
+         self,
+         action_record_id,
+         action_type,
+         service_name,
+         stepCode,
+         status,
+         status_description,
+     ):
+         """Update status of an action record.
+
+         Args:
+             action_record_id: ID of the action record
+             action_type: Type of action
+             service_name: Name of the service
+             stepCode: Code indicating the step in the process
+             status: Status to update
+             status_description: Description of the status
+         """
+         logging.info(
+             "Updating status for action %s",
+             action_record_id,
+         )
+         url = "/v1/actions"
+         payload = {
+             "_id": action_record_id,
+             "action": action_type,
+             "serviceName": service_name,
+             "stepCode": stepCode,
+             "status": status,
+             "statusDescription": status_description,
+         }
+         self.rpc.put(path=url, payload=payload)
+
+     @log_errors(log_error=True)
+     def get_shutdown_details(self):
+         """Get shutdown details for the instance.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         logging.info(
+             "Getting shutdown details for instance %s",
+             self.instance_id,
+         )
+         path = f"/v1/compute/get_shutdown_details/{self.instance_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Shutdown info fetched successfully",
+             "Could not fetch the shutdown details",
+         )
+
+     @log_errors(log_error=True)
+     def get_tasks_details(self):
+         """Get task details for the instance.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         logging.info(
+             "Getting tasks details for instance %s",
+             self.instance_id,
+         )
+         path = f"/v1/actions/fetch_instance_action_details/{self.instance_id}/action_details"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Task details fetched successfully",
+             "Could not fetch the task details",
+         )
+
+     @log_errors(log_error=True)
+     def get_action_details(self, action_status_id):
+         """Get details for a specific action via Kafka, falling back to REST, then caching."""
+         logging.info("Getting action details for action %s", action_status_id)
+         api = "get_action_details"
+         payload = {"actionRecordId": action_status_id}
+         # Try Kafka first
+         data, error, message, kafka_response_received = self._send_kafka_request(
+             api=api,
+             payload=payload,
+             request_topic=self.kafka_config["api_request_topic"],
+             response_topic=self.kafka_config["api_response_topic"],
+             timeout=600,
+         )
+         if kafka_response_received:
+             return data, error, message
+
+         # Only if the Kafka transport failed or timed out, try REST
+         try:
+             path = f"/v1/actions/action/{action_status_id}/details"
+             resp = self.rpc.get(path=path)
+             return self.handle_response(
+                 resp,
+                 "Task details fetched successfully (REST fallback)",
+                 "Could not fetch the task details (REST fallback)",
+             )
+         except Exception as e:
+             logging.error("REST fallback failed: %s", e)
+             self._cache_failed_request(api, payload)
+             return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+
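The three possible outcomes of the Kafka-first flow above look the same to the caller: a (data, error, message) triple. Which transport produced it is only visible in the message text. A sketch with hypothetical application hooks:

    data, error, message = scaling.get_action_details("66a1b2c3d4e5f6a7b8c9d0e1")  # illustrative id
    if message == "Cached for retry":
        # Both Kafka and REST failed; the request was written to request_cache.json
        schedule_retry()   # hypothetical application-level hook
    elif error is None:
        run_action(data)   # hypothetical dispatcher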
+     @log_errors(log_error=True)
+     def update_action(
+         self,
+         id="",
+         step_code="",
+         action_type="",
+         status="",
+         sub_action="",
+         status_description="",
+         service="",
+         job_params=None,
+     ):
+         """Update an action via Kafka, falling back to REST, then caching."""
+         if job_params is None:
+             job_params = {}
+         logging.info("Updating action %s", id)
+         api = "update_action"
+         payload = {
+             "_id": id,
+             "stepCode": step_code,
+             "action": action_type,
+             "status": status,
+             "subAction": sub_action,
+             "statusDescription": status_description,
+             "serviceName": service,
+             "jobParams": job_params,
+         }
+         data, error, message, kafka_response_received = self._send_kafka_request(
+             api=api,
+             payload=payload,
+             request_topic=self.kafka_config["api_request_topic"],
+             response_topic=self.kafka_config["api_response_topic"],
+             timeout=600,
+         )
+         if kafka_response_received:
+             return data, error, message
+         try:
+             path = "/v1/actions"
+             resp = self.rpc.put(path=path, payload=payload)
+             return self.handle_response(
+                 resp,
+                 "Action updated successfully (REST fallback)",
+                 "Could not update the action (REST fallback)",
+             )
+         except Exception as e:
+             logging.error("REST fallback failed (update_action): %s", e)
+             self._cache_failed_request(api, payload)
+             return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+
+     @log_errors(log_error=True)
+     def assign_jobs(self, is_gpu):
+         """Assign jobs to the instance via Kafka, falling back to REST, then caching."""
+         logging.info("Assigning jobs for instance %s (GPU: %s)", self.instance_id, is_gpu)
+         api = "assign_jobs"
+         payload = {
+             "instanceID": self.instance_id,
+             "isGPUInstance": is_gpu,
+         }
+
+         data, error, message, kafka_response_received = self._send_kafka_request(
+             api=api,
+             payload=payload,
+             request_topic=self.kafka_config["api_request_topic"],
+             response_topic=self.kafka_config["api_response_topic"],
+             timeout=600,
+         )
+
+         if kafka_response_received:
+             return data, error, message
+
+         # Fallback to REST
+         try:
+             path = f"/v1/actions/assign_jobs/{str(is_gpu)}/{self.instance_id}"
+             resp = self.rpc.get(path=path)
+             return self.handle_response(
+                 resp,
+                 "Jobs assigned successfully (REST fallback)",
+                 "Could not assign jobs (REST fallback)",
+             )
+         except Exception as e:
+             logging.error("REST fallback failed (assign_jobs): %s", e)
+             self._cache_failed_request(api, payload)
+             return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+
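A compute agent would typically poll assign_jobs on an interval and dispatch whatever comes back. A minimal polling sketch, with the GPU flag and the dispatch step left as application-level assumptions:

    import time

    def poll_for_jobs(scaling, is_gpu=True, interval=30):
        while True:
            data, error, message = scaling.assign_jobs(is_gpu)
            if error is None and data:
                launch_job(data)  # hypothetical dispatcher
            time.sleep(interval)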
+     @log_errors(log_error=True)
+     def update_available_resources(
+         self,
+         availableCPU=0,
+         availableGPU=0,
+         availableMemory=0,
+         availableGPUMemory=0,
+     ):
+         """Update available resources for the instance via Kafka, falling back to REST, then caching."""
+         logging.info("Updating available resources for instance %s", self.instance_id)
+         payload = {
+             "instance_id": self.instance_id,
+             "availableMemory": availableMemory,
+             "availableCPU": availableCPU,
+             "availableGPUMemory": availableGPUMemory,
+             "availableGPU": availableGPU,
+         }
+         api = "update_available_resources"
+
+         data, error, message, kafka_response_received = self._send_kafka_request(
+             api=api,
+             payload=payload,
+             request_topic=self.kafka_config["scaling_request_topic"],
+             response_topic=self.kafka_config["scaling_response_topic"],
+             timeout=600,
+         )
+
+         if kafka_response_received:
+             return data, error, message
+         try:
+             path = f"/v1/scaling/update_available_resources/{self.instance_id}"
+             resp = self.rpc.put(path=path, payload=payload)
+             return self.handle_response(
+                 resp,
+                 "Resources updated successfully (REST fallback)",
+                 "Could not update the resources (REST fallback)",
+             )
+         except Exception as e:
+             logging.error("REST fallback failed (update_available_resources): %s", e)
+             self._cache_failed_request(api, payload)
+             return None, f"Failed to update available resources via Kafka and REST: {e}", "Cached for retry"
+
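A reporting sketch for update_available_resources; the free-capacity numbers come from psutil for illustration only, the memory unit is an assumption, and the GPU figures are placeholders for whatever NVML-style probe the agent actually uses:

    import psutil  # illustrative dependency

    def report_capacity(scaling):
        vm = psutil.virtual_memory()
        scaling.update_available_resources(
            availableCPU=psutil.cpu_count(logical=True),
            availableMemory=vm.available // (1024 * 1024),  # MiB, unit assumed
            availableGPU=0,            # placeholder
            availableGPUMemory=0,      # placeholder
        )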
+     @log_errors(log_error=True)
+     def update_action_docker_logs(self, action_record_id, log_content):
+         """Update docker logs for an action via Kafka, falling back to REST, then caching."""
+         logging.info("Updating docker logs for action %s", action_record_id)
+         api = "update_action_docker_logs"
+         payload = {
+             "actionRecordId": action_record_id,
+             "logContent": log_content,
+         }
+         data, error, message, kafka_response_received = self._send_kafka_request(
+             api=api,
+             payload=payload,
+             request_topic=self.kafka_config["api_request_topic"],
+             response_topic=self.kafka_config["api_response_topic"],
+             timeout=600,
+         )
+
+         if kafka_response_received:
+             return data, error, message
+         try:
+             path = "/v1/actions/update_action_docker_logs"
+             resp = self.rpc.put(path=path, payload=payload)
+             return self.handle_response(
+                 resp,
+                 "Docker logs updated successfully (REST fallback)",
+                 "Could not update the docker logs (REST fallback)",
+             )
+         except Exception as e:
+             logging.error("REST fallback failed (update_action_docker_logs): %s", e)
+             self._cache_failed_request(api, payload)
+             return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+
+     @log_errors(log_error=True)
+     def get_docker_hub_credentials(self):
+         """Get Docker Hub credentials.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         logging.info("Getting docker credentials")
+         path = "/v1/compute/get_docker_hub_credentials"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Docker credentials fetched successfully",
+             "Could not fetch the docker credentials",
+         )
+
+     @log_errors(log_error=True)
+     def get_open_ports_config(self):
+         """Get the open ports configuration.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/get_open_ports/{self.instance_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Open ports config fetched successfully",
+             "Could not fetch the open ports config",
+         )
+
+     @log_errors(default_return=None, log_error=True)
+     def get_open_port(self):
+         """Get an available open port.
+
+         Returns:
+             Port number if available, None otherwise
+         """
+         port_range = {"from": 8200, "to": 9000}
+         try:
+             resp, err, msg = self.get_open_ports_config()
+             if not err and resp and resp[0]:
+                 port_range = resp[0]
+             else:
+                 logging.warning("Using default port range 8200-9000 due to config fetch error")
+         except Exception as err:
+             logging.warning(
+                 "Using default port range 8200-9000. Config fetch failed: %s",
+                 str(err),
+             )
+         min_port = port_range["from"]
+         max_port = port_range["to"]
+         for port in range(min_port, max_port):
+             if port in self.used_ports:
+                 continue
+             self.used_ports.add(port)
+             os.environ["USED_PORTS"] = ",".join(str(p) for p in self.used_ports)
+             logging.info("Found available port: %s", port)
+             return port
+         logging.error(
+             "No available ports found in range %s-%s",
+             min_port,
+             max_port,
+         )
+         return None
+
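get_open_port only tracks ports this process has handed out (via the USED_PORTS environment variable); it does not probe the host. If a stricter guarantee is needed, a caller can verify the candidate is actually bindable before using it. A sketch of that extra check:

    import socket

    def is_bindable(port, host="0.0.0.0"):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind((host, port))
                return True
            except OSError:
                return False

    port = scaling.get_open_port()
    if port is not None and not is_bindable(port):
        port = scaling.get_open_port()  # try the next candidate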
+     @log_errors(default_return="", log_error=False)
+     def get_data_processing_image(self):
+         """Get the data processing image name.
+
+         Returns:
+             Full image name including repository and tag
+         """
+         logging.info("Getting data processing image")
+         return f"285699223019.dkr.ecr.us-west-2.amazonaws.com/{os.environ.get('ENV', 'prod')}-data-processing:latest"
+
+     @log_errors(log_error=True)
+     def get_model_secret_keys(self, secret_name):
+         """Get model secret keys.
+
+         Args:
+             secret_name: Name of the secret
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/get_models_secret_keys?secret_name={secret_name}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Secret keys fetched successfully",
+             "Could not fetch the secret keys",
+         )
+
+     @log_errors(log_error=True)
+     def get_model_codebase(self, model_family_id):
+         """Get the model codebase.
+
+         Args:
+             model_family_id: ID of the model family
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/model_store/get_user_code_download_path/{model_family_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Codebase fetched successfully",
+             "Could not fetch the codebase",
+         )
+
+     @log_errors(log_error=True)
+     def get_model_codebase_requirements(self, model_family_id):
+         """Get the model codebase requirements.
+
+         Args:
+             model_family_id: ID of the model family
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/model_store/get_user_requirements_download_path/{model_family_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Codebase requirements fetched successfully",
+             "Could not fetch the codebase requirements",
+         )
+
+     @log_errors(log_error=True)
+     def get_model_codebase_script(self, model_family_id):
+         """Get the model codebase script.
+
+         Args:
+             model_family_id: ID of the model family
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/model_store/get_user_script_download_path/{model_family_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Codebase script fetched successfully",
+             "Could not fetch the codebase script",
+         )
+
+     @log_errors(log_error=True)
+     def add_account_compute_instance(
+         self,
+         account_number,
+         alias,
+         service_provider,
+         instance_type,
+         shut_down_time,
+         lease_type,
+         launch_duration,
+     ):
+         """Add a compute instance for an account.
+
+         Args:
+             account_number: Account number
+             alias: Instance alias
+             service_provider: Cloud service provider
+             instance_type: Type of instance
+             shut_down_time: Time to shut down
+             lease_type: Type of lease
+             launch_duration: Duration to launch
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = "/v1/scaling/add_account_compute_instance"
+         payload = {
+             "accountNumber": account_number,
+             "alias": alias,
+             "serviceProvider": service_provider,
+             "instanceType": instance_type,
+             "shutDownTime": shut_down_time,
+             "leaseType": lease_type,
+             "launchDuration": launch_duration,
+         }
+         resp = self.rpc.post(path=path, payload=payload)
+         return self.handle_response(
+             resp,
+             "Compute instance added successfully",
+             "Could not add the compute instance",
+         )
+
+     @log_errors(log_error=True)
+     def stop_account_compute(self, account_number, alias):
+         """Stop a compute instance for an account.
+
+         Args:
+             account_number: Account number
+             alias: Instance alias
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/stop_account_compute/{account_number}/{alias}"
+         resp = self.rpc.put(path=path)
+         return self.handle_response(
+             resp,
+             "Compute instance stopped successfully",
+             "Could not stop the compute instance",
+         )
+
+     @log_errors(log_error=True)
+     def restart_account_compute(self, account_number, alias):
+         """Restart a compute instance for an account.
+
+         Args:
+             account_number: Account number
+             alias: Instance alias
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/restart_account_compute/{account_number}/{alias}"
+         resp = self.rpc.put(path=path)
+         return self.handle_response(
+             resp,
+             "Compute instance restarted successfully",
+             "Could not restart the compute instance",
+         )
+
+     @log_errors(log_error=True)
+     def delete_account_compute(self, account_number, alias):
+         """Delete a compute instance for an account.
+
+         Args:
+             account_number: Account number
+             alias: Instance alias
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/delete_account_compute/{account_number}/{alias}"
+         resp = self.rpc.delete(path=path)
+         return self.handle_response(
+             resp,
+             "Compute instance deleted successfully",
+             "Could not delete the compute instance",
+         )
+
+     @log_errors(log_error=True)
+     def get_all_instances_type(self):
+         """Get all instance types.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = "/v1/scaling/get_all_instances_type"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "All instance types fetched successfully",
+             "Could not fetch the instance types",
+         )
+
+     @log_errors(log_error=True)
+     def get_compute_details(self):
+         """Get compute instance details.
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/get_compute_details/{self.instance_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Compute details fetched successfully",
+             "Could not fetch the compute details",
+         )
+
+     @log_errors(log_error=True)
+     def get_user_access_key_pair(self, user_id):
+         """Get a user access key pair.
+
+         Args:
+             user_id: ID of the user
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/compute/get_user_access_key_pair/{user_id}/{self.instance_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "User access key pair fetched successfully",
+             "Could not fetch the user access key pair",
+         )
+
+     @log_errors(log_error=True)
+     def get_internal_api_key(self, action_id):
+         """Get an internal API key.
+
+         Args:
+             action_id: ID of the action
+
+         Returns:
+             Tuple of (data, error, message) from API response
+         """
+         path = f"/v1/scaling/get_internal_api_key/{action_id}/{self.instance_id}"
+         resp = self.rpc.get(path=path)
+         return self.handle_response(
+             resp,
+             "Internal keys fetched successfully",
+             "Could not fetch the internal keys",
+         )
+
+     @log_errors(log_error=True)
+     def handle_kafka_response(self, msg, success_message, error_message):
+         """Helper to process Kafka response messages in a consistent way."""
+         if msg.get("status") == "success":
+             data = msg.get("data")
+             error = None
+             message = success_message
+             logging.info(message)
+         else:
+             data = msg.get("data")
+             error = msg.get("error", "Unknown error")
+             message = error_message
+             logging.error("%s: %s", message, error)
+         return data, error, message
+
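The request and response message shapes are inferred from handle_kafka_response above and _send_kafka_request below: requests carry a correlationId, an api name, and a payload; responses echo the correlationId and add status, data, and error. Illustrative JSON documents:

    request = {
        "correlationId": "3f2b6c1e-...",   # uuid4, truncated here
        "api": "get_action_details",
        "payload": {"actionRecordId": "66a1b2c3d4e5f6a7b8c9d0e1"},
    }
    response = {
        "correlationId": "3f2b6c1e-...",   # must match the request to be picked up
        "status": "success",               # anything else is treated as an error
        "data": {"status": "running"},     # illustrative
        "error": None,
    }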
+     def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=600):
+         """
+         Helper to send a request over Kafka and wait for the matching response.
+
+         Returns (data, error, message, kafka_response_received), where
+         kafka_response_received is True if a response was received (even an error
+         response) and False on a transport error or timeout.
+         """
+         correlation_id = str(uuid.uuid4())
+         request_message = {
+             "correlationId": correlation_id,
+             "api": api,
+             "payload": payload,
+         }
+
+         consumer = KafkaConsumer(
+             response_topic,
+             bootstrap_servers=self.kafka_config["bootstrap_servers"],
+             group_id=None,
+             value_deserializer=lambda m: json.loads(m.decode("utf-8")),
+             auto_offset_reset="latest",
+             enable_auto_commit=True,
+             consumer_timeout_ms=timeout * 1000,  # stop iterating if no message arrives in time
+         )
+
+         try:
+             if hasattr(self.session.rpc, "AUTH_TOKEN"):
+                 self.session.rpc.AUTH_TOKEN.set_bearer_token()
+                 auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
+                 auth_token = auth_token.replace("Bearer ", "")
+                 headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
+             else:
+                 headers = None
+             self.kafka_producer.send(request_topic, request_message, headers=headers)
+             # self.kafka_producer.flush()
+             logging.info("Sent %s request to Kafka topic %s", api, request_topic)
+         except Exception as e:
+             logging.error("Kafka producer error: %s", e)
+             return None, f"Kafka producer error: {e}", "Kafka send failed", False
+         try:
+             start = time.time()
+             for message in consumer:
+                 msg = message.value
+                 if msg.get("correlationId") == correlation_id:
+                     consumer.close()
+                     # Always treat a received response as final, even if it is an error
+                     return self.handle_kafka_response(
+                         msg,
+                         f"Fetched via Kafka for {api}",
+                         f"Kafka error response for {api}",
+                     ) + (True,)
+                 if time.time() - start > timeout:
+                     break
+             consumer.close()
+             logging.warning("Kafka response timeout for %s", api)
+             return None, "Kafka response timeout", "Kafka response timeout", False
+         except Exception as e:
+             logging.error("Kafka consumer error: %s", e)
+             return None, f"Kafka consumer error: {e}", "Kafka consumer error", False
+
+     def _cache_failed_request(self, api, payload):
+         """Cache a failed request for later retry. A simple JSON file cache is used as a placeholder."""
+         try:
+             cache_file = os.path.join(os.path.dirname(__file__), "request_cache.json")
+             if os.path.exists(cache_file):
+                 with open(cache_file, "r") as f:
+                     cache = json.load(f)
+             else:
+                 cache = []
+             cache.append({"api": api, "payload": payload, "ts": time.time()})
+             with open(cache_file, "w") as f:
+                 json.dump(cache, f)
+             logging.info("Cached failed request for api %s", api)
+         except Exception as e:
+             logging.error("Failed to cache request: %s", e)