matrice-compute 0.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,470 @@
1
+ """Module providing instance_manager functionality."""
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import subprocess
7
+ import threading
8
+ import time
9
+ from kafka import KafkaProducer
10
+ from matrice_compute.actions_manager import ActionsManager
11
+ from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
12
+ from matrice_compute.compute_operations_handler import ComputeOperationsHandler
13
+ from matrice_compute.instance_utils import (
14
+ get_instance_info,
15
+ get_decrypted_access_key_pair,
16
+ )
17
+ from matrice_compute.resources_tracker import (
18
+ MachineResourcesTracker,
19
+ ActionsResourcesTracker,
20
+ KafkaResourceMonitor,
21
+ )
22
+ from matrice_compute.scaling import Scaling
23
+ from matrice_compute.shutdown_manager import ShutdownManager
24
+ from matrice_common.session import Session
25
+ from matrice_common.utils import log_errors
26
+
27
+
28
class InstanceManager:
    """Class for managing compute instances and their associated actions.

    On construction it resolves credentials into environment variables,
    builds a ``Session`` and a ``Scaling`` client, and creates the actions
    manager, scale-down manager, shutdown manager and resource trackers.
    It also tries to create a Kafka resource monitor and a compute
    operations handler; either may be ``None`` if its construction fails.

    ``start()`` launches the background monitors plus two worker threads
    (the instance manager loop and the actions manager loop).
    """

    # Kafka topic that receives the periodic container status snapshots.
    _CONTAINER_STATUS_TOPIC = "compute_container_status"
    # Seconds between two snapshots; also the delay after a failed attempt.
    _CONTAINER_STATUS_INTERVAL_SECONDS = 30
    # Upper bound, in seconds, for a single ``docker ps`` invocation.
    _DOCKER_PS_TIMEOUT_SECONDS = 30

    def __init__(
        self,
        matrice_access_key_id: str = "",
        matrice_secret_access_key: str = "",
        encryption_key: str = "",
        instance_id: str = "",
        service_provider: str = "",
        env: str = "",
        gpus: str = "",
        workspace_dir: str = "matrice_workspace",
        enable_kafka: bool = False,
    ):
        """Initialize an instance manager.

        Args:
            matrice_access_key_id (str): Access key ID for Matrice authentication.
                Defaults to empty string.
            matrice_secret_access_key (str): Secret access key for Matrice
                authentication. Defaults to empty string.
            encryption_key (str): Key used for encrypting sensitive data.
                Defaults to empty string.
            instance_id (str): Unique identifier for this compute instance.
                Defaults to empty string.
            service_provider (str): Cloud service provider being used.
                Defaults to empty string.
            env (str): Environment name (e.g. dev, prod).
                Defaults to empty string.
            gpus (str): GPU configuration string (e.g. "0,1").
                Defaults to empty string.
            workspace_dir (str): Directory for workspace files.
                Defaults to "matrice_workspace".
            enable_kafka (bool): Enable Kafka communication (default False).

        Raises:
            Exception: Propagated from ``_setup_env_credentials`` when the
                service provider, instance ID or credentials cannot be
                resolved.
        """
        # Also exports the resolved values to os.environ and sets
        # self.encryption_key as a side effect.
        self.session = self._setup_env_credentials(
            env,
            service_provider,
            instance_id,
            encryption_key,
            matrice_access_key_id,
            matrice_secret_access_key,
        )
        os.environ["WORKSPACE_DIR"] = str(workspace_dir)
        # Stored JSON-encoded, so a plain string ends up wrapped in quotes.
        os.environ["GPUS"] = json.dumps(gpus)

        self.scaling = Scaling(
            self.session,
            os.environ.get("INSTANCE_ID"),
            enable_kafka,
        )
        logging.info("InstanceManager initialized with scaling")

        jupyter_token = os.environ.get("JUPYTER_TOKEN")
        if jupyter_token:
            self.scaling.update_jupyter_token(jupyter_token)
            logging.info("InstanceManager updated Jupyter token")
        else:
            logging.warning("No Jupyter token found in environment variables")

        self.current_actions = {}
        self.actions_manager = ActionsManager(self.scaling)
        logging.info("InstanceManager initialized with actions manager")
        self.scale_down_manager = ActionsScaleDownManager(self.scaling)
        logging.info("InstanceManager initialized with scale down manager")
        self.shutdown_manager = ShutdownManager(self.scaling)
        logging.info("InstanceManager initialized with shutdown manager")
        self.machine_resources_tracker = MachineResourcesTracker(self.scaling)
        logging.info("InstanceManager initialized with machine resources tracker")
        self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
        logging.info("InstanceManager initialized with actions resources tracker")

        # Initialize Kafka resource monitor using the same internal Kafka as
        # scaling. Best effort: a failure only disables the monitor.
        try:
            kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
            self.kafka_resource_monitor = KafkaResourceMonitor(
                instance_id=os.environ.get("INSTANCE_ID"),
                kafka_bootstrap=kafka_bootstrap,
                interval_seconds=60,
            )
            logging.info(
                "InstanceManager initialized with Kafka resource monitor using internal Kafka: %s",
                kafka_bootstrap,
            )
        except Exception as e:
            logging.warning("Failed to initialize Kafka resource monitor: %s", e)
            self.kafka_resource_monitor = None

        # Initialize Compute Operations Handler for event-driven operations.
        # Best effort as well: a failure only disables the handler.
        try:
            instance_id = os.environ.get("INSTANCE_ID")
            self.compute_operations_handler = ComputeOperationsHandler(
                actions_manager=self.actions_manager,
                session=self.session,
                scaling=self.scaling,
                instance_id=instance_id,
            )
            logging.info(
                "InstanceManager initialized with Compute Operations Handler for instance ID: %s",
                instance_id,
            )
        except Exception as e:
            logging.warning("Failed to initialize Compute Operations Handler: %s", e)
            self.compute_operations_handler = None

        # Seconds slept between iterations of start_instance_manager().
        self.poll_interval = 10
        # Note: encryption_key is set in _setup_env_credentials

        # Container monitoring state; the thread and the producer are
        # created lazily by start_container_status_monitor() and its worker.
        self.container_monitor_thread = None
        self.container_monitor_running = False
        self.container_kafka_producer = None

        logging.info("InstanceManager initialized.")

        # Report the resources at startup; a failure here is logged only.
        try:
            self.scaling.report_architecture_info()
            logging.info("InstanceManager reported initial resources.")
        except Exception as exc:
            logging.error(
                "Error reporting initial resources: %s",
                str(exc),
            )

    @log_errors(default_return=None, raise_exception=True, log_error=True)
    def _setup_env_credentials(
        self,
        env: str,
        service_provider: str,
        instance_id: str,
        encryption_key: str,
        matrice_access_key_id: str,
        matrice_secret_access_key: str,
    ):
        """Set up environment credentials.

        Each setting is resolved in this order: explicit argument, existing
        environment variable, then (service provider and instance ID only)
        the value detected by ``get_instance_info``. Resolved values are
        written back to ``os.environ`` and ``self.encryption_key`` is set.

        Args:
            env (str): Environment name
            service_provider (str): Cloud service provider
            instance_id (str): Instance identifier
            encryption_key (str): Encryption key
            matrice_access_key_id (str): Matrice access key ID
            matrice_secret_access_key (str): Matrice secret access key

        Returns:
            Session: Initialized session object

        Raises:
            Exception: If SERVICE_PROVIDER / INSTANCE_ID cannot be resolved,
                or if the access key pair is missing or could not be
                decrypted.
        """
        try:
            auto_service_provider, auto_instance_id = get_instance_info(
                service_provider, instance_id
            )
        except Exception as exc:
            # Auto-detection is optional; fall back to arguments/environment.
            logging.error(
                "Error getting instance info: %s",
                str(exc),
            )
            auto_service_provider = ""
            auto_instance_id = ""

        manual_instance_info = {
            "ENV": env or os.environ.get("ENV"),
            "SERVICE_PROVIDER": service_provider
            or os.environ.get("SERVICE_PROVIDER")
            or auto_service_provider,
            "INSTANCE_ID": instance_id
            or os.environ.get("INSTANCE_ID")
            or auto_instance_id,
            "MATRICE_ENCRYPTION_KEY": encryption_key
            or os.environ.get("MATRICE_ENCRYPTION_KEY"),
            "MATRICE_ACCESS_KEY_ID": matrice_access_key_id
            or os.environ.get("MATRICE_ACCESS_KEY_ID"),
            "MATRICE_SECRET_ACCESS_KEY": matrice_secret_access_key
            or os.environ.get("MATRICE_SECRET_ACCESS_KEY"),
        }
        for key, value in manual_instance_info.items():
            if value is not None:
                os.environ[key] = str(value)

        if not (os.environ.get("SERVICE_PROVIDER") and os.environ.get("INSTANCE_ID")):
            raise Exception(
                "SERVICE_PROVIDER and INSTANCE_ID must be set as environment variables or passed as arguments"
            )

        self.encryption_key = manual_instance_info["MATRICE_ENCRYPTION_KEY"]
        access_key = manual_instance_info["MATRICE_ACCESS_KEY_ID"]
        secret_key = manual_instance_info["MATRICE_SECRET_ACCESS_KEY"]

        # Keys whose length differs from 21 are treated as encrypted and are
        # decrypted here (presumably 21 is the plaintext key length -
        # TODO confirm against the key issuer).
        if (
            self.encryption_key
            and access_key
            and secret_key
            and len(access_key) != 21
            and len(secret_key) != 21
        ):
            # Returns (None, None) when decryption fails, because the helper
            # is wrapped with raise_exception=False.
            access_key, secret_key = self._decrypt_access_key_pair(
                access_key,
                secret_key,
                self.encryption_key,
            )

        # os.environ only accepts str values; assigning None would surface
        # as an opaque TypeError, so fail with an actionable message instead.
        if access_key is None or secret_key is None:
            raise Exception(
                "MATRICE_ACCESS_KEY_ID and MATRICE_SECRET_ACCESS_KEY must be set as "
                "environment variables or passed as arguments, and must be decryptable "
                "with MATRICE_ENCRYPTION_KEY when encrypted"
            )

        os.environ["MATRICE_SECRET_ACCESS_KEY"] = secret_key
        os.environ["MATRICE_ACCESS_KEY_ID"] = access_key
        # The encryption key may legitimately be absent; skip the export
        # rather than writing None into the environment.
        if self.encryption_key is not None:
            os.environ["MATRICE_ENCRYPTION_KEY"] = self.encryption_key

        return Session(
            account_number="",
            secret_key=secret_key,
            access_key=access_key,
        )

    @log_errors(default_return=(None, None), raise_exception=False)
    def _decrypt_access_key_pair(
        self,
        enc_access_key: str,
        enc_secret_key: str,
        encryption_key: str = "",
    ) -> tuple:
        """Decrypt the access key pair.

        Thin wrapper around ``get_decrypted_access_key_pair``. The decorator
        is configured with ``raise_exception=False`` and
        ``default_return=(None, None)``, so callers must handle a
        ``(None, None)`` result.

        Args:
            enc_access_key (str): Encrypted access key
            enc_secret_key (str): Encrypted secret key
            encryption_key (str): Key for decryption. Defaults to empty string.

        Returns:
            tuple: Decrypted (access_key, secret_key) pair
        """
        return get_decrypted_access_key_pair(
            enc_access_key,
            enc_secret_key,
            encryption_key,
        )

    @log_errors(raise_exception=True, log_error=True)
    def start_instance_manager(self) -> None:
        """Run the instance manager loop.

        Loops forever: on each iteration it lets the shutdown manager decide
        based on whether any actions are currently running, refreshes the
        per-action resource usage, then sleeps ``self.poll_interval``
        seconds. Errors from either step are logged and do not stop the loop.
        """
        while True:
            try:
                has_running_actions = bool(self.actions_manager.get_current_actions())
                self.shutdown_manager.handle_shutdown(has_running_actions)
            except Exception as exc:
                logging.error(
                    "Error in shutdown_manager handle_shutdown: %s",
                    str(exc),
                )

            # NOTE: scale_down_manager.auto_scaledown_actions() and
            # machine_resources_tracker.update_available_resources() used to
            # be invoked here and are currently disabled.

            try:
                self.actions_resources_tracker.update_actions_resources()
            except Exception as exc:
                logging.error(
                    "Error in actions_resources_tracker update_actions_resources: %s",
                    str(exc),
                )

            time.sleep(self.poll_interval)

    @log_errors(raise_exception=False, log_error=True)
    def start_container_status_monitor(self):
        """Start the background container status monitoring.

        Does nothing when the monitor is already flagged as running;
        otherwise spawns a daemon thread running
        ``_container_status_monitor_worker``.
        """
        if self.container_monitor_running:
            logging.info("Container status monitor is already running")
            return

        self.container_monitor_running = True
        self.container_monitor_thread = threading.Thread(
            target=self._container_status_monitor_worker,
            daemon=True,
            name="ContainerStatusMonitor",
        )
        self.container_monitor_thread.start()
        logging.info("Started container status monitoring thread")

    @log_errors(raise_exception=False, log_error=True)
    def stop_container_status_monitor(self):
        """Stop the background container status monitoring.

        Clears the running flag, waits up to 10 seconds for the worker
        thread to finish, then closes the Kafka producer if one exists.
        """
        if not self.container_monitor_running:
            return

        logging.info("Stopping container status monitor...")
        self.container_monitor_running = False

        if self.container_monitor_thread:
            self.container_monitor_thread.join(timeout=10)

        if self.container_kafka_producer:
            self.container_kafka_producer.close()
            self.container_kafka_producer = None

        logging.info("Container status monitor stopped")

    def _container_status_monitor_worker(self):
        """Background worker function that monitors container status.

        Creates a Kafka producer, then repeatedly publishes a snapshot of
        ``docker ps -a`` until ``self.container_monitor_running`` is cleared.
        If Kafka is disabled or the producer cannot be created, the worker
        exits and clears the running flag so the monitor can be started
        again later.
        """
        try:
            if not self.scaling.enable_kafka:
                logging.warning(
                    "Container status monitor: Kafka is disabled, no monitoring will be performed"
                )
                # Nothing is running after this return; without the reset a
                # later start would be rejected as "already running".
                self.container_monitor_running = False
                return
            bootstrap_servers = self.scaling.get_kafka_bootstrap_servers()
            self.container_kafka_producer = KafkaProducer(
                bootstrap_servers=bootstrap_servers,
                value_serializer=lambda v: json.dumps(v).encode("utf-8"),
                max_block_ms=5000,  # Timeout if Kafka is down
            )
            logging.info("Container status monitor: Kafka producer initialized")
        except Exception as e:
            logging.error(
                "Container status monitor: Failed to initialize Kafka producer: %s",
                str(e),
            )
            self.container_monitor_running = False
            return

        instance_id = os.environ.get("INSTANCE_ID")
        logging.info("Container status monitor started for instance: %s", instance_id)

        while self.container_monitor_running:
            try:
                containers = self._list_containers()
                # None means docker ps failed; skip publishing this round.
                if containers is not None:
                    self._publish_container_status(instance_id, containers)
            except subprocess.TimeoutExpired:
                logging.error("Container status monitor: docker ps command timed out")
            except Exception as e:
                logging.error("Container status monitor: Unexpected error: %s", str(e))

            # Same delay after success and failure, always interruptible so
            # stop_container_status_monitor() is not held up by a long sleep.
            self._wait_while_monitor_running(self._CONTAINER_STATUS_INTERVAL_SECONDS)

        logging.info("Container status monitor worker stopped")

    def _list_containers(self):
        """Run ``docker ps -a`` and return the parsed containers.

        Returns:
            list | None: One dict per container, or ``None`` when the docker
            command exits with a non-zero status.

        Raises:
            subprocess.TimeoutExpired: If docker does not answer in time.
        """
        result = subprocess.run(
            # "{{json .}}" yields one JSON object per line on both old and
            # new Docker CLIs; the bare "json" shorthand is only understood
            # by recent releases.
            ["docker", "ps", "-a", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=self._DOCKER_PS_TIMEOUT_SECONDS,
        )
        if result.returncode != 0:
            logging.error(
                "Container status monitor: docker ps command failed: %s",
                result.stderr,
            )
            return None
        return self._parse_docker_ps_output(result.stdout)

    @staticmethod
    def _parse_docker_ps_output(stdout: str) -> list:
        """Convert line-delimited ``docker ps`` JSON into container dicts.

        Lines that are blank, not valid JSON, or not JSON objects are
        skipped so one bad line does not discard the whole snapshot.

        Args:
            stdout (str): Raw standard output of the docker command.

        Returns:
            list: One dict per container with normalized lowercase keys.
        """
        # Output key -> field name emitted by docker.
        field_map = (
            ("container_id", "ID"),
            ("image", "Image"),
            ("command", "Command"),
            ("created", "CreatedAt"),
            ("status", "Status"),
            ("ports", "Ports"),
            ("names", "Names"),
            ("size", "Size"),
            ("state", "State"),
            ("labels", "Labels"),
        )
        containers = []
        for line in stdout.strip().splitlines():
            if not line.strip():
                continue
            try:
                container_info = json.loads(line)
            except json.JSONDecodeError as e:
                logging.warning(
                    "Container status monitor: Failed to parse container info: %s",
                    str(e),
                )
                continue
            if not isinstance(container_info, dict):
                logging.warning(
                    "Container status monitor: Failed to parse container info: %s",
                    "expected a JSON object",
                )
                continue
            containers.append(
                {key: container_info.get(field, "") for key, field in field_map}
            )
        return containers

    def _publish_container_status(self, instance_id, containers: list) -> None:
        """Send one container status snapshot to Kafka.

        Args:
            instance_id: Identifier of this instance, included in the message.
            containers (list): Container dicts to publish.
        """
        # Read the attribute once: stop_container_status_monitor() may set
        # it to None from another thread.
        producer = self.container_kafka_producer
        if not producer:
            return

        status_message = {
            "timestamp": time.time(),
            "instance_id": instance_id,
            "container_count": len(containers),
            "containers": containers,
        }
        try:
            producer.send(self._CONTAINER_STATUS_TOPIC, status_message)
            logging.debug(
                "Container status monitor: Sent status for %d containers",
                len(containers),
            )
        except Exception as e:
            logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))

    def _wait_while_monitor_running(self, seconds: int) -> None:
        """Sleep up to ``seconds`` seconds, returning early once stopped.

        Args:
            seconds (int): Maximum number of one-second sleeps to perform.
        """
        for _ in range(seconds):
            if not self.container_monitor_running:
                break
            time.sleep(1)

    @log_errors(default_return=(None, None), raise_exception=True)
    def start(self) -> tuple:
        """Start the instance manager threads.

        Starts the optional Kafka resource monitor and compute operations
        handler (failures are logged, not raised), the container status
        monitor, and finally the two long-running worker threads.

        Returns:
            tuple: (instance_manager_thread, actions_manager_thread)
        """
        # Start Kafka resource monitor in background thread
        if self.kafka_resource_monitor:
            try:
                self.kafka_resource_monitor.start()
                logging.info("Started Kafka resource monitor")
            except Exception as exc:
                logging.error("Failed to start Kafka resource monitor: %s", str(exc))

        # Start Compute Operations Handler in background thread
        if self.compute_operations_handler:
            try:
                self.compute_operations_handler.start()
                logging.info("Started Compute Operations Handler")
            except Exception as exc:
                logging.error("Failed to start Compute Operations Handler: %s", str(exc))

        # Start Container Status Monitor in background thread
        try:
            self.start_container_status_monitor()
            logging.info("Started Container Status Monitor")
        except Exception as exc:
            logging.error("Failed to start Container Status Monitor: %s", str(exc))

        # Create and start the two long-running worker threads.
        instance_manager_thread = threading.Thread(
            target=self.start_instance_manager,
            name="InstanceManager",
        )
        instance_manager_thread.start()

        actions_manager_thread = threading.Thread(
            target=self.actions_manager.start_actions_manager,
            name="ActionsManager",
        )
        actions_manager_thread.start()

        return (
            instance_manager_thread,
            actions_manager_thread,
        )