matrice-compute 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,226 @@
1
+ """Module providing actions_manager functionality."""
2
+
3
+ import logging
4
+ import os
5
+ import time
6
+ from matrice_compute.action_instance import (
7
+ ActionInstance,
8
+ )
9
+ from matrice_compute.instance_utils import (
10
+ has_gpu,
11
+ get_mem_usage,
12
+ cleanup_docker_storage,
13
+ )
14
+ from matrice_compute.scaling import (
15
+ Scaling,
16
+ )
17
+ from matrice_common.utils import log_errors
18
+
19
+
20
class ActionsManager:
    """Fetch assigned actions from the backend, start them and track them.

    The manager polls ``Scaling.assign_jobs`` for new actions, wraps each one
    in an ``ActionInstance``, and keeps the started instances in
    ``current_actions`` until they are purged.
    """

    # Action id that ``fetch_actions`` filters out.
    # NOTE(review): presumably the backend's "no job assigned" placeholder
    # record - confirm against the assign_jobs API.
    _EMPTY_ACTION_ID = "000000000000000000000000"

    # Upper bound, in seconds, for the sleep between two polls.
    _MAX_WAITING_TIME = 120

    def __init__(self, scaling: Scaling):
        """Initialize an action manager.

        Args:
            scaling (Scaling): Scaling service instance
        """
        # Maps action record id -> ActionInstance started for that action.
        self.current_actions: dict[str, ActionInstance] = {}
        self.scaling = scaling
        # New actions are only processed while memory usage is below this.
        self.memory_threshold = 0.9
        # Base value (seconds) from which the sleep between polls is derived.
        self.poll_interval = 10
        self.last_actions_check = 0
        logging.info("ActionsManager initialized")

    @log_errors(default_return=[], raise_exception=False)
    def fetch_actions(self) -> list:
        """Poll the backend for assigned actions.

        Falsy entries and entries whose ``_id`` equals the all-zero
        placeholder id are dropped.

        Returns:
            list: Fetched action dicts; empty when the backend reports an
                error or nothing usable was assigned.
        """
        actions = []
        logging.info("Polling backend for new jobs")
        fetched_actions, error, _ = self.scaling.assign_jobs(has_gpu())
        if error:
            logging.error("Error assigning jobs: %s", error)
            return actions
        # The backend may hand back a single action instead of a list.
        if not isinstance(fetched_actions, list):
            fetched_actions = [fetched_actions]
        for action in fetched_actions:
            if not action:
                continue
            if action["_id"] != self._EMPTY_ACTION_ID:
                actions.append(action)
        if actions:
            # Logged once per poll rather than once per appended action.
            logging.info(
                "Fetched action details: %s",
                actions,
            )
        return actions

    @log_errors(default_return=None, raise_exception=False)
    def process_action(self, action: dict) -> ActionInstance:
        """Mark the given action as started on the backend and execute it.

        Args:
            action (dict): Action details to process; ``_id`` and ``action``
                keys are read.

        Returns:
            ActionInstance: The started action instance. The ``log_errors``
                decorator is configured with ``default_return=None``, so
                presumably ``None`` comes back when any step raises.
        """
        action_id = action["_id"]
        service_provider = os.environ["SERVICE_PROVIDER"]
        logging.info(
            "Processing action: %s",
            action_id,
        )
        action_instance = ActionInstance(self.scaling, action)
        self.scaling.update_action_status(
            service_provider=service_provider,
            action_record_id=action_id,
            status="starting",
            action_duration=0,
        )
        logging.info("locking action")
        self.scaling.update_action_status(
            service_provider=service_provider,
            status="started",
            action_record_id=action_id,
            isRunning=True,
            action_duration=0,
            cpuUtilisation=0.0,
            gpuUtilisation=0.0,
            memoryUtilisation=0.0,
            gpuMemoryUsed=0,
        )
        self.scaling.update_status(
            action_id,
            action["action"],
            "bg-job-scheduler",
            "JBSS_LCK",
            "OK",
            "Job is locked for processing",
        )
        action_instance.execute()
        logging.info(
            "action %s started.",
            action_instance.action_record_id,
        )
        return action_instance

    @log_errors(raise_exception=False)
    def process_actions(self) -> None:
        """Fetch new actions and start tracking every one that was started."""
        for action in self.fetch_actions():
            action_instance = self.process_action(action)
            # A falsy result means the action was not started; don't track it.
            if action_instance:
                self.current_actions[action["_id"]] = action_instance

    @log_errors(raise_exception=False)
    def purge_unwanted(self) -> None:
        """Purge completed or failed actions.

        An action is removed from ``current_actions`` when either:
            1. its ``is_running()`` method reports it is not running, or
            2. it has no ``process`` attribute, or that attribute is ``None``.

        After removal, ``stop()`` is called on the instance when available;
        errors raised by ``stop()`` are logged and do not abort the purge.
        """
        purged_count = 0

        # Iterate over a snapshot because entries are deleted in the loop.
        for action_id, instance in list(self.current_actions.items()):
            if not instance.is_running():
                purge_reason = "process reported as not running"
            elif getattr(instance, "process", None) is None:
                purge_reason = "invalid process object"
            else:
                continue

            logging.info(
                "Action %s is being purged: %s",
                action_id,
                purge_reason,
            )

            # Stop tracking first so a failing stop() cannot leave the
            # action in the dictionary.
            del self.current_actions[action_id]
            purged_count += 1

            # Try to explicitly stop the action if possible.
            try:
                if hasattr(instance, "stop"):
                    instance.stop()
            except Exception as e:
                logging.error("Error stopping action %s: %s", action_id, e)

        if purged_count > 0:
            logging.info(
                "Purged %d completed actions, %d actions remain in queue",
                purged_count,
                len(self.current_actions),
            )

    @log_errors(default_return={}, raise_exception=False)
    def get_current_actions(self) -> dict:
        """Get the current actions.

        Calls ``purge_unwanted()`` first, then logs and returns whatever is
        still tracked.

        Returns:
            dict: The tracked actions keyed by action id, or a fresh empty
                dict when nothing is tracked.
        """
        # Always purge unwanted actions first
        self.purge_unwanted()
        if not self.current_actions:
            logging.debug("No actions currently running")
            return {}
        logging.info(
            "Currently running %d actions: %s",
            len(self.current_actions),
            list(self.current_actions.keys()),
        )
        return self.current_actions

    def _compute_waiting_time(self, mem_usage: float) -> int:
        """Return the whole seconds to sleep before the next poll.

        The wait grows as ``mem_usage`` approaches ``memory_threshold`` and
        is capped at ``_MAX_WAITING_TIME``.

        Args:
            mem_usage (float): Current memory usage, on the same scale as
                ``memory_threshold``.

        Returns:
            int: Seconds to sleep.
        """
        # Clamp the headroom so usage at or above the threshold neither
        # divides by zero nor produces a negative wait.
        headroom = max(0.001, self.memory_threshold - mem_usage)
        return int(min(self.poll_interval / headroom, self._MAX_WAITING_TIME))

    @log_errors(raise_exception=True)
    def start_actions_manager(self) -> None:
        """Start the actions manager main loop.

        Runs forever: each iteration reads memory usage, processes new
        actions when usage is below ``memory_threshold``, then sleeps.
        Errors inside an iteration are logged and the loop continues.
        """
        while True:
            # Default so the sleep below always has a value, even when the
            # memory probe raises before a wait could be computed.
            waiting_time = self.poll_interval
            try:
                mem_usage = get_mem_usage()
                # mem_usage is compared against a 0.9 threshold, so it is a
                # fraction; "%d" would always print 0.
                logging.info("Memory usage: %.2f", mem_usage)
                waiting_time = self._compute_waiting_time(mem_usage)
                if mem_usage < self.memory_threshold:
                    self.process_actions()
                    logging.info(
                        "Waiting for %d seconds before next poll",
                        waiting_time,
                    )
                else:
                    logging.info(
                        "Memory threshold exceeded, waiting for %d seconds",
                        waiting_time,
                    )
                    # NOTE(review): assumed to belong to the over-threshold
                    # branch; the source's indentation was lost - confirm.
                    cleanup_docker_storage()
            except Exception as e:
                logging.error("Error in actions manager: %s", e)
            time.sleep(waiting_time)
@@ -0,0 +1,57 @@
1
+ """Module providing actions_scaledown_manager functionality."""
2
+
3
+ import logging
4
+ import docker
5
+ from matrice_compute.scaling import (
6
+ Scaling,
7
+ )
8
+ from matrice_common.utils import log_errors
9
+
10
+
11
class ActionsScaleDownManager:
    """Class for managing container scale down operations."""

    def __init__(self, scaling: Scaling):
        """Initialize the scale down manager.

        Args:
            scaling (Scaling): Scaling service instance
        """
        self.docker_client = docker.from_env()
        self.scaling = scaling

    @log_errors(raise_exception=False, log_error=True)
    def auto_scaledown_actions(self) -> None:
        """Stop running containers whose action was scaled down.

        Asks the scaling service for the downscaled ids, then inspects every
        running container. The first 24-character entry in a container's
        ``Args`` is treated as its action record id; the container is stopped
        when that id is among the downscaled ids.

        A Docker API failure on one container is logged and does not prevent
        the remaining containers from being handled.
        """
        down_scaled_jobs, error, _ = self.scaling.get_downscaled_ids()
        if error is not None:
            logging.error(
                "Error getting downscaled ids: %s",
                error,
            )
            return
        # Nothing to scale down: skip the Docker round trips entirely.
        if not down_scaled_jobs:
            return

        containers = self.docker_client.containers.list(
            filters={"status": "running"},
            all=True,
        )
        for container in containers:
            container_id = container.id
            try:
                inspect_data = self.docker_client.api.inspect_container(container_id)
            except docker.errors.APIError as err:
                # e.g. the container exited between list() and inspect.
                logging.error(
                    "Failed to inspect container %s: %s",
                    container_id,
                    str(err),
                )
                continue

            # NOTE(review): assumes action record ids are the only
            # 24-character arguments passed to the container - confirm.
            container_args = inspect_data.get("Args") or []
            action_record_id = next(
                (arg for arg in container_args if len(arg) == 24),
                None,
            )
            if action_record_id not in down_scaled_jobs:
                continue

            try:
                container.stop()
                logging.info(
                    "Container %s stopped.",
                    container_id,
                )
            except docker.errors.APIError as err:
                logging.error(
                    "Failed to stop container %s: %s",
                    container_id,
                    str(err),
                )
@@ -0,0 +1,270 @@
1
+ """Module providing instance_manager functionality."""
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import threading
7
+ import time
8
+ from matrice_compute.actions_manager import ActionsManager
9
+ from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
10
+ from matrice_compute.instance_utils import (
11
+ get_instance_info,
12
+ get_decrypted_access_key_pair,
13
+ )
14
+ from matrice_compute.resources_tracker import (
15
+ MachineResourcesTracker,
16
+ ActionsResourcesTracker,
17
+ )
18
+ from matrice_compute.scaling import Scaling
19
+ from matrice_compute.shutdown_manager import ShutdownManager
20
+ from matrice_common.session import Session
21
+ from matrice_common.utils import log_errors
22
+
23
+
24
class InstanceManager:
    """Class for managing compute instances and their associated actions.

    Wires together the scaling client, the actions manager, the scale-down
    manager, the shutdown manager and the resource trackers, and runs them
    from two threads (see ``start``).
    """

    def __init__(
        self,
        matrice_access_key_id: str = "",
        matrice_secret_access_key: str = "",
        encryption_key: str = "",
        instance_id: str = "",
        service_provider: str = "",
        env: str = "",
        gpus: str = "",
        workspace_dir: str = "matrice_workspace",
    ):
        """Initialize an instance manager.

        Args:
            matrice_access_key_id (str): Access key ID for Matrice authentication.
                Defaults to empty string.
            matrice_secret_access_key (str): Secret access key for Matrice
                authentication. Defaults to empty string.
            encryption_key (str): Key used for encrypting sensitive data.
                Defaults to empty string.
            instance_id (str): Unique identifier for this compute instance.
                Defaults to empty string.
            service_provider (str): Cloud service provider being used.
                Defaults to empty string.
            env (str): Environment name (e.g. dev, prod).
                Defaults to empty string.
            gpus (str): GPU configuration string (e.g. "0,1").
                Defaults to empty string.
            workspace_dir (str): Directory for workspace files.
                Defaults to "matrice_workspace".

        Raises:
            Exception: If the credentials or instance identity cannot be
                resolved (see ``_setup_env_credentials``).
        """
        # Declared before the credential setup, which assigns the real value;
        # resetting it afterwards would throw the resolved key away.
        self.encryption_key = None
        self.session = self._setup_env_credentials(
            env,
            service_provider,
            instance_id,
            encryption_key,
            matrice_access_key_id,
            matrice_secret_access_key,
        )
        os.environ["WORKSPACE_DIR"] = str(workspace_dir)
        # Stored JSON-encoded, so a plain string ends up wrapped in quotes.
        os.environ["GPUS"] = json.dumps(gpus)
        self.scaling = Scaling(
            self.session,
            os.environ.get("INSTANCE_ID"),
        )
        logging.info("InstanceManager initialized with scaling")
        jupyter_token = os.environ.get("JUPYTER_TOKEN")
        if jupyter_token:
            self.scaling.update_jupyter_token(jupyter_token)
            logging.info("InstanceManager updated Jupyter token")
        else:
            logging.warning("No Jupyter token found in environment variables")
        self.current_actions = {}
        self.actions_manager = ActionsManager(self.scaling)
        logging.info("InstanceManager initialized with actions manager")
        self.scale_down_manager = ActionsScaleDownManager(self.scaling)
        logging.info("InstanceManager initialized with scale down manager")
        self.shutdown_manager = ShutdownManager(self.scaling)
        logging.info("InstanceManager initialized with shutdown manager")
        self.machine_resources_tracker = MachineResourcesTracker(self.scaling)
        logging.info("InstanceManager initialized with machine resources tracker")
        self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
        logging.info("InstanceManager initialized with actions resources tracker")
        # Seconds slept between two iterations of start_instance_manager.
        self.poll_interval = 10
        logging.info("InstanceManager initialized.")

    @log_errors(default_return=None, raise_exception=True, log_error=True)
    def _setup_env_credentials(
        self,
        env: str,
        service_provider: str,
        instance_id: str,
        encryption_key: str,
        matrice_access_key_id: str,
        matrice_secret_access_key: str,
    ):
        """Set up environment credentials.

        Each setting is resolved in this order: explicit argument, existing
        environment variable, then (for service provider and instance id
        only) the value auto-detected by ``get_instance_info``. Resolved
        values are exported to ``os.environ``.

        Args:
            env (str): Environment name
            service_provider (str): Cloud service provider
            instance_id (str): Instance identifier
            encryption_key (str): Encryption key
            matrice_access_key_id (str): Matrice access key ID
            matrice_secret_access_key (str): Matrice secret access key

        Returns:
            Session: Initialized session object

        Raises:
            Exception: If SERVICE_PROVIDER / INSTANCE_ID cannot be resolved,
                or if no usable access key pair is available.
        """
        try:
            (
                auto_service_provider,
                auto_instance_id,
            ) = get_instance_info(service_provider, instance_id)
        except Exception as exc:
            # Auto-detection is best effort; fall back to args/environment.
            logging.error(
                "Error getting instance info: %s",
                str(exc),
            )
            auto_service_provider = ""
            auto_instance_id = ""

        manual_instance_info = {
            "ENV": env or os.environ.get("ENV"),
            "SERVICE_PROVIDER": service_provider
            or os.environ.get("SERVICE_PROVIDER")
            or auto_service_provider,
            "INSTANCE_ID": instance_id
            or os.environ.get("INSTANCE_ID")
            or auto_instance_id,
            "MATRICE_ENCRYPTION_KEY": encryption_key
            or os.environ.get("MATRICE_ENCRYPTION_KEY"),
            "MATRICE_ACCESS_KEY_ID": matrice_access_key_id
            or os.environ.get("MATRICE_ACCESS_KEY_ID"),
            "MATRICE_SECRET_ACCESS_KEY": matrice_secret_access_key
            or os.environ.get("MATRICE_SECRET_ACCESS_KEY"),
        }
        for key, value in manual_instance_info.items():
            # os.environ only accepts strings; assigning None raises a
            # TypeError that would mask the explicit checks below.
            if value is not None:
                os.environ[key] = value
        if not (os.environ.get("SERVICE_PROVIDER") and os.environ.get("INSTANCE_ID")):
            raise Exception(
                "SERVICE_PROVIDER and INSTANCE_ID must be set as environment variables or passed as arguments"
            )
        self.encryption_key = manual_instance_info["MATRICE_ENCRYPTION_KEY"]

        access_key = manual_instance_info["MATRICE_ACCESS_KEY_ID"]
        secret_key = manual_instance_info["MATRICE_SECRET_ACCESS_KEY"]

        # Decrypt only when an encryption key is available and neither key
        # is 21 characters long.
        # NOTE(review): presumably 21 is the length of a plaintext key, so
        # any other length means "still encrypted" - confirm.
        if (
            self.encryption_key
            and access_key
            and secret_key
            and len(access_key) != 21
            and len(secret_key) != 21
        ):
            access_key, secret_key = self._decrypt_access_key_pair(
                access_key,
                secret_key,
                self.encryption_key,
            )

        # None here means the key was never provided, or decryption failed
        # and returned its (None, None) default.
        if access_key is None or secret_key is None:
            raise Exception(
                "MATRICE_ACCESS_KEY_ID and MATRICE_SECRET_ACCESS_KEY must be set as environment variables or passed as arguments, and must be decryptable when encrypted"
            )
        os.environ["MATRICE_SECRET_ACCESS_KEY"] = secret_key
        os.environ["MATRICE_ACCESS_KEY_ID"] = access_key
        if self.encryption_key is not None:
            os.environ["MATRICE_ENCRYPTION_KEY"] = self.encryption_key
        return Session(
            account_number="",
            secret_key=secret_key,
            access_key=access_key,
        )

    @log_errors(default_return=(None, None), raise_exception=False)
    def _decrypt_access_key_pair(
        self,
        enc_access_key: str,
        enc_secret_key: str,
        encryption_key: str = "",
    ) -> tuple:
        """Decrypt the access key pair.

        Thin wrapper around ``get_decrypted_access_key_pair``; the
        ``log_errors`` decorator is configured with
        ``default_return=(None, None)`` and ``raise_exception=False``.

        Args:
            enc_access_key (str): Encrypted access key
            enc_secret_key (str): Encrypted secret key
            encryption_key (str): Key for decryption. Defaults to empty string.

        Returns:
            tuple: Decrypted (access_key, secret_key) pair
        """
        return get_decrypted_access_key_pair(
            enc_access_key,
            enc_secret_key,
            encryption_key,
        )

    @log_errors(raise_exception=True, log_error=True)
    def start_instance_manager(self) -> None:
        """Run the instance manager loop.

        Runs forever. Each iteration performs the shutdown check, the
        container scale-down, and both resource-tracker updates, then sleeps
        ``poll_interval`` seconds. Every step is isolated: a failure is
        logged and the remaining steps still run.
        """
        steps = (
            (
                "shutdown_manager handle_shutdown",
                lambda: self.shutdown_manager.handle_shutdown(
                    bool(self.actions_manager.get_current_actions())
                ),
            ),
            (
                "scale_down_manager auto_scaledown_actions",
                lambda: self.scale_down_manager.auto_scaledown_actions(),
            ),
            (
                "machine_resources_tracker update_available_resources",
                lambda: self.machine_resources_tracker.update_available_resources(),
            ),
            (
                "actions_resources_tracker update_actions_resources",
                lambda: self.actions_resources_tracker.update_actions_resources(),
            ),
        )
        while True:
            for step_name, run_step in steps:
                try:
                    run_step()
                except Exception as exc:
                    logging.error(
                        "Error in %s: %s",
                        step_name,
                        str(exc),
                    )

            time.sleep(self.poll_interval)

    @log_errors(default_return=(None, None), raise_exception=True)
    def start(self) -> tuple:
        """Start the instance manager threads.

        One thread runs ``start_instance_manager``; the other runs the
        actions manager's ``start_actions_manager`` loop.

        Returns:
            tuple: (instance_manager_thread, actions_manager_thread)
        """
        instance_manager_thread = threading.Thread(
            target=self.start_instance_manager,
            name="InstanceManager",
        )
        instance_manager_thread.start()

        actions_manager_thread = threading.Thread(
            target=self.actions_manager.start_actions_manager,
            name="ActionsManager",
        )
        actions_manager_thread.start()

        return (
            instance_manager_thread,
            actions_manager_thread,
        )