matrice-compute 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
+ """Idle-time tracking and OS-level shutdown handling for a compute instance."""
2
+
3
+ import logging
4
+ import time
5
+ import os
6
+ import sys
7
+ import platform
8
+ import subprocess
9
+ import signal
10
+ from typing import Optional, Tuple
11
+ from matrice_common.utils import log_errors
12
+ from matrice_compute.scaling import Scaling
13
+
14
+
15
class ShutdownManager:
    """Track how long a compute instance has been idle and power it off.

    The manager keeps an idle timer that is driven by `handle_shutdown`.
    When the timer exceeds `shutdown_threshold` seconds and the instance has
    been up for longer than `launch_duration_seconds`, the scaling service is
    notified and the operating system is asked to shut down.
    """

    def __init__(self, scaling: Scaling) -> None:
        """
        Initialize ShutdownManager.

        Defaults are assigned first and then overridden by whatever the
        scaling service returns from ``get_shutdown_details``.

        Args:
            scaling (Scaling): Scaling instance used to fetch the shutdown
                configuration and to report the shutdown.
        """
        self.scaling = scaling
        self.launch_time: float = time.time()
        # Timestamp of the first poll that saw no running tasks; None while busy.
        self.last_no_queued_time: Optional[float] = None
        self.shutdown_threshold: int = 500  # Idle time threshold in seconds
        self.launch_duration: int = 1  # Launch duration in minutes
        self.instance_source: str = "auto"
        self.encryption_key: Optional[str] = None
        self.reserved_instance: Optional[bool] = None
        # NOTE(review): the four counters/limits below are only read through
        # max_shutdown_attempts inside this class; the rest may be used by callers.
        self.shutdown_attempts: int = 0
        self.max_shutdown_attempts: int = 3
        self.force_shutdown_attempts: int = 0
        self.max_force_shutdown_attempts: int = 2
        self.launch_duration_seconds: int = self.launch_duration * 60  # Convert minutes to seconds
        self._load_shutdown_configuration()

    @log_errors(raise_exception=False, log_error=True)
    def _load_shutdown_configuration(self) -> None:
        """
        Load the shutdown configuration from the scaling service.

        On success the threshold, launch duration, instance source and
        encryption key are replaced by the returned values (falling back to
        the built-in defaults for missing keys). On failure the defaults set
        in ``__init__`` are kept and a warning is logged.
        """
        response, error, _message = self.scaling.get_shutdown_details()
        if error is not None:
            logging.warning("Could not load shutdown configuration, keeping defaults: %s", error)
            return

        self.shutdown_threshold = response.get("shutdownThreshold", 500)
        self.launch_duration = response.get("launchDuration", 1)
        self.instance_source = response.get("instanceSource", "auto")
        self.encryption_key = response.get("encryptionKey")
        self.launch_duration_seconds = self.launch_duration * 60  # Convert minutes to seconds
        self.reserved_instance = self.instance_source == "reserved"
        logging.info(
            "Loaded shutdown configuration: threshold=%s, duration=%s, source=%s, reserved=%s",
            self.shutdown_threshold,
            self.launch_duration,
            self.instance_source,
            self.reserved_instance,
        )

    def _check_root(self) -> bool:
        """
        Check if the current process has root privileges.

        Returns:
            bool: False when the platform exposes ``os.geteuid`` and the
            effective uid is not 0; True otherwise (including platforms
            without ``os.geteuid``).
        """
        if hasattr(os, "geteuid") and os.geteuid() != 0:
            logging.error("Shutdown requires root privileges.")
            return False
        return True

    def _execute_shutdown_command(self) -> bool:
        """
        Execute system shutdown command with multiple fallbacks.

        The commands for the detected operating system are tried in order,
        most graceful first. If none of them exits with status 0 the
        aggressive fallback is attempted.

        Returns:
            bool: True if any shutdown command succeeded, False otherwise.
        """
        # Advisory only: the result is not acted on, the call just logs an
        # error when the process is not root so later failures are explainable.
        self._check_root()
        system = platform.system().lower()

        # Commands per platform, in order of preference (most graceful first).
        commands_by_system = {
            "linux": [
                ["shutdown", "now"],  # Standard Linux shutdown
                ["systemctl", "poweroff"],  # Systemd poweroff
                ["systemctl", "poweroff", "--force"],  # Force systemd poweroff
                ["halt", "-f"],  # Force halt
                ["poweroff", "-f"],  # Force poweroff
                ["init", "0"],  # Init level 0 (shutdown)
                ["telinit", "0"],  # Alternative init command
            ],
            "windows": [
                ["shutdown", "/s", "/t", "0"],  # Windows shutdown
                ["shutdown", "/s", "/f", "/t", "0"],  # Windows force shutdown
                ["shutdown", "/p"],  # Windows immediate poweroff
            ],
            "darwin": [  # macOS
                ["shutdown", "-h", "now"],  # macOS shutdown
                ["halt"],  # macOS halt
                ["sudo", "shutdown", "-h", "now"],  # Sudo shutdown
            ],
        }
        # Generic Unix-like fallbacks for any other platform.
        generic_commands = [
            ["shutdown", "-h", "now"],
            ["halt"],
            ["poweroff"],
            ["init", "0"],
        ]
        shutdown_commands = commands_by_system.get(system, generic_commands)

        # Try each command in sequence
        for cmd in shutdown_commands:
            printable = " ".join(cmd)
            try:
                logging.info("Attempting shutdown with command: %s", printable)
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=30,
                    check=False,
                )
            except subprocess.TimeoutExpired:
                logging.warning("Shutdown command timed out: %s", printable)
                continue
            except FileNotFoundError:
                logging.warning("Shutdown command not found: %s", printable)
                continue
            except Exception as e:
                logging.warning("Shutdown command failed: %s. Error: %s", printable, str(e))
                continue

            if result.returncode == 0:
                logging.info("Shutdown command succeeded: %s", printable)
                return True
            logging.warning(
                "Shutdown command failed with return code %d: %s. STDERR: %s",
                result.returncode,
                printable,
                result.stderr,
            )

        # If all standard commands failed, try more aggressive methods
        return self._try_aggressive_shutdown()

    def _try_aggressive_shutdown(self) -> bool:
        """
        Try more aggressive shutdown methods when standard commands fail.

        Only Linux is handled. Preparatory steps (flushing and remounting
        filesystems) are executed best-effort and never count as success;
        only a step that actually stops the machine does. Previously the
        leading ``sync`` exited 0 and made this method return True before any
        power-off trigger had been written.

        Returns:
            bool: True if a command that stops the machine exited with
            status 0, False otherwise (always False on non-Linux systems).
        """
        logging.warning("Standard shutdown commands failed, trying aggressive methods")
        try:
            if platform.system().lower() != "linux":
                return False

            # Enable the SysRq interface so the triggers below are honoured.
            try:
                subprocess.run("echo 1 > /proc/sys/kernel/sysrq", shell=True, check=False, timeout=5)
            except Exception as e:
                logging.debug("Could not enable SysRq: %s", str(e))

            # (command, stops_machine). The shell is required for the
            # redirections; every command string is a fixed constant.
            steps = [
                ("sync", False),  # flush filesystem buffers
                # Legacy ACPI interface; presumably absent on modern kernels - TODO confirm
                ("echo 4 > /proc/acpi/sleep", True),
                ("echo s > /proc/sysrq-trigger", False),  # SysRq: sync
                ("echo u > /proc/sysrq-trigger", False),  # SysRq: remount read-only
                ("echo o > /proc/sysrq-trigger", True),  # SysRq: power off
                ("echo b > /proc/sysrq-trigger", True),  # SysRq: reboot (last resort)
            ]
            for cmd, stops_machine in steps:
                try:
                    logging.info("Trying aggressive shutdown: %s", cmd)
                    result = subprocess.run(cmd, shell=True, check=False, timeout=10)
                except Exception as e:
                    logging.debug("Aggressive command failed: %s", str(e))
                    continue

                if not stops_machine or result.returncode != 0:
                    continue

                logging.info("Aggressive shutdown command succeeded")
                # Give the kernel a moment to act before reporting back.
                time.sleep(2)
                return True
        except Exception as e:
            logging.error("Error in aggressive shutdown methods: %s", str(e))
        return False

    @log_errors(raise_exception=True, log_error=True)
    def do_cleanup_and_shutdown(self) -> bool:
        """Clean up resources and shut down the instance.

        This method attempts a coordinated shutdown with fallback strategies:
        1. API call to notify the scaling service (retried up to
           ``max_shutdown_attempts`` times, 5 seconds apart; after the last
           failed attempt the shutdown proceeds anyway)
        2. Graceful OS shutdown command
        3. Aggressive shutdown methods

        Returns:
            bool: True once the shutdown sequence has been run, False if
            every attempt ended in an unexpected exception.

            NOTE(review): True is returned even when neither the graceful nor
            the aggressive commands reported success; callers cannot use the
            return value to tell whether a command actually worked.
        """
        max_retries = self.max_shutdown_attempts

        for attempt in range(1, max_retries + 1):
            try:
                logging.info("Shutdown attempt %d of %d", attempt, max_retries)

                # Step 1: Notify scaling service of shutdown
                logging.info("Notifying scaling service of instance shutdown")
                try:
                    response = self.scaling.stop_instance()

                    # Handle case where stop_instance returns None or unexpected format
                    if response is None:
                        error, message = "API returned None", "No response from stop_instance API"
                    elif isinstance(response, tuple) and len(response) == 3:
                        _result, error, message = response
                    else:
                        error, message = "Invalid response format", f"Unexpected response format: {response}"
                except Exception as api_error:
                    error, message = str(api_error), "Exception during API call"

                if error:
                    logging.error("Failed to notify scaling service (attempt %d): %s", attempt, error)
                    if attempt < max_retries:
                        logging.info("Retrying in 5 seconds...")
                        time.sleep(5)
                        continue
                    logging.warning("Proceeding with shutdown despite API notification failure")
                else:
                    logging.info("Scaling service notified successfully: %s", message)

                # Step 2: Attempt graceful system shutdown
                logging.info("Initiating graceful system shutdown")
                if self._execute_shutdown_command():
                    logging.info("Graceful shutdown command executed successfully")
                    # Give the system time to process the shutdown
                    time.sleep(10)
                    # If we reach here, graceful shutdown may have failed
                    logging.warning("System did not shut down gracefully, trying aggressive methods")

                # Step 3: Try aggressive shutdown methods. Reached both when
                # the graceful command failed and when it succeeded but the
                # process is still alive after the wait above.
                logging.warning("Attempting aggressive shutdown methods")
                if self._try_aggressive_shutdown():
                    logging.info("Aggressive shutdown initiated")
                    time.sleep(5)

                return True

            except Exception as e:
                logging.error("Critical error during shutdown attempt %d: %s", attempt, str(e))
        return False

    @log_errors(raise_exception=False, log_error=True)
    def handle_shutdown(self, tasks_running: bool) -> None:
        """Check idle time and trigger shutdown if threshold is exceeded.

        The idle timer is reset whenever tasks are running and started on the
        first call that sees none. Shutdown is triggered only when the idle
        time is greater than ``shutdown_threshold`` and the time since launch
        is greater than ``launch_duration_seconds``.

        Args:
            tasks_running: Boolean indicating if there are running tasks
        """
        # CRITICAL: Check if this is a reserved instance that should not be shut down
        # NOTE(review): this guard is currently disabled, so reserved instances
        # are subject to the same idle shutdown as any other instance.
        # if self.reserved_instance:
        #     logging.debug("Reserved instance detected, skipping shutdown check")
        #     return

        # Update idle time tracking
        if tasks_running:
            self.last_no_queued_time = None
            logging.info("Tasks are running, resetting idle timer")
        elif self.last_no_queued_time is None:
            self.last_no_queued_time = time.time()
            logging.info("No tasks running, starting idle timer")

        if self.last_no_queued_time is None:
            return

        idle_time = time.time() - self.last_no_queued_time
        launch_time_passed = (time.time() - self.launch_time) > self.launch_duration_seconds

        # Log current status
        logging.info(
            "Time since last action: %s seconds. Time left to shutdown: %s seconds.",
            idle_time,
            max(0, self.shutdown_threshold - idle_time),
        )

        # Check if we should shut down
        if idle_time <= self.shutdown_threshold:
            return

        if not launch_time_passed:
            logging.info(
                "Instance not shutting down yet. Launch duration: %s seconds, elapsed: %s seconds",
                self.launch_duration_seconds,
                time.time() - self.launch_time,
            )
            return

        logging.info(
            "Idle time %s seconds exceeded threshold %s seconds. Shutting down.",
            idle_time,
            self.shutdown_threshold,
        )

        self.do_cleanup_and_shutdown()
@@ -0,0 +1,77 @@
1
+ """Helpers for preparing an action workspace from a downloaded model codebase."""
2
+
3
+ import os
4
+ import shutil
5
+ import urllib.request
6
+ import zipfile
7
+ from typing import Optional
8
+ from matrice_common.utils import log_errors
9
+
10
+
11
+ @log_errors(raise_exception=True, log_error=True)
12
def setup_workspace_and_run_task(
    work_fs: str,
    action_id: str,
    model_codebase_url: str,
    model_codebase_requirements_url: Optional[str] = None,
) -> None:
    """Create the workspace for an action and fill it with the model codebase.

    The workspace is ``{work_fs}/{action_id}``. If it already exists the
    function returns immediately and downloads nothing. Otherwise the
    codebase archive is downloaded to ``file.zip`` inside the workspace,
    extracted, and every file found in any subdirectory is moved to the
    workspace root (a file with the same name already at the root is
    replaced). All subdirectories left empty are then removed and, if a
    requirements URL is given, it is downloaded to ``requirements.txt``.

    If any step fails, the partially created workspace is deleted before the
    exception propagates, so that a later retry is not skipped by the
    "already exists" check.

    Args:
        work_fs (str): Working filesystem path.
        action_id (str): Unique identifier for the action.
        model_codebase_url (str): URL to download model codebase from.
        model_codebase_requirements_url (Optional[str]): URL to download requirements from. Defaults to None.

    Returns:
        None
    """
    workspace_dir = f"{work_fs}/{action_id}"
    codebase_zip_path = f"{workspace_dir}/file.zip"
    requirements_txt_path = f"{workspace_dir}/requirements.txt"

    if os.path.exists(workspace_dir):
        return
    os.makedirs(workspace_dir, exist_ok=True)

    try:
        # Download codebase ZIP file
        urllib.request.urlretrieve(model_codebase_url, codebase_zip_path)

        # Extract ZIP file with overwrite
        with zipfile.ZipFile(codebase_zip_path, "r") as zip_ref:
            zip_ref.extractall(workspace_dir)

        # Move files from subdirectories to workspace root
        for root, _dirs, files in os.walk(workspace_dir):
            # Skip the workspace_dir itself to avoid moving files to themselves
            if root == workspace_dir:
                continue

            for file_name in files:
                src_file = os.path.join(root, file_name)
                dst_file = os.path.join(workspace_dir, file_name)

                # If destination file exists, overwrite it
                if os.path.exists(dst_file):
                    os.remove(dst_file)

                shutil.move(src_file, dst_file)

        # Remove the now-empty subdirectories, deepest first. The directory
        # listing yielded by a bottom-up walk is taken before the children
        # are visited, so it cannot be used to decide emptiness; rmdir itself
        # refuses to delete a non-empty directory.
        for root, _dirs, _files in os.walk(workspace_dir, topdown=False):
            if root == workspace_dir:
                continue
            try:
                os.rmdir(root)
            except OSError:
                pass  # Directory is not empty or no longer exists

        # Download requirements.txt if URL is provided
        if model_codebase_requirements_url:
            urllib.request.urlretrieve(model_codebase_requirements_url, requirements_txt_path)
    except BaseException:
        # Do not leave a half-built workspace behind: its mere existence
        # would turn every later call for this action into a no-op.
        shutil.rmtree(workspace_dir, ignore_errors=True)
        raise
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: matrice_compute
3
+ Version: 0.1.1
4
+ Summary: Common server utilities for Matrice.ai services
5
+ Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
+ License-Expression: MIT
7
+ Keywords: matrice,common,utilities,pyarmor,obfuscated
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Operating System :: POSIX :: Linux
12
+ Classifier: Operating System :: Microsoft :: Windows
13
+ Classifier: Operating System :: MacOS
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE.txt
24
+ Dynamic: license-file
25
+ Dynamic: requires-python
26
+
27
+ ## matrice\_instance\_manager
28
+
@@ -0,0 +1,17 @@
1
+ matrice_compute/__init__.py,sha256=bV-Vtb0-Sa3K1uXaAc8qP7NK6pf354OU1K19aQSt3VI,288
2
+ matrice_compute/action_instance.py,sha256=CGRvWzkYeMHP-_GGqrWAgQ_4suqrFovlWH-NkY30p9g,57600
3
+ matrice_compute/actions_manager.py,sha256=Nbosuf6kFisl2vbsgd5rZiYjdX4za8X588aut0OoKpI,7713
4
+ matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
+ matrice_compute/instance_manager.py,sha256=mGA2JsvRPWGQESKcZoc0k_xCB40fmsQDv2ojCT3kf1g,10060
6
+ matrice_compute/instance_utils.py,sha256=tIFVUi8HJPy4GY-jtfVx2zIgmXNta7s3jCIRzBga1hI,21977
7
+ matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
8
+ matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ matrice_compute/resources_tracker.py,sha256=My26LPglDHcQcTkxxiXwpfdqkpEAt3clrqJ-k1fAl1M,17878
10
+ matrice_compute/scaling.py,sha256=UqxvSZtypELWmD4EBKS1X-vX3tJnSpgOO6_b4XOe2Uk,31175
11
+ matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
12
+ matrice_compute/task_utils.py,sha256=ML9uTrYQiWgEMJitYxoGlVOa9KUXNKV_WqnousOTK6k,2762
13
+ matrice_compute-0.1.1.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
14
+ matrice_compute-0.1.1.dist-info/METADATA,sha256=zdb-JyIdO6bT2yko4QB9LgECmENwsu4TEkHlXpowSJI,1037
15
+ matrice_compute-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
+ matrice_compute-0.1.1.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
17
+ matrice_compute-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Matrice.ai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ matrice_compute