matrice_compute-0.1.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,538 @@
+ """Module providing prechecks functionality."""
+
+ import logging
+ import sys
+ import subprocess
+ from typing import Any, Optional
+ from matrice_compute.scaling import Scaling
+ from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
+ from matrice_compute.resources_tracker import (
+     ResourcesTracker,
+     MachineResourcesTracker,
+     ActionsResourcesTracker,
+ )
+ from matrice_compute.instance_utils import (
+     get_instance_info,
+     cleanup_docker_storage,
+     get_cpu_memory_usage,
+     get_gpu_memory_usage,
+     get_mem_usage,
+     get_gpu_with_sufficient_memory_for_action,
+     get_max_file_system,
+     has_gpu,
+ )
+ from matrice.docker_utils import check_docker
+
+
+ class Prechecks:
+     """Class for running pre-checks before compute operations."""
+
+     def __init__(
+         self,
+         session: Any,
+         instance_id: Optional[str] = None,
+     ) -> None:
+         """Initialize Prechecks.
+
+         Args:
+             session: Session object for RPC calls
+             instance_id: Optional instance ID
+         """
+         self.session = session
+         self.rpc = session.rpc
+         self.instance_id = instance_id
+         self.access_key = None
+         self.secret_key = None
+         self.docker_username = None
+         self.docker_password = None
+         self.shutdown_threshold = None
+         self.launch_duration = None
+         self.instance_source = None
+         self.scaling = Scaling(session, instance_id)
+         self.actions_scale_down_manager = ActionsScaleDownManager(self.scaling)
+         self.resources_tracker = ResourcesTracker()
+         self.machine_resources_tracker = MachineResourcesTracker(self.scaling)
+         self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
+
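+     # Construction sketch (illustrative; the only contract __init__ relies
+     # on is that `session` exposes an `rpc` attribute):
+     #
+     #     prechecks = Prechecks(session, instance_id="<instance-id>")
+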
+     def setup_docker(self) -> bool:
+         """
+         Set up Docker by logging in with registry credentials.
+
+         Returns:
+             bool: True if setup successful
+         """
+         response, error, message = self.scaling.get_docker_hub_credentials()
+         if error is None:
+             self.docker_username = response.get("username")
+             self.docker_password = response.get("password")
+         else:
+             logging.error(
+                 "Error getting docker credentials: %s",
+                 error,
+             )
+             return False
+         try:
+             # Pass the password on stdin instead of interpolating it into a
+             # shell string, so it does not leak into the process list.
+             subprocess.run(
+                 ["docker", "login", "-u", self.docker_username, "--password-stdin"],
+                 input=self.docker_password.encode(),
+                 check=True,
+             )
+             logging.info("Successfully logged into Docker")
+             return True
+         except subprocess.CalledProcessError as err:
+             logging.error(
+                 "Failed to login to Docker: %s",
+                 str(err),
+             )
+             return False
+
+     def create_docker_volume(self) -> bool:
+         """
+         Create docker volume.
+
+         Returns:
+             bool: True if volume created successfully
+         """
+         try:
+             subprocess.run(
+                 ["docker", "volume", "create", "workspace"],
+                 check=True,
+             )
+             return True
+         except subprocess.CalledProcessError as err:
+             logging.error(
+                 "Failed to create docker volume: %s",
+                 str(err),
+             )
+             return False
+
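+     # The subprocess call above is equivalent to the CLI invocation
+     # `docker volume create workspace`; creating a volume that already
+     # exists succeeds without error, so the step is safe to re-run.
+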
+     def get_available_resources(self) -> bool:
+         """Check that available system resources are within valid ranges.
+
+         Returns:
+             bool: True if resources are within valid ranges
+         """
+         (
+             available_memory,
+             available_cpu,
+             gpu_memory_free,
+             gpu_utilization,
+         ) = self.resources_tracker.get_available_resources()
+         if any(resource > 100 for resource in [available_memory, available_cpu]):
+             logging.error(
+                 "Resource usage exceeds 100%%: Memory %s%%, CPU %s%%",
+                 available_memory,
+                 available_cpu,
+             )
+             sys.exit(1)
+         if gpu_memory_free > 256:
+             logging.error(
+                 "GPU memory exceeds 256GB limit: %sGB",
+                 gpu_memory_free,
+             )
+             sys.exit(1)
+         if any(
+             resource < 0
+             for resource in [
+                 available_memory,
+                 available_cpu,
+                 gpu_memory_free,
+                 gpu_utilization,
+             ]
+         ):
+             logging.error(
+                 "Resource usage cannot be negative: Memory %s%%, CPU %s%%, GPU Memory %sGB",
+                 available_memory,
+                 available_cpu,
+                 gpu_memory_free,
+             )
+             sys.exit(1)
+         if gpu_utilization > 100:
+             logging.error(
+                 "GPU utilization exceeds 100%%: %s%%",
+                 gpu_utilization,
+             )
+             sys.exit(1)
+         logging.info("Resource availability check passed")
+         return True
+
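+     # Shape of the tuple returned by get_available_resources(), as inferred
+     # from the range checks above (values illustrative):
+     #
+     #     available_memory, available_cpu -> percentages in [0, 100]
+     #     gpu_memory_free                 -> gigabytes, e.g. 24.0
+     #     gpu_utilization                 -> percentage in [0, 100]
+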
+     def check_credentials(
+         self,
+         access_key: Optional[str] = None,
+         secret_key: Optional[str] = None,
+     ) -> bool:
+         """Check if access key and secret key are valid.
+
+         Args:
+             access_key: Optional access key to validate
+             secret_key: Optional secret key to validate
+
+         Returns:
+             bool: True if credentials are valid
+         """
+         if not access_key or not secret_key:
+             logging.error("Missing access key or secret key")
+             sys.exit(1)
+         logging.info("Credentials check passed")
+         return True
+
+     def check_instance_id(self, instance_id: Optional[str] = None) -> bool:
+         """Validate instance ID from args or env.
+
+         Args:
+             instance_id: Optional instance ID to validate
+
+         Returns:
+             bool: True if instance ID is valid
+         """
+         if not instance_id:
+             logging.error("Missing instance ID")
+             sys.exit(1)
+         if not isinstance(instance_id, str) or len(instance_id) < 8:
+             logging.error("Invalid instance ID format")
+             sys.exit(1)
+         self.instance_id = instance_id
+         instance_info = get_instance_info(instance_id)
+         if not instance_info:
+             logging.error(
+                 "Invalid instance ID %s",
+                 self.instance_id,
+             )
+             sys.exit(1)
+         logging.info(
+             "Instance ID %s validated",
+             self.instance_id,
+         )
+         return True
+
+     def check_docker(self) -> bool:
+         """Check if docker is installed and working.
+
+         Returns:
+             bool: True if docker is working
+         """
+         # The bare call resolves to the module-level check_docker imported
+         # from matrice.docker_utils, not to this method.
+         if not check_docker():
+             logging.error("Docker not installed or not running")
+             sys.exit(1)
+         try:
+             import docker
+
+             client = docker.from_env()
+             client.ping()
+         except Exception as err:
+             logging.error(
+                 "Docker API check failed: %s",
+                 str(err),
+             )
+             sys.exit(1)
+         logging.info("Docker check passed")
+         return True
+
+     def check_gpu(self) -> bool:
+         """Check if machine has GPU and it's functioning.
+
+         Returns:
+             bool: True if GPU check passes
+         """
+         gpu_mem = get_gpu_memory_usage()
+         if not gpu_mem:
+             logging.error("No GPU detected on this machine")
+             sys.exit(1)
+         if any(mem < 4 for mem in gpu_mem.values()):
+             logging.error("GPU has insufficient memory (min 4GB required)")
+             sys.exit(1)
+         try:
+             import torch
+
+             if not torch.cuda.is_available():
+                 logging.error("CUDA not available")
+                 sys.exit(1)
+         except ImportError:
+             logging.warning("PyTorch not installed - skipping CUDA check")
+         logging.info("GPU check passed")
+         return True
+
+     def check_resources(self) -> bool:
+         """Validate system resource limits and availability.
+
+         Returns:
+             bool: True if resource checks pass
+         """
+         cpu_usage = get_cpu_memory_usage()
+         if cpu_usage > 100:
+             logging.error(
+                 "CPU usage exceeds 100%%: %s%%",
+                 cpu_usage,
+             )
+             sys.exit(1)
+         elif cpu_usage > 90:
+             logging.warning("High CPU usage: %s%%", cpu_usage)
+         mem_usage = get_mem_usage()
+         if mem_usage > 100:
+             logging.error(
+                 "Memory usage exceeds 100%%: %s%%",
+                 mem_usage,
+             )
+             sys.exit(1)
+         elif mem_usage > 90:
+             logging.warning(
+                 "High memory usage: %s%%",
+                 mem_usage,
+             )
+         gpu_mem = get_gpu_memory_usage()
+         if any(mem > 256 for mem in gpu_mem.values()):
+             logging.error("GPU memory exceeds 256GB limit")
+             sys.exit(1)
+         if cpu_usage > 95 or mem_usage > 95:
+             logging.error("Insufficient available resources")
+             sys.exit(1)
+         logging.info("Resource limits check passed")
+         return True
+
+     def cleanup_docker_storage(self) -> bool:
+         """Clean up docker storage and verify space freed.
+
+         Returns:
+             bool: True if cleanup successful
+         """
+         try:
+             initial_space = get_max_file_system()
+             # The bare call resolves to the module-level helper imported
+             # from instance_utils, not to this method.
+             cleanup_docker_storage()
+             final_space = get_max_file_system()
+             if final_space <= initial_space:
+                 logging.warning("Docker cleanup did not free any space")
+             return True
+         except Exception as err:
+             logging.error(
+                 "Docker storage cleanup failed: %s",
+                 str(err),
+             )
+             return False
+
+     def get_shutdown_details(self) -> bool:
+         """Get and validate shutdown details from response.
+
+         Returns:
+             bool: True if shutdown details are valid
+         """
+         try:
+             response = self.scaling.get_shutdown_details()
+             if not response:
+                 logging.error("Empty response from get_shutdown_details")
+                 return False
+             required_fields = [
+                 "shutdownThreshold",
+                 "launchDuration",
+                 "instanceSource",
+             ]
+             if not all(field in response for field in required_fields):
+                 logging.error("Invalid shutdown details response")
+                 return False
+             self.shutdown_threshold = response.get("shutdownThreshold")
+             self.launch_duration = response.get("launchDuration")
+             self.instance_source = response.get("instanceSource")
+             if (
+                 not isinstance(self.shutdown_threshold, (int, float))
+                 or self.shutdown_threshold <= 0
+             ):
+                 logging.error("Invalid shutdown threshold")
+                 return False
+             if (
+                 not isinstance(self.launch_duration, (int, float))
+                 or self.launch_duration <= 0
+             ):
+                 logging.error("Invalid launch duration")
+                 return False
+             return True
+         except Exception as err:
+             logging.error(
+                 "Failed to get shutdown details: %s",
+                 str(err),
+             )
+             return False
+
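+     # Response shape expected by the validation above (values illustrative):
+     #
+     #     {
+     #         "shutdownThreshold": 15,
+     #         "launchDuration": 60,
+     #         "instanceSource": "<source>",
+     #     }
+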
+     def test_gpu(self) -> bool:
+         """Test if GPU is working and has sufficient memory.
+
+         Returns:
+             bool: True if GPU test passes
+         """
+         if has_gpu():
+             action_details = {"memory_required": 4}
+             gpu_indices = get_gpu_with_sufficient_memory_for_action(action_details)
+             if not gpu_indices:
+                 logging.error("No GPU with sufficient memory")
+                 sys.exit(1)
+             try:
+                 import torch
+
+                 # A 2x2 all-ones matrix multiplied by itself yields 2.0 in
+                 # every cell, which verifies basic CUDA computation.
+                 test_tensor = torch.cuda.FloatTensor(2, 2).fill_(1.0)
+                 result = torch.matmul(test_tensor, test_tensor)
+                 if not torch.all(result == 2.0):
+                     logging.error("GPU computation test failed")
+                     sys.exit(1)
+             except Exception as err:
+                 logging.error(
+                     "GPU computation test failed: %s",
+                     str(err),
+                 )
+                 sys.exit(1)
+         return True
+
+     def check_get_gpu_indices(self) -> bool:
+         """Check if get_gpu_indices returns valid indices.
+
+         Returns:
+             bool: True if GPU indices are valid
+         """
+         action_details = {"memory_required": 4}
+         gpu_indices = get_gpu_with_sufficient_memory_for_action(action_details)
+         if not gpu_indices:
+             logging.error("Failed to get GPU indices")
+             sys.exit(1)
+         if not all(isinstance(idx, int) and idx >= 0 for idx in gpu_indices):
+             logging.error("Invalid GPU indices returned")
+             sys.exit(1)
+         if len(gpu_indices) != len(set(gpu_indices)):
+             logging.error("Duplicate GPU indices returned")
+             sys.exit(1)
+         return True
+
+     def check_resources_tracking(self) -> bool:
+         """Test resource tracking updates and monitoring.
+
+         Returns:
+             bool: True if resource tracking is working
+         """
+         try:
+             self.machine_resources_tracker.update_available_resources()
+             self.actions_resources_tracker.update_actions_resources()
+             return True
+         except Exception as err:
+             logging.error(
+                 "Failed to update resource tracking: %s",
+                 str(err),
+             )
+             sys.exit(1)
+
+     def check_scaling_status(self) -> bool:
+         """Test scaling service status.
+
+         Returns:
+             bool: True if scaling status is ok
+         """
+         try:
+             downscaled_ids = self.scaling.get_downscaled_ids()
+             if self.instance_id in downscaled_ids:
+                 logging.error("Instance is marked for downscaling")
+                 sys.exit(1)
+             return True
+         except Exception as err:
+             logging.error(
+                 "Failed to check scaling status: %s",
+                 str(err),
+             )
+             sys.exit(1)
+
+     def check_filesystem_space(self) -> bool:
+         """Check available filesystem space and usage.
+
+         Returns:
+             bool: True if filesystem space is sufficient
+         """
+         max_fs = get_max_file_system()
+         if not max_fs:
+             logging.error("Failed to get filesystem information")
+             sys.exit(1)
+         return True
+
+     def test_actions_scale_down(self) -> bool:
+         """Test actions scale down.
+
+         Returns:
+             bool: True if scale down test passes
+         """
+         self.actions_scale_down_manager.auto_scaledown_actions()
+         return True
+
+     def check_fetch_actions(self) -> bool:
+         """Test action fetching and validation.
+
+         Returns:
+             bool: True if action fetching works
+         """
+         fetched_actions, error, message = self.scaling.assign_jobs(has_gpu())
+         if error:
+             logging.error("Error assigning jobs: %s", error)
+             return False
+         if fetched_actions:
+             if not isinstance(fetched_actions, list):
+                 logging.error("Invalid actions format")
+                 return False
+             for action in fetched_actions:
+                 if not isinstance(action, dict) or "_id" not in action:
+                     logging.error("Invalid action format")
+                     return False
+         return True
+
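+     # When actions are fetched, each entry is expected to be a dict carrying
+     # at least an "_id" key, e.g. (illustrative):
+     #
+     #     [{"_id": "<action-id>", ...}, ...]
+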
+     def run_all_checks(
+         self,
+         instance_id: Optional[str] = None,
+         access_key: Optional[str] = None,
+         secret_key: Optional[str] = None,
+     ) -> bool:
+         """Run all prechecks in sequence.
+
+         Args:
+             instance_id: Optional instance ID to validate
+             access_key: Optional access key to validate
+             secret_key: Optional secret key to validate
+
+         Returns:
+             bool: True if all checks pass
+         """
+         # Pair each check with an explicit name so failures are logged
+         # usefully (lambdas would otherwise all report "<lambda>").
+         checks = [
+             ("check_credentials", lambda: self.check_credentials(access_key, secret_key)),
+             ("check_instance_id", lambda: self.check_instance_id(instance_id)),
+             ("check_docker", self.check_docker),
+             ("setup_docker", self.setup_docker),
+             ("create_docker_volume", self.create_docker_volume),
+             ("check_gpu", self.check_gpu),
+             ("check_resources", self.check_resources),
+             ("cleanup_docker_storage", self.cleanup_docker_storage),
+             ("check_filesystem_space", self.check_filesystem_space),
+             ("check_resources_tracking", self.check_resources_tracking),
+             ("check_scaling_status", self.check_scaling_status),
+             ("get_shutdown_details", self.get_shutdown_details),
+             ("test_gpu", self.test_gpu),
+             ("check_get_gpu_indices", self.check_get_gpu_indices),
+             ("test_actions_scale_down", self.test_actions_scale_down),
+             ("check_fetch_actions", self.check_fetch_actions),
+         ]
+         for name, check in checks:
+             if not check():
+                 logging.error("Check failed: %s", name)
+                 return False
+         logging.info("All prechecks passed successfully")
+         return True
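+
+
+ # Typical entry point, sketched as comments (identifiers and credentials are
+ # placeholders supplied by the deployment environment):
+ #
+ #     prechecks = Prechecks(session, instance_id)
+ #     if prechecks.run_all_checks(
+ #         instance_id=instance_id,
+ #         access_key=access_key,
+ #         secret_key=secret_key,
+ #     ):
+ #         logging.info("Machine is ready to pick up compute actions")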