matrice-compute 0.1.19__tar.gz → 0.1.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/PKG-INFO +1 -1
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/action_instance.py +104 -19
- matrice_compute-0.1.21/src/matrice_compute/instance_utils.py +1147 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/resources_tracker.py +125 -53
- matrice_compute-0.1.21/src/matrice_compute/scaling.py +1224 -0
- matrice_compute-0.1.19/src/matrice_compute/instance_utils.py +0 -738
- matrice_compute-0.1.19/src/matrice_compute/scaling.py +0 -972
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/LICENSE.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/README.md +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/pyproject.toml +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/setup.cfg +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/setup.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.19 → matrice_compute-0.1.21}/src/matrice_compute/task_utils.py +0 -0
|
@@ -12,6 +12,7 @@ from matrice_compute.instance_utils import (
|
|
|
12
12
|
get_gpu_with_sufficient_memory_for_action,
|
|
13
13
|
get_decrypted_access_key_pair,
|
|
14
14
|
get_max_file_system,
|
|
15
|
+
get_best_service_ip_and_network,
|
|
15
16
|
)
|
|
16
17
|
from matrice_compute.task_utils import (
|
|
17
18
|
setup_workspace_and_run_task,
|
|
@@ -267,17 +268,68 @@ class ActionInstance:
|
|
|
267
268
|
Returns:
|
|
268
269
|
str: GPU configuration string
|
|
269
270
|
"""
|
|
270
|
-
|
|
271
|
+
action_id = action_details.get("_id", "unknown")
|
|
272
|
+
|
|
273
|
+
# Check if GPU is required
|
|
274
|
+
gpu_required = action_details["actionDetails"].get("gpuRequired", False)
|
|
275
|
+
if not gpu_required:
|
|
276
|
+
logging.info(
|
|
277
|
+
"Action %s does not require GPU - will run on CPU",
|
|
278
|
+
action_id
|
|
279
|
+
)
|
|
271
280
|
return ""
|
|
272
|
-
|
|
273
|
-
|
|
281
|
+
|
|
282
|
+
# Get required GPU memory for logging
|
|
283
|
+
required_memory = action_details.get("actionDetails", {}).get(
|
|
284
|
+
"expectedResources", {}
|
|
285
|
+
).get("gpuMemory", 0)
|
|
286
|
+
|
|
287
|
+
logging.info(
|
|
288
|
+
"Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
|
|
289
|
+
action_id,
|
|
290
|
+
required_memory
|
|
274
291
|
)
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
292
|
+
|
|
293
|
+
try:
|
|
294
|
+
# Get the best-fit GPU(s) with sufficient memory
|
|
295
|
+
gpu_indices = get_gpu_with_sufficient_memory_for_action(
|
|
296
|
+
action_details=action_details
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
if gpu_indices:
|
|
300
|
+
gpu_str = ",".join(map(str, gpu_indices))
|
|
301
|
+
logging.info(
|
|
302
|
+
"Action %s: Selected GPU device(s): %s (required memory: %d MB)",
|
|
303
|
+
action_id,
|
|
304
|
+
gpu_str,
|
|
305
|
+
required_memory
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
# Return Docker GPU configuration
|
|
309
|
+
# Format: --gpus "device=0" or --gpus "device=0,1,2"
|
|
310
|
+
return f'--gpus "device={gpu_str}"'
|
|
311
|
+
else:
|
|
312
|
+
logging.warning(
|
|
313
|
+
"Action %s: No GPUs with sufficient memory found (required: %d MB)",
|
|
314
|
+
action_id,
|
|
315
|
+
required_memory
|
|
316
|
+
)
|
|
317
|
+
return ""
|
|
318
|
+
|
|
319
|
+
except ValueError as e:
|
|
320
|
+
logging.error(
|
|
321
|
+
"Action %s: Error selecting GPU - %s",
|
|
322
|
+
action_id,
|
|
323
|
+
str(e)
|
|
324
|
+
)
|
|
325
|
+
return ""
|
|
326
|
+
except Exception as e:
|
|
327
|
+
logging.error(
|
|
328
|
+
"Action %s: Unexpected error in GPU selection - %s",
|
|
329
|
+
action_id,
|
|
330
|
+
str(e)
|
|
331
|
+
)
|
|
332
|
+
return ""
|
|
281
333
|
|
|
282
334
|
@log_errors(default_return="", raise_exception=False)
|
|
283
335
|
def get_base_docker_cmd(
|
|
@@ -526,13 +578,18 @@ class ActionInstance:
|
|
|
526
578
|
|
|
527
579
|
if username and password:
|
|
528
580
|
login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
|
|
529
|
-
subprocess.run(login_cmd, shell=True, check=True)
|
|
581
|
+
result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
|
|
582
|
+
if result.returncode != 0:
|
|
583
|
+
raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
|
|
530
584
|
logging.info("Docker login successful")
|
|
531
585
|
else:
|
|
532
586
|
logging.warning(
|
|
533
587
|
"Docker credentials not available, skipping Docker login"
|
|
534
588
|
)
|
|
535
589
|
|
|
590
|
+
except subprocess.TimeoutExpired:
|
|
591
|
+
logging.error("Docker login timed out after 30 seconds")
|
|
592
|
+
raise Exception("Docker login timed out")
|
|
536
593
|
except Exception as err:
|
|
537
594
|
logging.error(
|
|
538
595
|
"Docker login failed: %s",
|
|
@@ -1151,9 +1208,17 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1151
1208
|
return
|
|
1152
1209
|
image = action_details["actionDetails"].get("docker")
|
|
1153
1210
|
|
|
1154
|
-
|
|
1155
1211
|
self.setup_action_requirements(action_details)
|
|
1156
1212
|
|
|
1213
|
+
# Get the best IP and network configuration for port 8102
|
|
1214
|
+
ws_host, use_host_network = get_best_service_ip_and_network(8102)
|
|
1215
|
+
|
|
1216
|
+
# Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
|
|
1217
|
+
if not os.environ.get("INFERENCE_WS_HOST"):
|
|
1218
|
+
os.environ["INFERENCE_WS_HOST"] = ws_host
|
|
1219
|
+
|
|
1220
|
+
logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
|
|
1221
|
+
|
|
1157
1222
|
# Inference WebSocket server with --net=host (Port: 8102)
|
|
1158
1223
|
worker_cmd = (
|
|
1159
1224
|
f"docker run -d --pull=always --net=host "
|
|
@@ -1164,7 +1229,6 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1164
1229
|
f"{image} "
|
|
1165
1230
|
f"./app "
|
|
1166
1231
|
f"{self.action_record_id} "
|
|
1167
|
-
|
|
1168
1232
|
)
|
|
1169
1233
|
logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
|
|
1170
1234
|
|
|
@@ -1185,7 +1249,13 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1185
1249
|
image = action_details["actionDetails"].get("docker")
|
|
1186
1250
|
|
|
1187
1251
|
self.setup_action_requirements(action_details)
|
|
1188
|
-
|
|
1252
|
+
|
|
1253
|
+
# Get the ws_host from environment variable set by inference_ws_server_execute
|
|
1254
|
+
ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
|
|
1255
|
+
ws_url = f"{ws_host}:8102"
|
|
1256
|
+
|
|
1257
|
+
logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
|
|
1258
|
+
|
|
1189
1259
|
# Frontend streaming with --net=host (Port: 3000)
|
|
1190
1260
|
worker_cmd = (
|
|
1191
1261
|
f"docker run -d --pull=always --net=host "
|
|
@@ -1195,9 +1265,10 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1195
1265
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1196
1266
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1197
1267
|
f"-e PORT=3000 "
|
|
1268
|
+
f'-e WS_HOST="{ws_url}" '
|
|
1198
1269
|
f"{image}"
|
|
1199
1270
|
)
|
|
1200
|
-
logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
|
|
1271
|
+
logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
|
|
1201
1272
|
|
|
1202
1273
|
# Docker Command run
|
|
1203
1274
|
self.start(worker_cmd, "fe_fs_streaming")
|
|
@@ -1304,6 +1375,11 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1304
1375
|
action_id=action_id,
|
|
1305
1376
|
)
|
|
1306
1377
|
|
|
1378
|
+
# Get the best IP for Redis (port 6379)
|
|
1379
|
+
redis_host, _ = get_best_service_ip_and_network(6379)
|
|
1380
|
+
|
|
1381
|
+
logging.info(f"Redis will use IP: {redis_host} on port 6379")
|
|
1382
|
+
|
|
1307
1383
|
redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
|
|
1308
1384
|
|
|
1309
1385
|
# Redis container with --net=host (Port: 6379)
|
|
@@ -1315,7 +1391,7 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1315
1391
|
f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
|
|
1316
1392
|
)
|
|
1317
1393
|
|
|
1318
|
-
logging.info("Starting Redis container
|
|
1394
|
+
logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
|
|
1319
1395
|
|
|
1320
1396
|
# Start Redis container first
|
|
1321
1397
|
redis_process = subprocess.Popen(
|
|
@@ -1324,13 +1400,13 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1324
1400
|
stdout=subprocess.PIPE,
|
|
1325
1401
|
stderr=subprocess.PIPE,
|
|
1326
1402
|
)
|
|
1327
|
-
logging.info("Redis container started successfully on
|
|
1403
|
+
logging.info("Redis container started successfully on %s:6379", redis_host)
|
|
1328
1404
|
|
|
1329
1405
|
# Wait for Redis to be ready
|
|
1330
1406
|
time.sleep(5)
|
|
1331
1407
|
|
|
1332
1408
|
env_vars = {
|
|
1333
|
-
"REDIS_URL": f"
|
|
1409
|
+
"REDIS_URL": f"{redis_host}:6379",
|
|
1334
1410
|
"REDIS_PASSWORD": redis_password,
|
|
1335
1411
|
}
|
|
1336
1412
|
|
|
@@ -1348,7 +1424,7 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1348
1424
|
f"{self.action_record_id} "
|
|
1349
1425
|
)
|
|
1350
1426
|
|
|
1351
|
-
logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
|
|
1427
|
+
logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)
|
|
1352
1428
|
|
|
1353
1429
|
self.start(cmd, "redis_setup")
|
|
1354
1430
|
|
|
@@ -1385,8 +1461,17 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1385
1461
|
model_family=model_family,
|
|
1386
1462
|
action_id=action_id,
|
|
1387
1463
|
)
|
|
1464
|
+
|
|
1465
|
+
# Get GPU configuration based on requirements and availability
|
|
1466
|
+
# This uses the best-fit algorithm to select the most appropriate GPU(s)
|
|
1388
1467
|
use_gpu = self.get_gpu_config(action_details)
|
|
1389
|
-
|
|
1468
|
+
|
|
1469
|
+
logging.info(
|
|
1470
|
+
"Action %s: Model deployment GPU config: %s",
|
|
1471
|
+
action_id,
|
|
1472
|
+
use_gpu if use_gpu else "CPU-only"
|
|
1473
|
+
)
|
|
1474
|
+
|
|
1390
1475
|
extra_env_vars = {"INTERNAL_PORT": internal_port}
|
|
1391
1476
|
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
|
|
1392
1477
|
logging.info("cmd is: %s", cmd)
|