matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ from matrice_compute.instance_utils import (
12
12
  get_gpu_with_sufficient_memory_for_action,
13
13
  get_decrypted_access_key_pair,
14
14
  get_max_file_system,
15
+ get_best_service_ip_and_network,
15
16
  )
16
17
  from matrice_compute.task_utils import (
17
18
  setup_workspace_and_run_task,
@@ -267,17 +268,68 @@ class ActionInstance:
267
268
  Returns:
268
269
  str: GPU configuration string
269
270
  """
270
- if not action_details["actionDetails"].get("gpuRequired", False):
271
+ action_id = action_details.get("_id", "unknown")
272
+
273
+ # Check if GPU is required
274
+ gpu_required = action_details["actionDetails"].get("gpuRequired", False)
275
+ if not gpu_required:
276
+ logging.info(
277
+ "Action %s does not require GPU - will run on CPU",
278
+ action_id
279
+ )
271
280
  return ""
272
- gpu_indices = get_gpu_with_sufficient_memory_for_action(
273
- action_details=action_details
281
+
282
+ # Get required GPU memory for logging
283
+ required_memory = action_details.get("actionDetails", {}).get(
284
+ "expectedResources", {}
285
+ ).get("gpuMemory", 0)
286
+
287
+ logging.info(
288
+ "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
289
+ action_id,
290
+ required_memory
274
291
  )
275
- if gpu_indices:
276
- gpu_str = ",".join(map(str, gpu_indices))
277
- logging.info("Using GPUs: %s", gpu_str)
278
- return f'--gpus "device={gpu_str}"'
279
- logging.info("No GPUs with sufficient memory found.")
280
- return ""
292
+
293
+ try:
294
+ # Get the best-fit GPU(s) with sufficient memory
295
+ gpu_indices = get_gpu_with_sufficient_memory_for_action(
296
+ action_details=action_details
297
+ )
298
+
299
+ if gpu_indices:
300
+ gpu_str = ",".join(map(str, gpu_indices))
301
+ logging.info(
302
+ "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
303
+ action_id,
304
+ gpu_str,
305
+ required_memory
306
+ )
307
+
308
+ # Return Docker GPU configuration
309
+ # Format: --gpus "device=0" or --gpus "device=0,1,2"
310
+ return f'--gpus "device={gpu_str}"'
311
+ else:
312
+ logging.warning(
313
+ "Action %s: No GPUs with sufficient memory found (required: %d MB)",
314
+ action_id,
315
+ required_memory
316
+ )
317
+ return ""
318
+
319
+ except ValueError as e:
320
+ logging.error(
321
+ "Action %s: Error selecting GPU - %s",
322
+ action_id,
323
+ str(e)
324
+ )
325
+ return ""
326
+ except Exception as e:
327
+ logging.error(
328
+ "Action %s: Unexpected error in GPU selection - %s",
329
+ action_id,
330
+ str(e)
331
+ )
332
+ return ""
281
333
 
282
334
  @log_errors(default_return="", raise_exception=False)
283
335
  def get_base_docker_cmd(
@@ -526,13 +578,18 @@ class ActionInstance:
526
578
 
527
579
  if username and password:
528
580
  login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
529
- subprocess.run(login_cmd, shell=True, check=True)
581
+ result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
582
+ if result.returncode != 0:
583
+ raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
530
584
  logging.info("Docker login successful")
531
585
  else:
532
586
  logging.warning(
533
587
  "Docker credentials not available, skipping Docker login"
534
588
  )
535
589
 
590
+ except subprocess.TimeoutExpired:
591
+ logging.error("Docker login timed out after 30 seconds")
592
+ raise Exception("Docker login timed out")
536
593
  except Exception as err:
537
594
  logging.error(
538
595
  "Docker login failed: %s",
@@ -1151,9 +1208,17 @@ def inference_ws_server_execute(self: ActionInstance):
1151
1208
  return
1152
1209
  image = action_details["actionDetails"].get("docker")
1153
1210
 
1154
-
1155
1211
  self.setup_action_requirements(action_details)
1156
1212
 
1213
+ # Get the best IP and network configuration for port 8102
1214
+ ws_host, use_host_network = get_best_service_ip_and_network(8102)
1215
+
1216
+ # Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
1217
+ if not os.environ.get("INFERENCE_WS_HOST"):
1218
+ os.environ["INFERENCE_WS_HOST"] = ws_host
1219
+
1220
+ logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1221
+
1157
1222
  # Inference WebSocket server with --net=host (Port: 8102)
1158
1223
  worker_cmd = (
1159
1224
  f"docker run -d --pull=always --net=host "
@@ -1164,7 +1229,6 @@ def inference_ws_server_execute(self: ActionInstance):
1164
1229
  f"{image} "
1165
1230
  f"./app "
1166
1231
  f"{self.action_record_id} "
1167
-
1168
1232
  )
1169
1233
  logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
1170
1234
 
@@ -1185,7 +1249,13 @@ def fe_fs_streaming_execute(self: ActionInstance):
1185
1249
  image = action_details["actionDetails"].get("docker")
1186
1250
 
1187
1251
  self.setup_action_requirements(action_details)
1188
-
1252
+
1253
+ # Get the ws_host from environment variable set by inference_ws_server_execute
1254
+ ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
1255
+ ws_url = f"{ws_host}:8102"
1256
+
1257
+ logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1258
+
1189
1259
  # Frontend streaming with --net=host (Port: 3000)
1190
1260
  worker_cmd = (
1191
1261
  f"docker run -d --pull=always --net=host "
@@ -1195,9 +1265,10 @@ def fe_fs_streaming_execute(self: ActionInstance):
1195
1265
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1196
1266
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1197
1267
  f"-e PORT=3000 "
1268
+ f'-e WS_HOST="{ws_url}" '
1198
1269
  f"{image}"
1199
1270
  )
1200
- logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
1271
+ logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
1201
1272
 
1202
1273
  # Docker Command run
1203
1274
  self.start(worker_cmd, "fe_fs_streaming")
@@ -1304,6 +1375,11 @@ def redis_setup_execute(self: ActionInstance):
1304
1375
  action_id=action_id,
1305
1376
  )
1306
1377
 
1378
+ # Get the best IP for Redis (port 6379)
1379
+ redis_host, _ = get_best_service_ip_and_network(6379)
1380
+
1381
+ logging.info(f"Redis will use IP: {redis_host} on port 6379")
1382
+
1307
1383
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1308
1384
 
1309
1385
  # Redis container with --net=host (Port: 6379)
@@ -1315,7 +1391,7 @@ def redis_setup_execute(self: ActionInstance):
1315
1391
  f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1316
1392
  )
1317
1393
 
1318
- logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
1394
+ logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
1319
1395
 
1320
1396
  # Start Redis container first
1321
1397
  redis_process = subprocess.Popen(
@@ -1324,13 +1400,13 @@ def redis_setup_execute(self: ActionInstance):
1324
1400
  stdout=subprocess.PIPE,
1325
1401
  stderr=subprocess.PIPE,
1326
1402
  )
1327
- logging.info("Redis container started successfully on localhost:6379")
1403
+ logging.info("Redis container started successfully on %s:6379", redis_host)
1328
1404
 
1329
1405
  # Wait for Redis to be ready
1330
1406
  time.sleep(5)
1331
1407
 
1332
1408
  env_vars = {
1333
- "REDIS_URL": f"localhost:6379",
1409
+ "REDIS_URL": f"{redis_host}:6379",
1334
1410
  "REDIS_PASSWORD": redis_password,
1335
1411
  }
1336
1412
 
@@ -1348,7 +1424,7 @@ def redis_setup_execute(self: ActionInstance):
1348
1424
  f"{self.action_record_id} "
1349
1425
  )
1350
1426
 
1351
- logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
1427
+ logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)
1352
1428
 
1353
1429
  self.start(cmd, "redis_setup")
1354
1430
 
@@ -1385,8 +1461,17 @@ def model_deploy_execute(self: ActionInstance):
1385
1461
  model_family=model_family,
1386
1462
  action_id=action_id,
1387
1463
  )
1464
+
1465
+ # Get GPU configuration based on requirements and availability
1466
+ # This uses the best-fit algorithm to select the most appropriate GPU(s)
1388
1467
  use_gpu = self.get_gpu_config(action_details)
1389
- use_gpu = "--runtime=nvidia "
1468
+
1469
+ logging.info(
1470
+ "Action %s: Model deployment GPU config: %s",
1471
+ action_id,
1472
+ use_gpu if use_gpu else "CPU-only"
1473
+ )
1474
+
1390
1475
  extra_env_vars = {"INTERNAL_PORT": internal_port}
1391
1476
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1392
1477
  logging.info("cmd is: %s", cmd)