matrice-compute 0.1.13__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/action_instance.py +117 -102
  4. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/instance_utils.py +7 -1
  5. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/resources_tracker.py +16 -4
  6. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/scaling.py +5 -2
  7. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/task_utils.py +2 -2
  8. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/LICENSE.txt +0 -0
  9. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/README.md +0 -0
  10. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/matrice_compute.egg-info/SOURCES.txt +0 -0
  11. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/matrice_compute.egg-info/dependency_links.txt +0 -0
  12. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/matrice_compute.egg-info/not-zip-safe +0 -0
  13. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/matrice_compute.egg-info/top_level.txt +0 -0
  14. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/pyproject.toml +0 -0
  15. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/setup.cfg +0 -0
  16. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/setup.py +0 -0
  17. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/__init__.py +0 -0
  18. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/actions_manager.py +0 -0
  19. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  20. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/instance_manager.py +0 -0
  21. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/prechecks.py +0 -0
  22. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/py.typed +0 -0
  23. {matrice_compute-0.1.13 → matrice_compute-0.1.14}/src/matrice_compute/shutdown_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.13
3
+ Version: 0.1.14
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.13
3
+ Version: 0.1.14
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -575,75 +575,75 @@ class ActionInstance:
575
575
  )
576
576
  raise
577
577
 
578
- @log_errors(raise_exception=False)
579
- def create_redis_container(self, redis_image=None, redis_password=None):
580
- """Create and start a Redis container using Docker.
581
-
582
- Args:
583
- redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
584
-
585
- Returns:
586
- tuple: (container_info, error, message)
587
- """
588
- if redis_image is None:
589
- redis_image = "redis:latest"
590
-
591
- network_name = f"redis_network_{int(time.time())}"
592
- subprocess.run(f"docker network create {network_name}", shell=True, check=True)
593
-
594
- try:
595
- # Get an available port for Redis
596
- external_port = "6379"
597
-
598
- # Generate a unique container name and password
599
- container_name = f"redis_container_{int(time.time())}"
600
-
601
- # Build the docker command to create Redis container with password
602
- cmd = (
603
- f"docker run -d "
604
- f"--network {network_name} "
605
- f"--name {container_name} "
606
- f"-p {external_port}:6379 "
607
- f"--restart unless-stopped "
608
- f"{redis_image} "
609
- f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
610
- )
611
-
612
- logging.info("Creating Redis container with command: %s", cmd)
613
-
614
- # Execute the command
615
- result = subprocess.run(
616
- cmd, shell=True, capture_output=True, text=True, timeout=60
617
- )
618
-
619
- if result.returncode == 0:
620
- container_id = result.stdout.strip()
621
- container_info = {
622
- "container_id": container_id,
623
- "container_name": container_name,
624
- "network_name": network_name,
625
- "external_port": external_port,
626
- "internal_port": 6379,
627
- "password": redis_password,
628
- "image": redis_image,
629
- "status": "running",
630
- }
631
-
632
- logging.info("Redis container created successfully: %s", container_info)
633
- return container_info, None, "Redis container created successfully"
634
- else:
635
- error_message = f"Failed to create Redis container: {result.stderr}"
636
- logging.error(error_message)
637
- return None, "ContainerCreationError", error_message
638
-
639
- except subprocess.TimeoutExpired:
640
- error_message = "Timeout while creating Redis container"
641
- logging.error(error_message)
642
- return None, "TimeoutError", error_message
643
- except Exception as e:
644
- error_message = f"Unexpected error creating Redis container: {str(e)}"
645
- logging.error(error_message)
646
- return None, "UnexpectedError", error_message
578
+ # @log_errors(raise_exception=False)
579
+ # def create_redis_container(self, redis_image=None, redis_password=None):
580
+ # """Create and start a Redis container using Docker.
581
+
582
+ # Args:
583
+ # redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
584
+
585
+ # Returns:
586
+ # tuple: (container_info, error, message)
587
+ # """
588
+ # if redis_image is None:
589
+ # redis_image = "redis:latest"
590
+
591
+ # network_name = f"redis_network_{int(time.time())}"
592
+ # subprocess.run(f"docker network create {network_name}", shell=True, check=True)
593
+
594
+ # try:
595
+ # # Get an available port for Redis
596
+ # external_port = "6379"
597
+
598
+ # # Generate a unique container name and password
599
+ # container_name = f"redis_container_{int(time.time())}"
600
+
601
+ # # Build the docker command to create Redis container with password
602
+ # cmd = (
603
+ # f"docker run -d "
604
+ # f"--network {network_name} "
605
+ # f"--name {container_name} "
606
+ # f"-p {external_port}:6379 "
607
+ # f"--restart unless-stopped "
608
+ # f"{redis_image} "
609
+ # f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
610
+ # )
611
+
612
+ # logging.info("Creating Redis container with command: %s", cmd)
613
+
614
+ # # Execute the command
615
+ # result = subprocess.run(
616
+ # cmd, shell=True, capture_output=True, text=True, timeout=60
617
+ # )
618
+
619
+ # if result.returncode == 0:
620
+ # container_id = result.stdout.strip()
621
+ # container_info = {
622
+ # "container_id": container_id,
623
+ # "container_name": container_name,
624
+ # "network_name": network_name,
625
+ # "external_port": external_port,
626
+ # "internal_port": 6379,
627
+ # "password": redis_password,
628
+ # "image": redis_image,
629
+ # "status": "running",
630
+ # }
631
+
632
+ # logging.info("Redis container created successfully: %s", container_info)
633
+ # return container_info, None, "Redis container created successfully"
634
+ # else:
635
+ # error_message = f"Failed to create Redis container: {result.stderr}"
636
+ # logging.error(error_message)
637
+ # return None, "ContainerCreationError", error_message
638
+
639
+ # except subprocess.TimeoutExpired:
640
+ # error_message = "Timeout while creating Redis container"
641
+ # logging.error(error_message)
642
+ # return None, "TimeoutError", error_message
643
+ # except Exception as e:
644
+ # error_message = f"Unexpected error creating Redis container: {str(e)}"
645
+ # logging.error(error_message)
646
+ # return None, "UnexpectedError", error_message
647
647
 
648
648
  @log_errors(raise_exception=False, log_error=False)
649
649
  def send_logs_continuously(self):
@@ -1053,7 +1053,7 @@ def database_setup_execute(self: ActionInstance):
1053
1053
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1054
1054
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
1055
1055
  f"-e PROJECT_ID={project_id} "
1056
- f"-e ENV=dev "
1056
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1057
1057
  f"{image} "
1058
1058
  )
1059
1059
  print("Docker command", cmd)
@@ -1124,6 +1124,7 @@ def lpr_setup_execute(self: ActionInstance):
1124
1124
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1125
1125
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1126
1126
  f'-e ACTION_ID="{self.action_record_id}" '
1127
+ f'-e PORT={external_port} '
1127
1128
  f"{image}"
1128
1129
  )
1129
1130
  print("Worker docker run command:", worker_cmd)
@@ -1134,27 +1135,31 @@ def lpr_setup_execute(self: ActionInstance):
1134
1135
  @log_errors(raise_exception=False)
1135
1136
  def inference_ws_server_execute(self: ActionInstance):
1136
1137
  """
1137
- Creates and start inference pipline.
1138
+ Creates and start inference pipeline.
1139
+ Inference WebSocket server runs on port 8102 (localhost only with --net=host).
1138
1140
  """
1139
1141
  action_details = self.get_action_details()
1140
1142
 
1141
1143
  if not action_details:
1142
1144
  return
1143
1145
  image = action_details["actionDetails"].get("docker")
1146
+
1144
1147
 
1145
1148
  self.setup_action_requirements(action_details)
1146
1149
 
1147
- # Add worker container run command
1150
+ # Inference WebSocket server with --net=host (Port: 8102)
1148
1151
  worker_cmd = (
1149
- f"docker run -d --pull=always "
1152
+ f"docker run -d --pull=always --net=host "
1150
1153
  f"--name inference "
1151
- f"-p 8102:8102 "
1152
1154
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1153
1155
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1154
1156
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1155
- f"{image}"
1157
+ f"{image} "
1158
+ f"./app "
1159
+ f"{self.action_record_id} "
1160
+
1156
1161
  )
1157
- print("inference docker run command:", worker_cmd)
1162
+ logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
1158
1163
 
1159
1164
  # Docker Command run
1160
1165
  self.start(worker_cmd, "inference_ws_server")
@@ -1163,7 +1168,8 @@ def inference_ws_server_execute(self: ActionInstance):
1163
1168
  @log_errors(raise_exception=False)
1164
1169
  def fe_fs_streaming_execute(self: ActionInstance):
1165
1170
  """
1166
- Creates and setup the frontend for fs streaming
1171
+ Creates and setup the frontend for fs streaming.
1172
+ Frontend streaming runs on port 3000 (localhost only with --net=host).
1167
1173
  """
1168
1174
  action_details = self.get_action_details()
1169
1175
 
@@ -1173,17 +1179,16 @@ def fe_fs_streaming_execute(self: ActionInstance):
1173
1179
 
1174
1180
  self.setup_action_requirements(action_details)
1175
1181
 
1176
- # Add worker container run command
1182
+ # Frontend streaming with --net=host (Port: 3000)
1177
1183
  worker_cmd = (
1178
- f"docker run -d --pull=always "
1184
+ f"docker run -d --pull=always --net=host "
1179
1185
  f"--name fe_streaming "
1180
- f"-p 3000:3000 "
1181
1186
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1182
1187
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1183
1188
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1184
1189
  f"{image}"
1185
1190
  )
1186
- print("fe_fs_stremaing docker run command:", worker_cmd)
1191
+ logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
1187
1192
 
1188
1193
  # Docker Command run
1189
1194
  self.start(worker_cmd, "fe_fs_streaming")
@@ -1235,8 +1240,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1235
1240
  def redis_setup_execute(self: ActionInstance):
1236
1241
  """
1237
1242
  Creates and starts a Redis container using Docker.
1243
+ Redis runs on port 6379 (localhost only with --net=host).
1238
1244
  """
1239
- external_port = self.scaling.get_open_port()
1240
1245
  work_fs = get_max_file_system()
1241
1246
 
1242
1247
  action_details = self.get_action_details()
@@ -1248,18 +1253,6 @@ def redis_setup_execute(self: ActionInstance):
1248
1253
  "password", f"redis_pass_{int(time.time())}"
1249
1254
  )
1250
1255
 
1251
- container_info, error, message = self.create_redis_container(
1252
- action_details["actionDetails"].get("redis_image", "redis:latest"),
1253
- redis_password=redis_password,
1254
- )
1255
- if error:
1256
- logging.error(
1257
- "Error creating Redis container: %s",
1258
- message,
1259
- )
1260
- return
1261
- logging.info("Redis container created successfully: %s", container_info)
1262
-
1263
1256
  # Initialize redis container
1264
1257
  self.setup_action_requirements(
1265
1258
  action_details,
@@ -1268,17 +1261,39 @@ def redis_setup_execute(self: ActionInstance):
1268
1261
  action_id=action_id,
1269
1262
  )
1270
1263
 
1264
+ redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1265
+
1266
+ # Redis container with --net=host (Port: 6379)
1267
+ redis_cmd = (
1268
+ f"docker run -d --net=host "
1269
+ f"--name redis_container_{int(time.time())} "
1270
+ f"--restart unless-stopped "
1271
+ f"{redis_image} "
1272
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1273
+ )
1274
+
1275
+ logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
1276
+
1277
+ # Start Redis container first
1278
+ redis_process = subprocess.Popen(
1279
+ redis_cmd,
1280
+ shell=True,
1281
+ stdout=subprocess.PIPE,
1282
+ stderr=subprocess.PIPE,
1283
+ )
1284
+ logging.info("Redis container started successfully on localhost:6379")
1285
+
1286
+ # Wait for Redis to be ready
1287
+ time.sleep(5)
1288
+
1271
1289
  env_vars = {
1272
- "REDIS_URL": f"{container_info['container_name']}:{container_info['external_port']}",
1273
- "REDIS_PASSWORD": container_info["password"],
1290
+ "REDIS_URL": f"localhost:6379",
1291
+ "REDIS_PASSWORD": redis_password,
1274
1292
  }
1275
1293
 
1276
- network_config = f" --network {container_info['network_name']} -p 8082:8082"
1277
-
1278
- # Make the docker file here
1294
+ # bg-redis management container with --net=host (Port: 8082)
1279
1295
  cmd = (
1280
- f"docker run "
1281
- f"{network_config} "
1296
+ f"docker run --net=host "
1282
1297
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1283
1298
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1284
1299
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1290,7 +1305,7 @@ def redis_setup_execute(self: ActionInstance):
1290
1305
  f"{self.action_record_id} "
1291
1306
  )
1292
1307
 
1293
- logging.info("cmd is: %s", cmd)
1308
+ logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
1294
1309
 
1295
1310
  self.start(cmd, "redis_setup")
1296
1311
 
@@ -366,7 +366,13 @@ def get_max_file_system() -> str:
366
366
  max_available_filesystem,
367
367
  max_disk["available"],
368
368
  )
369
- if max_available_filesystem in ["/", ""]:
369
+ # Check if filesystem is writable, or if it's root/empty
370
+ if max_available_filesystem in ["/", ""] or not os.access(max_available_filesystem, os.W_OK):
371
+ if max_available_filesystem not in ["/", ""]:
372
+ logging.warning(
373
+ "Filesystem %s is not writable, falling back to home directory",
374
+ max_available_filesystem,
375
+ )
370
376
  home_dir = os.path.expanduser("~")
371
377
  if not os.environ.get("WORKSPACE_DIR"):
372
378
  logging.error("WORKSPACE_DIR environment variable not set")
@@ -247,10 +247,14 @@ class ActionsResourcesTracker:
247
247
  self.max_actions_usage = {}
248
248
  self.resources_tracker = ResourcesTracker()
249
249
  self.client = docker.from_env()
250
+ self.logged_stopped_containers = []
250
251
 
251
252
  @log_errors(raise_exception=False, log_error=True)
252
253
  def update_actions_resources(self) -> None:
253
- """Process both running and exited containers"""
254
+ """Process both running and exited containers.
255
+
256
+ Note: Does not remove containers to keep logs. Only tracks resource usage.
257
+ """
254
258
  exited_containers = self.client.containers.list(
255
259
  filters={"status": "exited"},
256
260
  all=True,
@@ -259,8 +263,12 @@ class ActionsResourcesTracker:
259
263
  if exited_containers:
260
264
  for container in exited_containers:
261
265
  try:
266
+ if container.id in self.logged_stopped_containers:
267
+ continue
262
268
  self._update_container_action_status(container, "completed")
263
- container.remove()
269
+ self.logged_stopped_containers.append(container.id)
270
+ # COMMENTED OUT: Do not remove containers to keep logs
271
+ # container.remove()
264
272
  except Exception as err:
265
273
  logging.error(
266
274
  "Error processing exited container %s: %s",
@@ -310,7 +318,7 @@ class ActionsResourcesTracker:
310
318
  args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
311
319
  action_record_id = args_24[-1] if args_24 else None
312
320
  if not action_record_id:
313
- logging.warning("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
321
+ logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
314
322
  duration = calculate_time_difference(start_time, finish_time)
315
323
  (
316
324
  current_gpu_utilization,
@@ -320,6 +328,8 @@ class ActionsResourcesTracker:
320
328
  ) = self.get_current_action_usage(container, status)
321
329
  sub_containers = self.get_sub_containers_by_label("action_id", action_record_id)
322
330
  for sub_container in sub_containers:
331
+ if sub_container.id in self.logged_stopped_containers:
332
+ continue
323
333
  (
324
334
  sub_container_gpu_utilization,
325
335
  sub_container_gpu_memory,
@@ -330,10 +340,12 @@ class ActionsResourcesTracker:
330
340
  current_gpu_memory += sub_container_gpu_memory
331
341
  current_cpu_utilization += sub_container_cpu_utilization
332
342
  current_memory_utilization += sub_container_memory_utilization
343
+ # COMMENTED OUT: Do not stop/remove sub-containers to keep logs
333
344
  if status == "completed":
334
345
  try:
335
346
  sub_container.stop()
336
- sub_container.remove(force=True)
347
+ self.logged_stopped_containers.append(sub_container.id)
348
+ # sub_container.remove(force=True)
337
349
  except Exception as err:
338
350
  logging.error(
339
351
  "Error removing sub-container %s: %s",
@@ -9,6 +9,7 @@ import json
9
9
  import time
10
10
  import base64
11
11
 
12
+ # TODO: update /scaling to /compute
12
13
 
13
14
  class Scaling:
14
15
 
@@ -185,6 +186,8 @@ class Scaling:
185
186
  Returns:
186
187
  Tuple of (data, error, message) from API response
187
188
  """
189
+ if not action_record_id:
190
+ return None, "Action record id is required", "Action record id is required"
188
191
  logging.info(
189
192
  "Updating action status for action %s",
190
193
  action_record_id,
@@ -499,7 +502,7 @@ class Scaling:
499
502
 
500
503
  # Using REST API directly
501
504
  try:
502
- path = f"/v1/scaling/update_available_resources/{self.instance_id}"
505
+ path = f"/v1/compute/update_available_resources/{self.instance_id}"
503
506
  resp = self.rpc.put(path=path, payload=payload)
504
507
  return self.handle_response(
505
508
  resp,
@@ -644,7 +647,7 @@ class Scaling:
644
647
  Returns:
645
648
  Tuple of (data, error, message) from API response
646
649
  """
647
- path = f"/v1/scaling/get_models_secret_keys?secret_name={secret_name}"
650
+ path = f"/v1/compute/get_models_secret_keys?secret_name={secret_name}"
648
651
  resp = self.rpc.get(path=path)
649
652
  return self.handle_response(
650
653
  resp,
@@ -29,8 +29,8 @@ def setup_workspace_and_run_task(
29
29
  workspace_dir = f"{work_fs}/{action_id}"
30
30
  codebase_zip_path = f"{workspace_dir}/file.zip"
31
31
  requirements_txt_path = f"{workspace_dir}/requirements.txt"
32
- if os.path.exists(workspace_dir):
33
- return
32
+ # if os.path.exists(workspace_dir): # don't skip if workspace already exists, override it
33
+ # return
34
34
  os.makedirs(workspace_dir, exist_ok=True)
35
35
 
36
36
  # Download codebase ZIP file