matrice-compute 0.1.30__tar.gz → 0.1.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/action_instance.py +31 -47
  4. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/LICENSE.txt +0 -0
  5. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/README.md +0 -0
  6. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/matrice_compute.egg-info/SOURCES.txt +0 -0
  7. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/matrice_compute.egg-info/dependency_links.txt +0 -0
  8. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/matrice_compute.egg-info/not-zip-safe +0 -0
  9. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/matrice_compute.egg-info/top_level.txt +0 -0
  10. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/pyproject.toml +0 -0
  11. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/setup.cfg +0 -0
  12. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/setup.py +0 -0
  13. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/__init__.py +0 -0
  14. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/actions_manager.py +0 -0
  15. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  16. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/compute_operations_handler.py +0 -0
  17. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/instance_manager.py +0 -0
  18. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/instance_utils.py +0 -0
  19. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/prechecks.py +0 -0
  20. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/py.typed +0 -0
  21. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/resources_tracker.py +0 -0
  22. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/scaling.py +0 -0
  23. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.30 → matrice_compute-0.1.31}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.30
3
+ Version: 0.1.31
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.30
3
+ Version: 0.1.31
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -490,12 +490,11 @@ class ActionInstance:
490
490
  )
491
491
 
492
492
  cmd_parts = [
493
- f"docker run {use_gpu} ",
493
+ f"docker run -d {use_gpu} ",
494
494
  network_config,
495
495
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
496
496
  *volumes,
497
497
  # Container configuration and startup commands
498
- f"--cidfile ./{self.action_record_id}.cid ",
499
498
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
500
499
  f'/bin/bash -c "cd {docker_workdir} && '
501
500
  f"{env_exports} && "
@@ -897,60 +896,45 @@ class ActionInstance:
897
896
  self.cmd = cmd
898
897
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
899
898
 
900
- with open(self.log_path, "wb") as out:
901
- self.process = subprocess.Popen(
902
- shlex.split(self.cmd),
903
- stdout=out,
904
- stderr=out,
905
- env={**os.environ},
906
- start_new_session=True,
907
- )
899
+ # Run docker with -d flag to get container ID from stdout
900
+ process = subprocess.Popen(
901
+ shlex.split(self.cmd),
902
+ stdout=subprocess.PIPE,
903
+ stderr=subprocess.PIPE,
904
+ text=True,
905
+ env={**os.environ},
906
+ )
908
907
 
909
- self.container_id = None
910
-
911
- cid_file_path = f"./{self.action_record_id}.cid"
912
- max_retries = 5
913
- retry_delay = 1 # seconds
914
- for attempt in range(max_retries):
915
- try:
916
- with open(cid_file_path, "r") as cid_file:
917
- container_id = cid_file.read().strip()
918
- self.container_id = container_id
919
- logging.info(
920
- "Started process for action %s with container ID: %s",
921
- self.action_record_id,
922
- self.container_id,
923
- )
924
- break
925
- except FileNotFoundError:
926
- logging.warning(
927
- "CID file not found for action %s, attempt %d/%d",
928
- self.action_record_id,
929
- attempt + 1,
930
- max_retries,
931
- )
932
- time.sleep(retry_delay)
933
- except Exception as e:
934
- logging.error(
935
- "Error reading CID file for action %s: %s",
936
- self.action_record_id,
937
- str(e),
938
- )
939
- time.sleep(retry_delay)
940
- else:
908
+ stdout, stderr = process.communicate(timeout=120)
909
+
910
+ if process.returncode != 0:
941
911
  logging.error(
942
- "Failed to read CID file for action %s after %d attempts",
912
+ "Docker run failed for action %s: %s",
943
913
  self.action_record_id,
944
- max_retries,
914
+ stderr,
945
915
  )
946
- raise Exception("Failed to start process: CID file not found")
916
+ raise RuntimeError(f"Docker run failed: {stderr}")
917
+
918
+ self.container_id = stdout.strip()
919
+ logging.info(
920
+ "Started container for action %s with ID: %s",
921
+ self.action_record_id,
922
+ self.container_id,
923
+ )
947
924
 
948
- # report container id to scaling service
925
+ # Start following container logs in background
926
+ self.process = subprocess.Popen(
927
+ ["docker", "logs", "-f", self.container_id],
928
+ stdout=open(self.log_path, "wb"),
929
+ stderr=subprocess.STDOUT,
930
+ start_new_session=True,
931
+ )
932
+
933
+ # Report container id to scaling service
949
934
  self.scaling.update_action_container_id(
950
935
  action_record_id=self.action_record_id,
951
936
  container_id=self.container_id,
952
937
  )
953
-
954
938
 
955
939
  @log_errors(raise_exception=False)
956
940
  def start_logger(self):