matrice-compute: matrice_compute-0.1.30-py3-none-any.whl → matrice_compute-0.1.31-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +31 -47
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.31.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.31.dist-info}/RECORD +6 -6
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.31.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.31.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.31.dist-info}/top_level.txt +0 -0
|
@@ -490,12 +490,11 @@ class ActionInstance:
|
|
|
490
490
|
)
|
|
491
491
|
|
|
492
492
|
cmd_parts = [
|
|
493
|
-
f"docker run {use_gpu} ",
|
|
493
|
+
f"docker run -d {use_gpu} ",
|
|
494
494
|
network_config,
|
|
495
495
|
*[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
|
|
496
496
|
*volumes,
|
|
497
497
|
# Container configuration and startup commands
|
|
498
|
-
f"--cidfile ./{self.action_record_id}.cid ",
|
|
499
498
|
f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
|
|
500
499
|
f'/bin/bash -c "cd {docker_workdir} && '
|
|
501
500
|
f"{env_exports} && "
|
|
@@ -897,60 +896,45 @@ class ActionInstance:
|
|
|
897
896
|
self.cmd = cmd
|
|
898
897
|
self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
|
|
899
898
|
|
|
900
|
-
with
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
899
|
+
# Run docker with -d flag to get container ID from stdout
|
|
900
|
+
process = subprocess.Popen(
|
|
901
|
+
shlex.split(self.cmd),
|
|
902
|
+
stdout=subprocess.PIPE,
|
|
903
|
+
stderr=subprocess.PIPE,
|
|
904
|
+
text=True,
|
|
905
|
+
env={**os.environ},
|
|
906
|
+
)
|
|
908
907
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
max_retries = 5
|
|
913
|
-
retry_delay = 1 # seconds
|
|
914
|
-
for attempt in range(max_retries):
|
|
915
|
-
try:
|
|
916
|
-
with open(cid_file_path, "r") as cid_file:
|
|
917
|
-
container_id = cid_file.read().strip()
|
|
918
|
-
self.container_id = container_id
|
|
919
|
-
logging.info(
|
|
920
|
-
"Started process for action %s with container ID: %s",
|
|
921
|
-
self.action_record_id,
|
|
922
|
-
self.container_id,
|
|
923
|
-
)
|
|
924
|
-
break
|
|
925
|
-
except FileNotFoundError:
|
|
926
|
-
logging.warning(
|
|
927
|
-
"CID file not found for action %s, attempt %d/%d",
|
|
928
|
-
self.action_record_id,
|
|
929
|
-
attempt + 1,
|
|
930
|
-
max_retries,
|
|
931
|
-
)
|
|
932
|
-
time.sleep(retry_delay)
|
|
933
|
-
except Exception as e:
|
|
934
|
-
logging.error(
|
|
935
|
-
"Error reading CID file for action %s: %s",
|
|
936
|
-
self.action_record_id,
|
|
937
|
-
str(e),
|
|
938
|
-
)
|
|
939
|
-
time.sleep(retry_delay)
|
|
940
|
-
else:
|
|
908
|
+
stdout, stderr = process.communicate(timeout=120)
|
|
909
|
+
|
|
910
|
+
if process.returncode != 0:
|
|
941
911
|
logging.error(
|
|
942
|
-
"
|
|
912
|
+
"Docker run failed for action %s: %s",
|
|
943
913
|
self.action_record_id,
|
|
944
|
-
|
|
914
|
+
stderr,
|
|
945
915
|
)
|
|
946
|
-
raise
|
|
916
|
+
raise RuntimeError(f"Docker run failed: {stderr}")
|
|
917
|
+
|
|
918
|
+
self.container_id = stdout.strip()
|
|
919
|
+
logging.info(
|
|
920
|
+
"Started container for action %s with ID: %s",
|
|
921
|
+
self.action_record_id,
|
|
922
|
+
self.container_id,
|
|
923
|
+
)
|
|
947
924
|
|
|
948
|
-
#
|
|
925
|
+
# Start following container logs in background
|
|
926
|
+
self.process = subprocess.Popen(
|
|
927
|
+
["docker", "logs", "-f", self.container_id],
|
|
928
|
+
stdout=open(self.log_path, "wb"),
|
|
929
|
+
stderr=subprocess.STDOUT,
|
|
930
|
+
start_new_session=True,
|
|
931
|
+
)
|
|
932
|
+
|
|
933
|
+
# Report container id to scaling service
|
|
949
934
|
self.scaling.update_action_container_id(
|
|
950
935
|
action_record_id=self.action_record_id,
|
|
951
936
|
container_id=self.container_id,
|
|
952
937
|
)
|
|
953
|
-
|
|
954
938
|
|
|
955
939
|
@log_errors(raise_exception=False)
|
|
956
940
|
def start_logger(self):
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
|
|
2
|
-
matrice_compute/action_instance.py,sha256=
|
|
2
|
+
matrice_compute/action_instance.py,sha256=NpI7uCaLJ5GKdW-2JBGCjTwijb8XBrRc7GKRC4uhQF4,76650
|
|
3
3
|
matrice_compute/actions_manager.py,sha256=Iex5uw0PLRR4pvIAZDxc2CypucbanKDbJ3SK8mMGXK8,18148
|
|
4
4
|
matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
|
|
5
5
|
matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
|
|
@@ -11,8 +11,8 @@ matrice_compute/resources_tracker.py,sha256=1jSLrIFlOh-vgyNzFrUrE2Ak2JAGCIfV7wcy
|
|
|
11
11
|
matrice_compute/scaling.py,sha256=cdEJqdVsPGDeOjkVAG85lubOn-qwDRV5qqmrNl_XpCM,55146
|
|
12
12
|
matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
|
|
13
13
|
matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
|
|
14
|
-
matrice_compute-0.1.
|
|
15
|
-
matrice_compute-0.1.
|
|
16
|
-
matrice_compute-0.1.
|
|
17
|
-
matrice_compute-0.1.
|
|
18
|
-
matrice_compute-0.1.
|
|
14
|
+
matrice_compute-0.1.31.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
|
|
15
|
+
matrice_compute-0.1.31.dist-info/METADATA,sha256=nhJU2AA0SxaSWMZXKjYtAthzjbjdEmmD3agMYqukQx8,1038
|
|
16
|
+
matrice_compute-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
+
matrice_compute-0.1.31.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
|
|
18
|
+
matrice_compute-0.1.31.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|