matrice-compute 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -495,6 +495,7 @@ class ActionInstance:
495
495
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
496
496
  *volumes,
497
497
  # Container configuration and startup commands
498
+ f"--cidfile ./{self.action_record_id}.cid ",
498
499
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
499
500
  f'/bin/bash -c "cd {docker_workdir} && '
500
501
  f"{env_exports} && "
@@ -895,6 +896,7 @@ class ActionInstance:
895
896
  """
896
897
  self.cmd = cmd
897
898
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
899
+
898
900
  with open(self.log_path, "wb") as out:
899
901
  self.process = subprocess.Popen(
900
902
  shlex.split(self.cmd),
@@ -903,6 +905,52 @@ class ActionInstance:
903
905
  env={**os.environ},
904
906
  start_new_session=True,
905
907
  )
908
+
909
+ self.container_id = None
910
+
911
+ cid_file_path = f"./{self.action_record_id}.cid"
912
+ max_retries = 5
913
+ retry_delay = 1 # seconds
914
+ for attempt in range(max_retries):
915
+ try:
916
+ with open(cid_file_path, "r") as cid_file:
917
+ container_id = cid_file.read().strip()
918
+ self.container_id = container_id
919
+ logging.info(
920
+ "Started process for action %s with container ID: %s",
921
+ self.action_record_id,
922
+ self.container_id,
923
+ )
924
+ break
925
+ except FileNotFoundError:
926
+ logging.warning(
927
+ "CID file not found for action %s, attempt %d/%d",
928
+ self.action_record_id,
929
+ attempt + 1,
930
+ max_retries,
931
+ )
932
+ time.sleep(retry_delay)
933
+ except Exception as e:
934
+ logging.error(
935
+ "Error reading CID file for action %s: %s",
936
+ self.action_record_id,
937
+ str(e),
938
+ )
939
+ time.sleep(retry_delay)
940
+ else:
941
+ logging.error(
942
+ "Failed to read CID file for action %s after %d attempts",
943
+ self.action_record_id,
944
+ max_retries,
945
+ )
946
+ raise Exception("Failed to start process: CID file not found")
947
+
948
+ # report container id to scaling service
949
+ self.scaling.update_action_container_id(
950
+ action_record_id=self.action_record_id,
951
+ container_id=self.container_id,
952
+ )
953
+
906
954
 
907
955
  @log_errors(raise_exception=False)
908
956
  def start_logger(self):
@@ -1172,11 +1220,27 @@ def database_setup_execute(self: ActionInstance):
1172
1220
 
1173
1221
  project_id = action_details["_idProject"]
1174
1222
 
1223
+ if action_details["actionDetails"].get("containerId"):
1224
+ logging.info(
1225
+ "Using existing container ID for inference tracker: %s",
1226
+ action_details["actionDetails"]["containerId"],
1227
+ )
1228
+ self.docker_container = action_details["actionDetails"]["containerId"]
1229
+ cmd = "docker restart " + self.docker_container
1230
+ self.start(cmd, "qdrant_setup")
1231
+
1232
+ #qdrant restart
1233
+ qdrant_cmd = "docker restart qdrant"
1234
+ self.start(qdrant_cmd, 'qdrant_setup')
1235
+
1236
+ return
1237
+
1175
1238
  # MongoDB container with --net=host (Port: 27020:27017)
1176
1239
  cmd = (
1177
1240
  f"docker run --pull=always --net=host "
1178
1241
  f"--name mongodbdatabase "
1179
1242
  f"-v matrice_myvol:/matrice_data "
1243
+ f"--cidfile ./{self.action_record_id}.cid "
1180
1244
  f"-e ACTION_RECORD_ID={self.action_record_id} "
1181
1245
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1182
1246
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1215,11 +1279,23 @@ def facial_recognition_setup_execute(self: ActionInstance):
1215
1279
 
1216
1280
  self.setup_action_requirements(action_details)
1217
1281
 
1282
+ if action_details["actionDetails"].get("containerId"):
1283
+ logging.info(
1284
+ "Using existing container ID for facial recognition worker: %s",
1285
+ action_details["actionDetails"]["containerId"],
1286
+ )
1287
+ self.docker_container = action_details["actionDetails"]["containerId"]
1288
+ cmd = "docker restart " + self.docker_container
1289
+ self.start(cmd, "facial_recognition_setup")
1290
+ return
1291
+
1218
1292
  # Facial recognition worker container with --net=host (Port: 8081)
1219
1293
  worker_cmd = (
1220
1294
  f"docker run -d --pull=always --net=host "
1221
1295
  f"--name worker "
1296
+ f"--cidfile ./{self.action_record_id}.cid "
1222
1297
  f"-v matrice_myvol:/matrice_data "
1298
+ f"--cidfile ./{self.action_record_id}.cid "
1223
1299
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1224
1300
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1225
1301
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1245,10 +1321,21 @@ def lpr_setup_execute(self: ActionInstance):
1245
1321
 
1246
1322
  self.setup_action_requirements(action_details)
1247
1323
 
1324
+ if action_details["actionDetails"].get("containerId"):
1325
+ logging.info(
1326
+ "Using existing container ID for LPR worker: %s",
1327
+ action_details["actionDetails"]["containerId"],
1328
+ )
1329
+ self.docker_container = action_details["actionDetails"]["containerId"]
1330
+ cmd = "docker restart " + self.docker_container
1331
+ self.start(cmd, "lpr_setup")
1332
+ return
1333
+
1248
1334
  # LPR worker container with --net=host (Port: 8082)
1249
1335
  worker_cmd = (
1250
1336
  f"docker run -d --net=host --pull=always "
1251
1337
  f"--name lpr-worker "
1338
+ f"--cidfile ./{self.action_record_id}.cid "
1252
1339
  f"-v matrice_myvol:/matrice_data "
1253
1340
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1254
1341
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1285,10 +1372,21 @@ def inference_ws_server_execute(self: ActionInstance):
1285
1372
 
1286
1373
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1287
1374
 
1375
+ if action_details["actionDetails"].get("containerId"):
1376
+ logging.info(
1377
+ "Using existing container ID for inference WebSocket server: %s",
1378
+ action_details["actionDetails"]["containerId"],
1379
+ )
1380
+ self.docker_container = action_details["actionDetails"]["containerId"]
1381
+ cmd = "docker restart " + self.docker_container
1382
+ self.start(cmd, "inference_ws_server")
1383
+ return
1384
+
1288
1385
  # Inference WebSocket server with --net=host (Port: 8102)
1289
1386
  worker_cmd = (
1290
1387
  f"docker run -d --pull=always --net=host "
1291
1388
  f"--name inference "
1389
+ f"--cidfile ./{self.action_record_id}.cid "
1292
1390
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1293
1391
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1294
1392
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,11 +1419,22 @@ def fe_fs_streaming_execute(self: ActionInstance):
1321
1419
  ws_url = f"{ws_host}:8102"
1322
1420
 
1323
1421
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1422
+
1423
+ if action_details["actionDetails"].get("containerId"):
1424
+ logging.info(
1425
+ "Using existing container ID for frontend streaming: %s",
1426
+ action_details["actionDetails"]["containerId"],
1427
+ )
1428
+ self.docker_container = action_details["actionDetails"]["containerId"]
1429
+ cmd = "docker restart " + self.docker_container
1430
+ self.start(cmd, "fe_fs_streaming")
1431
+ return
1324
1432
 
1325
1433
  # Frontend streaming with --net=host (Port: 3000)
1326
1434
  worker_cmd = (
1327
1435
  f"docker run -d --pull=always --net=host "
1328
1436
  f"--name fe_streaming "
1437
+ f"--cidfile ./{self.action_record_id}.cid "
1329
1438
  f"-v matrice_myvol:/matrice_data "
1330
1439
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1331
1440
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1355,11 +1464,22 @@ def fe_analytics_service_execute(self: ActionInstance):
1355
1464
  self.setup_action_requirements(action_details)
1356
1465
 
1357
1466
  project_id = action_details["_idProject"]
1467
+
1468
+ if action_details["actionDetails"].get("containerId"):
1469
+ logging.info(
1470
+ "Using existing container ID for frontend analytics service: %s",
1471
+ action_details["actionDetails"]["containerId"],
1472
+ )
1473
+ self.docker_container = action_details["actionDetails"]["containerId"]
1474
+ cmd = "docker restart " + self.docker_container
1475
+ self.start(cmd, "fe_analytics_service")
1476
+ return
1358
1477
 
1359
1478
  # Frontend analytics service with --net=host (Port: 3001)
1360
1479
  worker_cmd = (
1361
1480
  f"docker run -d --pull=always --net=host "
1362
1481
  f"--name fe-analytics "
1482
+ f"--cidfile ./{self.action_record_id}.cid "
1363
1483
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1364
1484
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1365
1485
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1447,11 +1567,27 @@ def redis_setup_execute(self: ActionInstance):
1447
1567
  logging.info(f"Redis will use IP: {redis_host} on port 6379")
1448
1568
 
1449
1569
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1570
+
1571
+
1572
+ if action_details["actionDetails"].get("containerId"):
1573
+ logging.info(
1574
+ "Using existing container ID for redis management: %s",
1575
+ action_details["actionDetails"]["containerId"],
1576
+ )
1577
+ self.docker_container = action_details["actionDetails"]["containerId"]
1578
+ cmd = "docker restart " + self.docker_container
1579
+ self.start(cmd, "redis_setup")
1580
+
1581
+ # Redis container restart
1582
+ redis_restart_cmd = "docker restart redis_container"
1583
+ self.start(redis_restart_cmd, "redis")
1584
+
1585
+ return
1450
1586
 
1451
1587
  # Redis container with --net=host (Port: 6379)
1452
1588
  redis_cmd = (
1453
1589
  f"docker run -d --net=host "
1454
- f"--name redis_container_{int(time.time())} "
1590
+ f"--name redis_container"
1455
1591
  f"--restart unless-stopped "
1456
1592
  f"--memory=32g "
1457
1593
  f"--cpus=8 "
@@ -1496,6 +1632,7 @@ def redis_setup_execute(self: ActionInstance):
1496
1632
  # bg-redis management container with --net=host (Port: 8082)
1497
1633
  cmd = (
1498
1634
  f"docker run --net=host "
1635
+ f"--cidfile ./{self.action_record_id}.cid "
1499
1636
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1500
1637
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1501
1638
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1592,6 +1729,17 @@ def model_train_execute(self: ActionInstance):
1592
1729
  model_family=model_family,
1593
1730
  action_id=action_id,
1594
1731
  )
1732
+
1733
+ if action_details["actionDetails"].get("containerId"):
1734
+ logging.info(
1735
+ "Using existing container ID for training: %s",
1736
+ action_details["actionDetails"]["containerId"],
1737
+ )
1738
+ self.docker_container = action_details["actionDetails"]["containerId"]
1739
+ cmd = "docker restart " + self.docker_container
1740
+ self.start(cmd, "train_log")
1741
+ return
1742
+
1595
1743
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1596
1744
  logging.info("cmd is: %s", cmd)
1597
1745
  self.start(cmd, "train_log")
@@ -1613,6 +1761,16 @@ def model_eval_execute(self: ActionInstance):
1613
1761
  model_family=model_family,
1614
1762
  action_id=action_id,
1615
1763
  )
1764
+ if action_details["actionDetails"].get("containerId"):
1765
+ logging.info(
1766
+ "Using existing container ID for training: %s",
1767
+ action_details["actionDetails"]["containerId"],
1768
+ )
1769
+ self.docker_container = action_details["actionDetails"]["containerId"]
1770
+ cmd = "docker restart " + self.docker_container
1771
+ self.start(cmd, "eval_log")
1772
+ return
1773
+
1616
1774
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1617
1775
  logging.info("cmd is: %s", cmd)
1618
1776
  self.start(cmd, "eval_log")
@@ -1637,6 +1795,16 @@ def model_export_execute(self: ActionInstance):
1637
1795
  model_family=model_family,
1638
1796
  action_id=action_id,
1639
1797
  )
1798
+ if action_details["actionDetails"].get("containerId"):
1799
+ logging.info(
1800
+ "Using existing container ID for training: %s",
1801
+ action_details["actionDetails"]["containerId"],
1802
+ )
1803
+ self.docker_container = action_details["actionDetails"]["containerId"]
1804
+ cmd = "docker restart " + self.docker_container
1805
+ self.start(cmd, "export_log")
1806
+ return
1807
+
1640
1808
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1641
1809
  logging.info("cmd is: %s", cmd)
1642
1810
  self.start(cmd, "export_log")
@@ -1681,6 +1849,16 @@ def streaming_gateway_execute(self: ActionInstance):
1681
1849
  self.docker_container = (
1682
1850
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1683
1851
  )
1852
+ if action_details["actionDetails"].get("containerId"):
1853
+ logging.info(
1854
+ "Using existing container ID for training: %s",
1855
+ action_details["actionDetails"]["containerId"],
1856
+ )
1857
+ self.docker_container = action_details["actionDetails"]["containerId"]
1858
+ cmd = "docker restart " + self.docker_container
1859
+ self.start(cmd, "streaming_gateway")
1860
+ return
1861
+
1684
1862
  cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1685
1863
  logging.info("cmd is: %s", cmd)
1686
1864
  self.start(cmd, "streaming_gateway")
@@ -1775,6 +1953,17 @@ def kafka_setup_execute(self: ActionInstance):
1775
1953
  else:
1776
1954
  pkgs = f"matrice_common matrice"
1777
1955
 
1956
+ if action_details["actionDetails"].get("containerId"):
1957
+ logging.info(
1958
+ "Using existing container ID for training: %s",
1959
+ action_details["actionDetails"]["containerId"],
1960
+ )
1961
+ self.docker_container = action_details["actionDetails"]["containerId"]
1962
+ cmd = "docker restart " + self.docker_container
1963
+ self.start(cmd, "kafka_setup")
1964
+ return
1965
+
1966
+
1778
1967
  # Kafka container with --net=host (Ports: 9092, 9093)
1779
1968
  cmd = (
1780
1969
  f"docker run --net=host "
@@ -1809,10 +1998,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
1809
1998
  image = self.docker_container
1810
1999
 
1811
2000
  self.setup_action_requirements(action_details)
2001
+
2002
+ if action_details["actionDetails"].get("containerId"):
2003
+ logging.info(
2004
+ "Using existing container ID for inference tracker: %s",
2005
+ action_details["actionDetails"]["containerId"],
2006
+ )
2007
+ self.docker_container = action_details["actionDetails"]["containerId"]
2008
+ cmd = "docker restart " + self.docker_container
2009
+ self.start(cmd, "inference_tracker_setup")
2010
+ return
1812
2011
 
1813
2012
  # This is the existing Docker run command
1814
2013
  worker_cmd = (
1815
2014
  f"docker run -d --pull=always --net=host "
2015
+ f"--cidfile ./{self.action_record_id}.cid "
1816
2016
  f"--name inference-tracker-worker "
1817
2017
  f"-v matrice_myvol:/matrice_data "
1818
2018
  f'-e ENV="{os.environ.get("ENV", "prod")}" '