matrice-compute 0.1.44__py3-none-any.whl → 0.1.46__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +21 -10
- matrice_compute/__init__.pyi +2056 -0
- matrice_compute/action_instance.py +21 -6
- matrice_compute/actions_manager.py +2 -1
- matrice_compute/actions_scaledown_manager.py +5 -0
- matrice_compute/instance_manager.py +26 -6
- matrice_compute/instance_utils.py +8 -8
- matrice_compute/k8s_scheduler.py +749 -0
- matrice_compute/prechecks.py +5 -6
- matrice_compute/resources_tracker.py +68 -53
- matrice_compute/scaling.py +31 -2
- matrice_compute/task_utils.py +51 -0
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/METADATA +4 -4
- matrice_compute-0.1.46.dist-info/RECORD +20 -0
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/WHEEL +1 -1
- matrice_compute-0.1.44.dist-info/RECORD +0 -18
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.44.dist-info → matrice_compute-0.1.46.dist-info}/top_level.txt +0 -0
|
@@ -21,6 +21,7 @@ from matrice_compute.scaling import (
|
|
|
21
21
|
Scaling,
|
|
22
22
|
)
|
|
23
23
|
from matrice_common.utils import log_errors
|
|
24
|
+
from typing import cast
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
class ActionInstance:
|
|
@@ -369,6 +370,8 @@ class ActionInstance:
|
|
|
369
370
|
"MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
|
|
370
371
|
"MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
|
|
371
372
|
}
|
|
373
|
+
if os.environ.get("MATRICE_BASE_URL"):
|
|
374
|
+
env_vars["MATRICE_BASE_URL"] = os.environ["MATRICE_BASE_URL"]
|
|
372
375
|
if self.get_hugging_face_token(model_key):
|
|
373
376
|
env_vars["HUGGING_FACE_ACCESS_TOKEN"] = self.get_hugging_face_token(
|
|
374
377
|
model_key
|
|
@@ -563,6 +566,7 @@ class ActionInstance:
|
|
|
563
566
|
action_id,
|
|
564
567
|
model_codebase_url,
|
|
565
568
|
model_codebase_requirements_url,
|
|
569
|
+
scaling=self.scaling,
|
|
566
570
|
)
|
|
567
571
|
|
|
568
572
|
# Setup Docker credentials
|
|
@@ -872,7 +876,8 @@ class ActionInstance:
|
|
|
872
876
|
"bg-job-scheduler",
|
|
873
877
|
"DKR_CMD",
|
|
874
878
|
"OK",
|
|
875
|
-
f"Start docker container with command:
|
|
879
|
+
f"Start docker container with command: "
|
|
880
|
+
f"{cmd.replace(cast(str, self.matrice_access_key_id), 'MATRICE_ACCESS_KEY_ID').replace(cast(str, self.matrice_secret_access_key), 'MATRICE_SECRET_ACCESS_KEY')}",
|
|
876
881
|
)
|
|
877
882
|
|
|
878
883
|
@log_errors(raise_exception=False, log_error=False)
|
|
@@ -1140,6 +1145,8 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1140
1145
|
f"-v {dbPath}:{dbPath} "
|
|
1141
1146
|
f"--name {self.action_record_id}_{self.action_type} "
|
|
1142
1147
|
f"-v /var/run/docker.sock:/var/run/docker.sock "
|
|
1148
|
+
f"-v /etc/matrice/service-config.yaml:/etc/matrice/service-config.yaml "
|
|
1149
|
+
f'-e SERVICE_CONFIG_PATH="/etc/matrice/service-config.yaml" '
|
|
1143
1150
|
f"-e ACTION_RECORD_ID={self.action_record_id} "
|
|
1144
1151
|
f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
|
|
1145
1152
|
f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
|
|
@@ -1274,9 +1281,11 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1274
1281
|
worker_cmd = (
|
|
1275
1282
|
f"docker run -d --pull=always --net=host "
|
|
1276
1283
|
f"--name {self.action_record_id}_{self.action_type} "
|
|
1284
|
+
f"-v /etc/matrice/service-config.yaml:/etc/matrice/service-config.yaml "
|
|
1277
1285
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1278
1286
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1279
1287
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1288
|
+
f'-e SERVICE_CONFIG_PATH="/etc/matrice/service-config.yaml" '
|
|
1280
1289
|
f' --restart=unless-stopped '
|
|
1281
1290
|
f"{image} "
|
|
1282
1291
|
f"./app "
|
|
@@ -1503,12 +1512,14 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1503
1512
|
# bg-redis management container with --net=host (Port: 8082)
|
|
1504
1513
|
cmd = (
|
|
1505
1514
|
f"docker run --net=host "
|
|
1506
|
-
f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
|
|
1507
|
-
f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
|
|
1508
|
-
f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
|
|
1509
|
-
f"-e MATRICE_SECRET_ACCESS_KEY={shlex.quote(self.matrice_secret_access_key)} "
|
|
1515
|
+
f"-e REDIS_URL={shlex.quote(cast(str, env_vars['REDIS_URL']))} "
|
|
1516
|
+
f"-e REDIS_PASSWORD={shlex.quote(cast(str, env_vars['REDIS_PASSWORD']))} "
|
|
1517
|
+
f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(cast(str, self.matrice_access_key_id))} "
|
|
1518
|
+
f"-e MATRICE_SECRET_ACCESS_KEY={shlex.quote(cast(str, self.matrice_secret_access_key))} "
|
|
1519
|
+
f'-e SERVICE_CONFIG_PATH="/etc/matrice/service-config.yaml" '
|
|
1510
1520
|
f"-e ENV={shlex.quote(os.environ.get('ENV', 'prod'))} "
|
|
1511
1521
|
f"-v /var/run/docker.sock:/var/run/docker.sock "
|
|
1522
|
+
f"-v /etc/matrice/service-config.yaml:/etc/matrice/service-config.yaml "
|
|
1512
1523
|
f"--shm-size=30G --pull=always "
|
|
1513
1524
|
f"{self.docker_container} "
|
|
1514
1525
|
f"{self.action_record_id} "
|
|
@@ -1869,10 +1880,12 @@ def inference_tracker_setup_execute(self: ActionInstance):
|
|
|
1869
1880
|
f"docker run -d --pull=always --net=host "
|
|
1870
1881
|
f"--name {self.action_record_id}_{self.action_type} "
|
|
1871
1882
|
f"-v matrice_myvol:/matrice_data "
|
|
1883
|
+
f"-v /etc/matrice/service-config.yaml:/etc/matrice/service-config.yaml "
|
|
1872
1884
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1873
1885
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1874
1886
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1875
1887
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1888
|
+
f'-e SERVICE_CONFIG_PATH="/etc/matrice/service-config.yaml" '
|
|
1876
1889
|
f' --restart=unless-stopped '
|
|
1877
1890
|
f"{image}"
|
|
1878
1891
|
)
|
|
@@ -1912,11 +1925,13 @@ def video_storage_setup_execute(self: ActionInstance):
|
|
|
1912
1925
|
worker_cmd = (
|
|
1913
1926
|
f"docker run -d --pull=always --net=host "
|
|
1914
1927
|
f"--name {self.action_record_id}_{self.action_type} "
|
|
1915
|
-
f"-v {media_storage_path}:/storage "
|
|
1928
|
+
f"-v {media_storage_path}:/storage "
|
|
1929
|
+
f"-v /etc/matrice/service-config.yaml:/etc/matrice/service-config.yaml "
|
|
1916
1930
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1917
1931
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1918
1932
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1919
1933
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1934
|
+
f'-e SERVICE_CONFIG_PATH="/etc/matrice/service-config.yaml" '
|
|
1920
1935
|
f' --restart=unless-stopped '
|
|
1921
1936
|
f"{image}"
|
|
1922
1937
|
)
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
5
|
import time
|
|
6
|
+
from typing import Any
|
|
6
7
|
from matrice_compute.action_instance import (
|
|
7
8
|
ActionInstance,
|
|
8
9
|
)
|
|
@@ -41,7 +42,7 @@ class ActionsManager:
|
|
|
41
42
|
Returns:
|
|
42
43
|
list: List of fetched actions
|
|
43
44
|
"""
|
|
44
|
-
actions = []
|
|
45
|
+
actions: list[Any] = []
|
|
45
46
|
logging.info("Polling backend for new jobs")
|
|
46
47
|
result = self.scaling.assign_jobs(has_gpu())
|
|
47
48
|
if result is None:
|
|
@@ -37,6 +37,11 @@ class ActionsScaleDownManager:
|
|
|
37
37
|
if down_scaled_jobs:
|
|
38
38
|
for container in containers:
|
|
39
39
|
container_id = container.id
|
|
40
|
+
if container_id is None:
|
|
41
|
+
logging.warning(
|
|
42
|
+
"Skipping container with missing id while inspecting."
|
|
43
|
+
)
|
|
44
|
+
continue
|
|
40
45
|
inspect_data = self.docker_client.api.inspect_container(container_id)
|
|
41
46
|
action_record_id = next(
|
|
42
47
|
(arg for arg in inspect_data["Args"] if len(arg) == 24),
|
|
@@ -6,6 +6,7 @@ import os
|
|
|
6
6
|
import subprocess
|
|
7
7
|
import threading
|
|
8
8
|
import time
|
|
9
|
+
from typing import Any, Optional
|
|
9
10
|
from kafka import KafkaProducer
|
|
10
11
|
from matrice_compute.actions_manager import ActionsManager
|
|
11
12
|
from matrice_compute.actions_scaledown_manager import ActionsScaleDownManager
|
|
@@ -31,6 +32,22 @@ class InstanceManager:
|
|
|
31
32
|
|
|
32
33
|
Now includes auto streaming capabilities for specified deployment IDs.
|
|
33
34
|
"""
|
|
35
|
+
# Instance attributes for type checking
|
|
36
|
+
scaling: Scaling
|
|
37
|
+
current_actions: dict[Any, Any]
|
|
38
|
+
actions_manager: ActionsManager
|
|
39
|
+
scale_down_manager: ActionsScaleDownManager
|
|
40
|
+
shutdown_manager: ShutdownManager
|
|
41
|
+
machine_resources_tracker: MachineResourcesTracker
|
|
42
|
+
actions_resources_tracker: ActionsResourcesTracker
|
|
43
|
+
kafka_resource_monitor: Optional[KafkaResourceMonitor]
|
|
44
|
+
container_resource_monitor: Optional[ContainerResourceMonitor]
|
|
45
|
+
compute_operations_handler: Optional[ComputeOperationsHandler]
|
|
46
|
+
poll_interval: int
|
|
47
|
+
container_monitor_thread: Optional[threading.Thread]
|
|
48
|
+
container_monitor_running: bool
|
|
49
|
+
container_kafka_producer: Optional[KafkaProducer]
|
|
50
|
+
encryption_key: str
|
|
34
51
|
|
|
35
52
|
def __init__(
|
|
36
53
|
self,
|
|
@@ -87,7 +104,7 @@ class InstanceManager:
|
|
|
87
104
|
logging.info("InstanceManager updated Jupyter token")
|
|
88
105
|
else:
|
|
89
106
|
logging.warning("No Jupyter token found in environment variables")
|
|
90
|
-
self.current_actions = {}
|
|
107
|
+
self.current_actions: dict[Any, Any] = {}
|
|
91
108
|
self.actions_manager = ActionsManager(self.scaling)
|
|
92
109
|
logging.info("InstanceManager initialized with actions manager")
|
|
93
110
|
self.scale_down_manager = ActionsScaleDownManager(self.scaling)
|
|
@@ -100,6 +117,7 @@ class InstanceManager:
|
|
|
100
117
|
logging.info("InstanceManager initialized with actions resources tracker")
|
|
101
118
|
|
|
102
119
|
# Initialize Kafka resource monitor using the same internal Kafka as scaling
|
|
120
|
+
self.kafka_resource_monitor = None
|
|
103
121
|
try:
|
|
104
122
|
kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
|
|
105
123
|
self.kafka_resource_monitor = KafkaResourceMonitor(
|
|
@@ -113,6 +131,7 @@ class InstanceManager:
|
|
|
113
131
|
self.kafka_resource_monitor = None
|
|
114
132
|
|
|
115
133
|
# Initialize Container resource monitor using the same internal Kafka as scaling
|
|
134
|
+
self.container_resource_monitor = None
|
|
116
135
|
try:
|
|
117
136
|
kafka_bootstrap = self.scaling.get_kafka_bootstrap_servers()
|
|
118
137
|
self.container_resource_monitor = ContainerResourceMonitor(
|
|
@@ -127,13 +146,14 @@ class InstanceManager:
|
|
|
127
146
|
|
|
128
147
|
# Initialize Compute Operations Handler for event-driven operations
|
|
129
148
|
# Uses EventListener from matrice_common for simplified Kafka consumption
|
|
149
|
+
self.compute_operations_handler = None
|
|
130
150
|
try:
|
|
131
|
-
|
|
151
|
+
instance_id_env = os.environ.get("INSTANCE_ID") or ""
|
|
132
152
|
self.compute_operations_handler = ComputeOperationsHandler(
|
|
133
153
|
actions_manager=self.actions_manager,
|
|
134
154
|
session=self.session,
|
|
135
155
|
scaling=self.scaling,
|
|
136
|
-
instance_id=
|
|
156
|
+
instance_id=instance_id_env
|
|
137
157
|
)
|
|
138
158
|
logging.info("InstanceManager initialized with Compute Operations Handler for instance ID: %s", instance_id)
|
|
139
159
|
except (ValueError, Exception) as e:
|
|
@@ -225,10 +245,10 @@ class InstanceManager:
|
|
|
225
245
|
raise Exception(
|
|
226
246
|
"SERVICE_PROVIDER and INSTANCE_ID must be set as environment variables or passed as arguments"
|
|
227
247
|
)
|
|
228
|
-
self.encryption_key = manual_instance_info["MATRICE_ENCRYPTION_KEY"]
|
|
248
|
+
self.encryption_key = str(manual_instance_info["MATRICE_ENCRYPTION_KEY"] or "")
|
|
229
249
|
|
|
230
|
-
access_key = manual_instance_info["MATRICE_ACCESS_KEY_ID"]
|
|
231
|
-
secret_key = manual_instance_info["MATRICE_SECRET_ACCESS_KEY"]
|
|
250
|
+
access_key = str(manual_instance_info["MATRICE_ACCESS_KEY_ID"] or "")
|
|
251
|
+
secret_key = str(manual_instance_info["MATRICE_SECRET_ACCESS_KEY"] or "")
|
|
232
252
|
|
|
233
253
|
if ( # Keys are not encrypted
|
|
234
254
|
self.encryption_key
|
|
@@ -15,9 +15,9 @@ from cryptography.hazmat.primitives.ciphers import (
|
|
|
15
15
|
)
|
|
16
16
|
from cryptography.hazmat.backends import default_backend
|
|
17
17
|
from matrice_common.utils import log_errors
|
|
18
|
+
from typing import Optional, Tuple
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
def get_instance_info(service_provider: str = None, instance_id: str = None) -> tuple:
|
|
20
|
+
def get_instance_info(service_provider: Optional[str] = None, instance_id: Optional[str] = None) -> tuple:
|
|
21
21
|
"""
|
|
22
22
|
Get instance provider and ID information.
|
|
23
23
|
|
|
@@ -450,7 +450,7 @@ def get_disk_space_usage() -> list:
|
|
|
450
450
|
|
|
451
451
|
|
|
452
452
|
@log_errors(default_return=None, raise_exception=False)
|
|
453
|
-
def get_max_file_system() -> str:
|
|
453
|
+
def get_max_file_system() -> Optional[str]:
|
|
454
454
|
"""
|
|
455
455
|
Get filesystem with maximum available space.
|
|
456
456
|
|
|
@@ -1060,7 +1060,7 @@ def get_decrypted_access_key_pair(
|
|
|
1060
1060
|
enc_access_key: str,
|
|
1061
1061
|
enc_secret_key: str,
|
|
1062
1062
|
encryption_key: str = "",
|
|
1063
|
-
) ->
|
|
1063
|
+
) -> Tuple[Optional[str], Optional[str]]:
|
|
1064
1064
|
"""
|
|
1065
1065
|
Get decrypted access key pair.
|
|
1066
1066
|
|
|
@@ -1072,7 +1072,7 @@ def get_decrypted_access_key_pair(
|
|
|
1072
1072
|
Returns:
|
|
1073
1073
|
tuple: (access_key, secret_key) strings
|
|
1074
1074
|
"""
|
|
1075
|
-
encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY")
|
|
1075
|
+
encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY", "")
|
|
1076
1076
|
if not encryption_key:
|
|
1077
1077
|
logging.warning("Encryption key is not set, Will assume that the keys are not encrypted")
|
|
1078
1078
|
return enc_access_key, enc_secret_key
|
|
@@ -1107,7 +1107,7 @@ def get_encrypted_access_key_pair(
|
|
|
1107
1107
|
access_key: str,
|
|
1108
1108
|
secret_key: str,
|
|
1109
1109
|
encryption_key: str = "",
|
|
1110
|
-
) ->
|
|
1110
|
+
) -> Tuple[Optional[str], Optional[str]]:
|
|
1111
1111
|
"""
|
|
1112
1112
|
Get encrypted access key pair.
|
|
1113
1113
|
|
|
@@ -1119,7 +1119,7 @@ def get_encrypted_access_key_pair(
|
|
|
1119
1119
|
Returns:
|
|
1120
1120
|
tuple: (encrypted_access_key, encrypted_secret_key) strings
|
|
1121
1121
|
"""
|
|
1122
|
-
encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY")
|
|
1122
|
+
encryption_key = encryption_key or os.environ.get("MATRICE_ENCRYPTION_KEY", "")
|
|
1123
1123
|
if not encryption_key:
|
|
1124
1124
|
logging.warning("Encryption key is not set, returning unencrypted keys")
|
|
1125
1125
|
return access_key, secret_key
|
|
@@ -1155,7 +1155,7 @@ def get_encrypted_access_key_pair(
|
|
|
1155
1155
|
|
|
1156
1156
|
return encoded_access_key, encoded_secret_key
|
|
1157
1157
|
|
|
1158
|
-
def _get_private_ip() -> str:
|
|
1158
|
+
def _get_private_ip() -> Optional[str]:
|
|
1159
1159
|
"""
|
|
1160
1160
|
Get the actual private/LAN IP address using UDP socket trick.
|
|
1161
1161
|
This works reliably even in Docker, NAT, VPN, etc.
|