matrice-compute 0.1.43__py3-none-any.whl → 0.1.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +21 -10
- matrice_compute/__init__.pyi +2056 -0
- matrice_compute/action_instance.py +22 -6
- matrice_compute/actions_manager.py +2 -1
- matrice_compute/actions_scaledown_manager.py +5 -0
- matrice_compute/instance_manager.py +26 -6
- matrice_compute/instance_utils.py +8 -8
- matrice_compute/k8s_scheduler.py +749 -0
- matrice_compute/prechecks.py +5 -6
- matrice_compute/resources_tracker.py +68 -53
- matrice_compute/scaling.py +31 -2
- matrice_compute/task_utils.py +51 -0
- {matrice_compute-0.1.43.dist-info → matrice_compute-0.1.45.dist-info}/METADATA +4 -4
- matrice_compute-0.1.45.dist-info/RECORD +20 -0
- {matrice_compute-0.1.43.dist-info → matrice_compute-0.1.45.dist-info}/WHEEL +1 -1
- matrice_compute-0.1.43.dist-info/RECORD +0 -18
- {matrice_compute-0.1.43.dist-info → matrice_compute-0.1.45.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.43.dist-info → matrice_compute-0.1.45.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,749 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Kubernetes Scheduler for bg-job-scheduler
|
|
3
|
+
|
|
4
|
+
This module runs inside a Kubernetes cluster and:
|
|
5
|
+
1. Authenticates with Matrice API using access key and secret key (via matrice_common.session)
|
|
6
|
+
2. Polls for assigned actions using /v1/actions/assign_jobs_kubernetes/{cluster_id}
|
|
7
|
+
3. Creates K8s Jobs for each action using in-cluster authentication
|
|
8
|
+
4. Monitors job status and updates action records via existing action update API
|
|
9
|
+
5. Sends heartbeat to report cluster health
|
|
10
|
+
|
|
11
|
+
The K8s scheduler flow:
|
|
12
|
+
1. Register a cluster in compute_clusters with isKubernetes: true
|
|
13
|
+
2. When user submits a job with clusterName, processClusterName in be-action:
|
|
14
|
+
- Detects the cluster is K8s (isKubernetes: true)
|
|
15
|
+
- Sets kubernetesClusterId and executionMode: "kubernetes" in actionDetails
|
|
16
|
+
3. K8s scheduler polls /v1/actions/assign_jobs_kubernetes/{cluster_id}
|
|
17
|
+
4. Scheduler creates K8s Jobs for each action
|
|
18
|
+
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import sys
|
|
23
|
+
import time
|
|
24
|
+
import logging
|
|
25
|
+
from datetime import datetime
|
|
26
|
+
from typing import Optional, Dict, List, Any
|
|
27
|
+
|
|
28
|
+
from kubernetes import client, config
|
|
29
|
+
from kubernetes.client.rest import ApiException
|
|
30
|
+
from matrice_common.session import Session
|
|
31
|
+
|
|
32
|
+
# Route all scheduler log output to stdout so container log collectors pick it up.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    handlers=[_stdout_handler],
    force=True,  # replace any handlers installed earlier by imported libraries
)
logger = logging.getLogger("k8s_scheduler")
logger.setLevel(logging.INFO)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Action type to script mapping (matches VM mode's action_instance.py)
# Each value is the command run inside the action container; the action id
# is appended as a trailing argument by _get_startup_command.
ACTION_SCRIPTS: Dict[str, str] = {
    "model_train": "python3 train.py",
    "model_eval": "python3 eval.py",
    "model_export": "python3 export.py",
    "deploy_add": "python3 deploy.py",
    # Data-pipeline actions use absolute paths baked into their images.
    "data_import": "python3 /usr/src/app/main.py",
    "data_add": "python3 /usr/src/app/main.py",
    "data_split": "python3 /usr/src/app/data_split.py",
    "data_prep": "python3 /usr/src/app/data_preparation.py",
    "dataset_annotation": "python3 /usr/src/app/dataset_annotation.py",
    "dataset_augmentation": "python3 /usr/src/app/data_augmentation.py",
    "dataset_generation": "python3 /usr/src/app/synthetic_dataset_generation.py",
    "image_build": "python3 main.py",
    "resource_clone": "python3 main.py",
    "streaming_gateway": "python3 /usr/src/app/streaming_gateway.py",
    "deploy_aggregator": "python3 /usr/src/app/deploy_aggregator.py",
}
|
|
60
|
+
|
|
61
|
+
# Extra packages needed per action type
# All dataset-pipeline actions share the same extra SDK package.
_DATASET_ACTIONS = (
    "data_import",
    "data_add",
    "data_split",
    "data_prep",
    "dataset_annotation",
    "dataset_augmentation",
    "dataset_generation",
)
ACTION_EXTRA_PACKAGES: Dict[str, List[str]] = {
    action: ["matrice_dataset"] for action in _DATASET_ACTIONS
}
ACTION_EXTRA_PACKAGES["deploy_add"] = ["matrice_inference", "matrice_analytics"]
ACTION_EXTRA_PACKAGES["streaming_gateway"] = ["matrice_streaming"]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class K8sScheduler:
    """
    Kubernetes Scheduler that polls for actions and creates K8s Jobs.
    Runs inside the cluster using in-cluster authentication.
    """

    # Action type to script mapping (matches VM mode's action_instance.py)
    # Constants moved to module level: ACTION_SCRIPTS and ACTION_EXTRA_PACKAGES

    # Running jobs keyed by action_id.
    # Maps action_id -> K8s job name; populated by create_k8s_job and pruned
    # by monitor_running_jobs once a job reaches a terminal state.
    running_jobs: Dict[str, str]
|
|
86
|
+
|
|
87
|
+
    def __init__(self):
        """Read configuration from the environment, authenticate with the
        Matrice API, initialize Kubernetes API clients, and ensure the
        Docker Hub image pull secret exists.

        Raises:
            ValueError: if CLUSTER_ID or either Matrice credential
                environment variable is missing.
        """
        # Configuration from environment
        self.cluster_id = os.environ.get("CLUSTER_ID")
        self.job_namespace = os.environ.get("JOB_NAMESPACE", "matrice-jobs")
        self.poll_interval = int(os.environ.get("POLL_INTERVAL", "10"))  # seconds between poll cycles
        self.env = os.environ.get("ENV", "prod")

        # GPU configuration - whether this cluster has GPU nodes
        is_gpu_str = os.environ.get("IS_GPU", "false").lower()
        self.is_gpu = is_gpu_str in ("true", "1", "yes")

        # Validate required config
        if not self.cluster_id:
            raise ValueError("CLUSTER_ID environment variable is required")

        # Matrice credentials (for API authentication and action execution)
        # These are also injected into every job pod in create_k8s_job.
        self.matrice_access_key = os.environ.get("MATRICE_ACCESS_KEY_ID")
        self.matrice_secret_key = os.environ.get("MATRICE_SECRET_ACCESS_KEY")

        if not self.matrice_access_key or not self.matrice_secret_key:
            raise ValueError("MATRICE_ACCESS_KEY_ID and MATRICE_SECRET_ACCESS_KEY environment variables are required")

        # Initialize Matrice session for API authentication
        self.session = Session(
            account_number="",
            access_key=self.matrice_access_key,
            secret_key=self.matrice_secret_key,
        )
        self.rpc = self.session.rpc

        # Initialize Kubernetes client (in-cluster auth).
        # Must happen before _ensure_docker_hub_secret, which uses core_v1.
        self._init_k8s_client()

        # Track running jobs
        self.running_jobs = {}  # action_id -> job_name

        # Scheduler start time
        self.start_time = datetime.now()

        # Ensure Docker Hub image pull secret exists
        self._ensure_docker_hub_secret()

        logger.info(f"K8s Scheduler initialized for cluster {self.cluster_id}")
        logger.info(f"Job namespace: {self.job_namespace}")
        logger.info(f"Poll interval: {self.poll_interval}s")
        logger.info(f"GPU mode: {self.is_gpu}")
        # NOTE(review): this logs the first 8 characters of the access key —
        # confirm that partial key exposure in logs is acceptable.
        logger.info(f"Matrice session initialized with access key: {self.matrice_access_key[:8]}...")
|
|
134
|
+
|
|
135
|
+
def _init_k8s_client(self):
|
|
136
|
+
"""Initialize Kubernetes client using in-cluster config"""
|
|
137
|
+
try:
|
|
138
|
+
# Try in-cluster config first (when running inside K8s)
|
|
139
|
+
config.load_incluster_config()
|
|
140
|
+
logger.info("Using in-cluster Kubernetes configuration")
|
|
141
|
+
except config.ConfigException:
|
|
142
|
+
# Fall back to kubeconfig for local development
|
|
143
|
+
try:
|
|
144
|
+
config.load_kube_config()
|
|
145
|
+
logger.info("Using kubeconfig for Kubernetes configuration")
|
|
146
|
+
except config.ConfigException as e:
|
|
147
|
+
logger.error(f"Failed to configure Kubernetes client: {e}")
|
|
148
|
+
raise
|
|
149
|
+
|
|
150
|
+
self.batch_v1 = client.BatchV1Api()
|
|
151
|
+
self.core_v1 = client.CoreV1Api()
|
|
152
|
+
|
|
153
|
+
def _ensure_docker_hub_secret(self):
|
|
154
|
+
"""
|
|
155
|
+
Fetch Docker Hub credentials from be-compute API and create/update
|
|
156
|
+
the image pull secret in the job namespace.
|
|
157
|
+
|
|
158
|
+
Uses the same public API endpoint as VM mode: /v1/compute/get_docker_hub_credentials
|
|
159
|
+
"""
|
|
160
|
+
try:
|
|
161
|
+
# Fetch Docker Hub credentials from be-compute API
|
|
162
|
+
# Use the same public endpoint as VM mode
|
|
163
|
+
path = "/v1/compute/get_docker_hub_credentials"
|
|
164
|
+
|
|
165
|
+
response = self.rpc.get(path=path)
|
|
166
|
+
|
|
167
|
+
if not response or not response.get("success"):
|
|
168
|
+
logger.warning("Failed to fetch Docker Hub credentials from API, jobs may fail to pull private images")
|
|
169
|
+
return
|
|
170
|
+
|
|
171
|
+
creds = response.get("data", {})
|
|
172
|
+
username = creds.get("username")
|
|
173
|
+
password = creds.get("password")
|
|
174
|
+
|
|
175
|
+
if not username or not password:
|
|
176
|
+
logger.warning("Docker Hub credentials incomplete, jobs may fail to pull private images")
|
|
177
|
+
return
|
|
178
|
+
|
|
179
|
+
# Create docker-registry secret in job namespace
|
|
180
|
+
import base64
|
|
181
|
+
import json
|
|
182
|
+
|
|
183
|
+
# Create docker config JSON
|
|
184
|
+
docker_config = {
|
|
185
|
+
"auths": {
|
|
186
|
+
"https://index.docker.io/v1/": {
|
|
187
|
+
"username": username,
|
|
188
|
+
"password": password,
|
|
189
|
+
"auth": base64.b64encode(f"{username}:{password}".encode()).decode()
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
docker_config_json = json.dumps(docker_config)
|
|
195
|
+
|
|
196
|
+
# Create secret object
|
|
197
|
+
secret = client.V1Secret(
|
|
198
|
+
api_version="v1",
|
|
199
|
+
kind="Secret",
|
|
200
|
+
metadata=client.V1ObjectMeta(
|
|
201
|
+
name="matrice-registry",
|
|
202
|
+
namespace=self.job_namespace
|
|
203
|
+
),
|
|
204
|
+
type="kubernetes.io/dockerconfigjson",
|
|
205
|
+
data={
|
|
206
|
+
".dockerconfigjson": base64.b64encode(docker_config_json.encode()).decode()
|
|
207
|
+
}
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
# Try to create or update the secret
|
|
211
|
+
try:
|
|
212
|
+
self.core_v1.create_namespaced_secret(self.job_namespace, secret)
|
|
213
|
+
logger.info(f"Created Docker Hub secret 'matrice-registry' in namespace {self.job_namespace}")
|
|
214
|
+
except ApiException as e:
|
|
215
|
+
if e.status == 409: # Already exists
|
|
216
|
+
# Update existing secret
|
|
217
|
+
self.core_v1.replace_namespaced_secret("matrice-registry", self.job_namespace, secret)
|
|
218
|
+
logger.info(f"Updated Docker Hub secret 'matrice-registry' in namespace {self.job_namespace}")
|
|
219
|
+
else:
|
|
220
|
+
raise
|
|
221
|
+
|
|
222
|
+
except Exception as e:
|
|
223
|
+
logger.error(f"Error creating Docker Hub secret: {e}")
|
|
224
|
+
logger.warning("Jobs requiring private Docker images may fail to start")
|
|
225
|
+
|
|
226
|
+
def _get_startup_command(self, action_type: str, action_id: str) -> List[str]:
|
|
227
|
+
"""
|
|
228
|
+
Build the startup command for the container.
|
|
229
|
+
|
|
230
|
+
This mirrors VM mode's get_base_docker_cmd() logic:
|
|
231
|
+
1. Install matrice SDK packages
|
|
232
|
+
2. Run the appropriate script for the action type
|
|
233
|
+
|
|
234
|
+
Args:
|
|
235
|
+
action_type: The type of action (model_train, data_import, etc.)
|
|
236
|
+
action_id: The action record ID
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
List of command arguments for the container
|
|
240
|
+
"""
|
|
241
|
+
# Determine PyPI index based on environment
|
|
242
|
+
pypi_index = (
|
|
243
|
+
"https://test.pypi.org/simple/"
|
|
244
|
+
if self.env in ["dev", "staging"]
|
|
245
|
+
else "https://pypi.org/simple/"
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
# Base packages
|
|
249
|
+
packages = ["matrice_common", "matrice"]
|
|
250
|
+
|
|
251
|
+
# Add extra packages for specific action types
|
|
252
|
+
extra_pkgs = ACTION_EXTRA_PACKAGES.get(action_type, [])
|
|
253
|
+
packages.extend(extra_pkgs)
|
|
254
|
+
|
|
255
|
+
# Build pip install command
|
|
256
|
+
if self.env == "dev":
|
|
257
|
+
packages = [f"{pkg}>=1.0.0" for pkg in packages]
|
|
258
|
+
pip_cmd = f"pip install --pre --upgrade --force-reinstall --index-url {pypi_index} {' '.join(packages)}"
|
|
259
|
+
else:
|
|
260
|
+
pip_cmd = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(packages)}"
|
|
261
|
+
|
|
262
|
+
# Get the script for this action type
|
|
263
|
+
script = ACTION_SCRIPTS.get(action_type, "python3 main.py")
|
|
264
|
+
|
|
265
|
+
# Build full command
|
|
266
|
+
# Format: pip install SDK && run script with action_id
|
|
267
|
+
full_command = f"{pip_cmd} && {script} {action_id}"
|
|
268
|
+
|
|
269
|
+
# Use /bin/bash to match VM mode behavior
|
|
270
|
+
return ["/bin/bash", "-c", full_command]
|
|
271
|
+
|
|
272
|
+
def poll_pending_actions(self) -> List[Dict[str, Any]]:
|
|
273
|
+
"""
|
|
274
|
+
Poll for actions assigned to this Kubernetes cluster.
|
|
275
|
+
|
|
276
|
+
Uses the new K8s-specific endpoint:
|
|
277
|
+
- processClusterName in be-action detects K8s clusters and sets kubernetesClusterId
|
|
278
|
+
- Scheduler calls /v1/actions/assign_jobs_kubernetes/{cluster_id} to fetch assigned actions
|
|
279
|
+
"""
|
|
280
|
+
try:
|
|
281
|
+
# Use the K8s-specific endpoint
|
|
282
|
+
path = f"/v1/actions/assign_jobs_kubernetes/{self.cluster_id}"
|
|
283
|
+
response = self.rpc.get(path=path)
|
|
284
|
+
|
|
285
|
+
if response and response.get("success"):
|
|
286
|
+
actions = response.get("data", [])
|
|
287
|
+
if actions:
|
|
288
|
+
logger.info(f"Found {len(actions)} assigned actions for cluster {self.cluster_id}")
|
|
289
|
+
return actions if actions else []
|
|
290
|
+
else:
|
|
291
|
+
error_msg = response.get("message", "Unknown error") if response else "No response"
|
|
292
|
+
logger.warning(f"Failed to poll actions: {error_msg}")
|
|
293
|
+
return []
|
|
294
|
+
|
|
295
|
+
except Exception as e:
|
|
296
|
+
logger.error(f"Error polling for pending actions: {e}")
|
|
297
|
+
return []
|
|
298
|
+
|
|
299
|
+
def update_action_status(self, action_id: str, step_code: str, status: str,
|
|
300
|
+
description: str, extra_details: Optional[Dict] = None):
|
|
301
|
+
"""
|
|
302
|
+
Update action status using the existing action update endpoint.
|
|
303
|
+
|
|
304
|
+
Uses the standard action record update API that accepts:
|
|
305
|
+
- stepCode: The step code for the action
|
|
306
|
+
- status: Status (OK, ERROR, etc.)
|
|
307
|
+
- statusDescription: Human-readable description
|
|
308
|
+
|
|
309
|
+
Extra details are merged into the action record's actionDetails.
|
|
310
|
+
"""
|
|
311
|
+
try:
|
|
312
|
+
# Use RPC client for authenticated API calls
|
|
313
|
+
path = "/v1/actions"
|
|
314
|
+
payload: Dict[str, Any] = {
|
|
315
|
+
"_id": action_id,
|
|
316
|
+
"stepCode": step_code,
|
|
317
|
+
"status": status,
|
|
318
|
+
"statusDescription": description,
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
# Merge extra details into actionDetails
|
|
322
|
+
if extra_details:
|
|
323
|
+
payload["actionDetails"] = extra_details
|
|
324
|
+
|
|
325
|
+
response = self.rpc.put(path=path, payload=payload)
|
|
326
|
+
|
|
327
|
+
if response and response.get("success"):
|
|
328
|
+
logger.debug(f"Updated action {action_id}: stepCode={step_code}, status={status}")
|
|
329
|
+
else:
|
|
330
|
+
error_msg = response.get("message", "Unknown error") if response else "No response"
|
|
331
|
+
logger.warning(f"Failed to update action {action_id}: {error_msg}")
|
|
332
|
+
|
|
333
|
+
except Exception as e:
|
|
334
|
+
logger.error(f"Error updating action status: {e}")
|
|
335
|
+
|
|
336
|
+
    def create_k8s_job(self, action: Dict[str, Any]) -> Optional[str]:
        """Create a Kubernetes Job for the given action.

        Args:
            action: Action record; configuration is read from its
                actionDetails sub-document.

        Returns:
            The created (or pre-existing) job name, or None on failure.
        """
        action_id = action.get("_id", action.get("id", ""))
        action_details = action.get("actionDetails", {})
        action_type = action.get("action", "unknown")

        # Get service ID for job naming
        service_id = action_details.get("serviceId", action_id)

        # Generate job name.
        # NOTE(review): only the first 8 chars of service_id are used, so two
        # services sharing a prefix could collide — confirm ids are unique in
        # their first 8 characters.
        job_name = f"action-{action_type}-{service_id[:8]}".lower().replace("_", "-")

        # Get configuration from action details
        docker_image = action_details.get("docker")
        if not docker_image:
            logger.error(f"No docker image specified for action {action_id}")
            self.update_action_status(
                action_id, "ERROR", "ERROR",
                "No docker image specified for action"
            )
            return None

        # NOTE(review): monitor_running_jobs always checks self.job_namespace,
        # so a job created in a custom kubernetesNamespace will be reported
        # NOT_FOUND by monitoring — confirm whether per-action namespaces are
        # actually used, or track the namespace alongside the job name.
        namespace = action_details.get("kubernetesNamespace", self.job_namespace)
        cpu_request = action_details.get("cpuRequest", "500m")
        memory_request = action_details.get("memoryRequest", "512Mi")
        cpu_limit = action_details.get("cpuLimit", "2000m")
        memory_limit = action_details.get("memoryLimit", "4Gi")
        gpu_required = action_details.get("gpuRequired", False)
        gpu_count = action_details.get("gpuCount", 1)
        gpu_resource_key = action_details.get("gpuResourceKey", "nvidia.com/gpu")
        gpu_memory_limit = action_details.get("gpuMemoryLimit", "")
        gpu_node_selector = action_details.get("gpuNodeSelector", "")
        registry_secret = action_details.get("registrySecret", "matrice-registry")

        # Build environment variables
        env_vars = [
            client.V1EnvVar(name="ENV", value=self.env),
            client.V1EnvVar(name="ACTION_ID", value=action_id),
            client.V1EnvVar(name="EXECUTION_MODE", value="kubernetes"),
            client.V1EnvVar(name="KUBERNETES_CLUSTER_ID", value=self.cluster_id),
        ]

        # Add Matrice credentials if available.
        # NOTE(review): credentials are passed as plain env vars; consider a
        # Secret-backed env source if pod specs are broadly readable.
        if self.matrice_access_key:
            env_vars.append(client.V1EnvVar(name="MATRICE_ACCESS_KEY_ID", value=self.matrice_access_key))
        if self.matrice_secret_key:
            env_vars.append(client.V1EnvVar(name="MATRICE_SECRET_ACCESS_KEY", value=self.matrice_secret_key))

        # Add custom env vars from action
        custom_env = action_details.get("envVars", {})
        for key, value in custom_env.items():
            env_vars.append(client.V1EnvVar(name=key, value=str(value)))

        # Build resource requirements
        resources = client.V1ResourceRequirements(
            requests={
                "cpu": cpu_request,
                "memory": memory_request,
            },
            limits={
                "cpu": cpu_limit,
                "memory": memory_limit,
            }
        )

        # Add GPU resources if required (request == limit, as K8s requires
        # for extended resources)
        if gpu_required:
            resources.requests[gpu_resource_key] = str(gpu_count)
            resources.limits[gpu_resource_key] = str(gpu_count)
            if gpu_memory_limit:
                resources.limits["nvidia.com/gpu-memory"] = gpu_memory_limit

        # Build container with args only (don't override command/entrypoint from Dockerfile)
        # The action images have their own ENTRYPOINT (e.g., "./main" for Go images)
        # and expect the service_id as an argument
        container = client.V1Container(
            name="action-worker",
            image=docker_image,
            image_pull_policy="Always",
            args=[service_id],  # Pass service_id as argument to the container entrypoint
            env=env_vars,
            resources=resources,
        )

        # Build pod spec
        pod_spec = client.V1PodSpec(
            restart_policy="Never",
            containers=[container],
            # Add tolerations for control-plane taint (common in single-node clusters)
            tolerations=[
                client.V1Toleration(
                    key="node-role.kubernetes.io/control-plane",
                    operator="Exists",
                    effect="NoSchedule"
                )
            ]
        )

        # Add image pull secret if specified
        if registry_secret:
            pod_spec.image_pull_secrets = [
                client.V1LocalObjectReference(name=registry_secret)
            ]

        # Add node selector for GPU
        if gpu_required and gpu_node_selector:
            pod_spec.node_selector = {gpu_node_selector: "true"}

        # Build job spec
        job_spec = client.V1JobSpec(
            backoff_limit=2,
            ttl_seconds_after_finished=3600,  # Clean up after 1 hour
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(
                    labels={
                        "app": "matrice-action",
                        "action-id": action_id,
                        "action-type": action_type,
                    }
                ),
                spec=pod_spec,
            ),
        )

        # Build job
        job = client.V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=client.V1ObjectMeta(
                name=job_name,
                namespace=namespace,
                labels={
                    "app": "matrice-action",
                    "action-id": action_id,
                    "action-type": action_type,
                    "managed-by": "matrice-scheduler",
                }
            ),
            spec=job_spec,
        )

        # Ensure namespace exists
        self._ensure_namespace(namespace)

        # Create the job
        try:
            self.batch_v1.create_namespaced_job(namespace=namespace, body=job)
            logger.info(f"Created K8s job {job_name} for action {action_id}")

            # Update action status
            self.update_action_status(
                action_id,
                "K8S_JOB_CREATED",
                "OK",
                f"Kubernetes job {job_name} created",
                {
                    "kubernetesJobName": job_name,
                    "kubernetesNamespace": namespace,
                    "jobCreatedAt": datetime.now().isoformat(),
                }
            )

            # Track the job
            self.running_jobs[action_id] = job_name

            return job_name

        except ApiException as e:
            if e.status == 409:  # Already exists
                # Adopt the existing job rather than failing the action.
                logger.warning(f"Job {job_name} already exists")
                self.running_jobs[action_id] = job_name
                return job_name
            else:
                logger.error(f"Failed to create K8s job: {e}")
                self.update_action_status(
                    action_id, "ERROR", "ERROR",
                    f"Failed to create Kubernetes job: {e.reason}"
                )
                return None
|
|
515
|
+
|
|
516
|
+
def _ensure_namespace(self, namespace: str):
|
|
517
|
+
"""Ensure the namespace exists"""
|
|
518
|
+
try:
|
|
519
|
+
self.core_v1.read_namespace(namespace)
|
|
520
|
+
except ApiException as e:
|
|
521
|
+
if e.status == 404:
|
|
522
|
+
# Create namespace
|
|
523
|
+
ns = client.V1Namespace(
|
|
524
|
+
metadata=client.V1ObjectMeta(
|
|
525
|
+
name=namespace,
|
|
526
|
+
labels={"managed-by": "matrice-scheduler"}
|
|
527
|
+
)
|
|
528
|
+
)
|
|
529
|
+
try:
|
|
530
|
+
self.core_v1.create_namespace(ns)
|
|
531
|
+
logger.info(f"Created namespace {namespace}")
|
|
532
|
+
except ApiException as create_err:
|
|
533
|
+
if create_err.status != 409: # Not already exists
|
|
534
|
+
raise
|
|
535
|
+
|
|
536
|
+
def check_job_status(self, action_id: str, job_name: str, namespace: str) -> Optional[str]:
|
|
537
|
+
"""Check the status of a K8s job and return status if completed. Also monitors resource usage."""
|
|
538
|
+
try:
|
|
539
|
+
job = self.batch_v1.read_namespaced_job(job_name, namespace)
|
|
540
|
+
|
|
541
|
+
# Get pod info for detailed logging and resource monitoring
|
|
542
|
+
pod_selector = f"job-name={job_name}"
|
|
543
|
+
pods = self.core_v1.list_namespaced_pod(
|
|
544
|
+
namespace=namespace,
|
|
545
|
+
label_selector=pod_selector
|
|
546
|
+
)
|
|
547
|
+
|
|
548
|
+
# Log detailed status
|
|
549
|
+
if pods.items:
|
|
550
|
+
pod = pods.items[0]
|
|
551
|
+
pod_name = pod.metadata.name
|
|
552
|
+
pod_phase = pod.status.phase
|
|
553
|
+
|
|
554
|
+
logger.info(
|
|
555
|
+
f"Job status - Action: {action_id}, Job: {job_name}, "
|
|
556
|
+
f"Pod: {pod_name}, Phase: {pod_phase}, "
|
|
557
|
+
f"Active: {job.status.active}, Succeeded: {job.status.succeeded}, Failed: {job.status.failed}"
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
# Get pod resource usage metrics if available
|
|
561
|
+
try:
|
|
562
|
+
resource_info = self._get_pod_resource_usage(namespace, pod_name)
|
|
563
|
+
if resource_info:
|
|
564
|
+
logger.info(f"Resource usage for {pod_name}: {resource_info}")
|
|
565
|
+
|
|
566
|
+
# Update action with resource info
|
|
567
|
+
self.update_action_status(
|
|
568
|
+
action_id,
|
|
569
|
+
"K8S_JOB_RUNNING",
|
|
570
|
+
"OK",
|
|
571
|
+
f"Job running - Pod: {pod_name}, Phase: {pod_phase}",
|
|
572
|
+
{
|
|
573
|
+
"podName": pod_name,
|
|
574
|
+
"podPhase": pod_phase,
|
|
575
|
+
"resourceUsage": resource_info
|
|
576
|
+
}
|
|
577
|
+
)
|
|
578
|
+
except Exception as e:
|
|
579
|
+
logger.debug(f"Could not fetch resource metrics: {e}")
|
|
580
|
+
|
|
581
|
+
if job.status.succeeded:
|
|
582
|
+
return "COMPLETED"
|
|
583
|
+
elif job.status.failed:
|
|
584
|
+
return "FAILED"
|
|
585
|
+
elif job.status.active:
|
|
586
|
+
return "RUNNING"
|
|
587
|
+
else:
|
|
588
|
+
return "PENDING"
|
|
589
|
+
|
|
590
|
+
except ApiException as e:
|
|
591
|
+
if e.status == 404:
|
|
592
|
+
return "NOT_FOUND"
|
|
593
|
+
logger.error(f"Error checking job status for {job_name}: {e}")
|
|
594
|
+
return None
|
|
595
|
+
|
|
596
|
+
def _get_pod_resource_usage(self, namespace: str, pod_name: str) -> Optional[Dict[str, Dict[str, str]]]:
|
|
597
|
+
"""Get current resource usage for a pod"""
|
|
598
|
+
try:
|
|
599
|
+
# Read pod to get resource requests/limits
|
|
600
|
+
pod = self.core_v1.read_namespaced_pod(pod_name, namespace)
|
|
601
|
+
|
|
602
|
+
resource_info: Dict[str, Dict[str, str]] = {
|
|
603
|
+
"requests": {},
|
|
604
|
+
"limits": {}
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
for container in pod.spec.containers:
|
|
608
|
+
if container.resources:
|
|
609
|
+
if container.resources.requests:
|
|
610
|
+
resource_info["requests"] = {
|
|
611
|
+
"cpu": str(container.resources.requests.get("cpu", "unknown") if isinstance(container.resources.requests, dict) else "unknown"),
|
|
612
|
+
"memory": str(container.resources.requests.get("memory", "unknown") if isinstance(container.resources.requests, dict) else "unknown"),
|
|
613
|
+
"gpu": str(container.resources.requests.get("nvidia.com/gpu", "0") if isinstance(container.resources.requests, dict) else "0")
|
|
614
|
+
}
|
|
615
|
+
if container.resources.limits:
|
|
616
|
+
resource_info["limits"] = {
|
|
617
|
+
"cpu": str(container.resources.limits.get("cpu", "unknown") if isinstance(container.resources.limits, dict) else "unknown"),
|
|
618
|
+
"memory": str(container.resources.limits.get("memory", "unknown") if isinstance(container.resources.limits, dict) else "unknown"),
|
|
619
|
+
"gpu": str(container.resources.limits.get("nvidia.com/gpu", "0") if isinstance(container.resources.limits, dict) else "0"),
|
|
620
|
+
"gpuMemory": str(container.resources.limits.get("nvidia.com/gpu-memory", "") if isinstance(container.resources.limits, dict) else "")
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
return resource_info
|
|
624
|
+
except Exception as e:
|
|
625
|
+
logger.debug(f"Error getting pod resource usage: {e}")
|
|
626
|
+
return None
|
|
627
|
+
|
|
628
|
+
def monitor_running_jobs(self):
|
|
629
|
+
"""Monitor running jobs and update action statuses"""
|
|
630
|
+
for action_id, job_name in list(self.running_jobs.items()):
|
|
631
|
+
namespace = self.job_namespace
|
|
632
|
+
status = self.check_job_status(action_id, job_name, namespace)
|
|
633
|
+
|
|
634
|
+
if status == "COMPLETED":
|
|
635
|
+
logger.info(f"Job {job_name} completed successfully")
|
|
636
|
+
self.update_action_status(
|
|
637
|
+
action_id, "COMPLETED", "OK",
|
|
638
|
+
"Kubernetes job completed successfully"
|
|
639
|
+
)
|
|
640
|
+
del self.running_jobs[action_id]
|
|
641
|
+
|
|
642
|
+
elif status == "FAILED":
|
|
643
|
+
logger.warning(f"Job {job_name} failed")
|
|
644
|
+
self.update_action_status(
|
|
645
|
+
action_id, "ERROR", "ERROR",
|
|
646
|
+
"Kubernetes job failed"
|
|
647
|
+
)
|
|
648
|
+
del self.running_jobs[action_id]
|
|
649
|
+
|
|
650
|
+
elif status == "NOT_FOUND":
|
|
651
|
+
logger.warning(f"Job {job_name} not found, removing from tracking")
|
|
652
|
+
del self.running_jobs[action_id]
|
|
653
|
+
|
|
654
|
+
    def send_heartbeat(self):
        """Send heartbeat to Matrice API with cluster health info.

        Currently log-only: it gathers node readiness, GPU allocatable
        totals, and job counts, then logs them (see TODO below).
        """
        try:
            # Get cluster health info
            nodes_ready = 0
            nodes_total = 0
            # NOTE(review): gpus_available is never incremented anywhere
            # below, so the status line always reports 0 available GPUs —
            # confirm whether available GPUs should be computed (e.g.
            # allocatable minus requested) or the field dropped.
            gpus_available = 0
            gpus_total = 0

            try:
                nodes = self.core_v1.list_node()
                for node in nodes.items:
                    nodes_total += 1
                    for condition in node.status.conditions:
                        if condition.type == "Ready" and condition.status == "True":
                            nodes_ready += 1

                    # Check for GPU resources: any allocatable key containing
                    # "gpu" (e.g. nvidia.com/gpu) counts toward the total.
                    allocatable = node.status.allocatable or {}
                    for key, value in allocatable.items():
                        if "gpu" in key.lower():
                            gpus_total += int(value)
            except Exception as e:
                logger.warning(f"Error getting node info: {e}")

            # Get job counts (only in the scheduler's own job namespace)
            running_jobs = 0
            pending_jobs = 0
            try:
                jobs = self.batch_v1.list_namespaced_job(self.job_namespace)
                for job in jobs.items:
                    if job.status.active:
                        running_jobs += 1
                    elif not job.status.succeeded and not job.status.failed:
                        pending_jobs += 1
            except Exception as e:
                logger.warning(f"Error getting job info: {e}")

            # Send heartbeat - for now just log the status
            # TODO: Add K8s cluster heartbeat endpoint if needed
            logger.debug(f"Cluster {self.cluster_id} status: nodes={nodes_ready}/{nodes_total}, "
                         f"gpus={gpus_available}/{gpus_total}, jobs={running_jobs} running, {pending_jobs} pending")

        except Exception as e:
            logger.error(f"Error sending heartbeat: {e}")
|
|
699
|
+
|
|
700
|
+
def start(self):
|
|
701
|
+
"""Main scheduler loop - matches InstanceManager.start() pattern"""
|
|
702
|
+
logger.info(f"Starting K8s Scheduler for cluster {self.cluster_id}")
|
|
703
|
+
|
|
704
|
+
heartbeat_counter = 0
|
|
705
|
+
heartbeat_interval = 6 # Send heartbeat every 6 poll cycles (60s if poll_interval=10)
|
|
706
|
+
|
|
707
|
+
while True:
|
|
708
|
+
try:
|
|
709
|
+
# Poll for pending actions
|
|
710
|
+
logger.info(f"Polling for pending actions for cluster {self.cluster_id} (Running jobs: {len(self.running_jobs)})")
|
|
711
|
+
pending_actions = self.poll_pending_actions()
|
|
712
|
+
|
|
713
|
+
if pending_actions:
|
|
714
|
+
logger.info(f"Found {len(pending_actions)} pending action(s)")
|
|
715
|
+
|
|
716
|
+
# Create jobs for pending actions
|
|
717
|
+
for action in pending_actions:
|
|
718
|
+
action_id = action.get("_id", action.get("id", ""))
|
|
719
|
+
action_type = action.get("action", "unknown")
|
|
720
|
+
if action_id not in self.running_jobs:
|
|
721
|
+
logger.info(f"Creating job for action {action_id} (type: {action_type})")
|
|
722
|
+
self.create_k8s_job(action)
|
|
723
|
+
else:
|
|
724
|
+
logger.debug(f"Skipping action {action_id} - already running")
|
|
725
|
+
|
|
726
|
+
# Monitor running jobs
|
|
727
|
+
if self.running_jobs:
|
|
728
|
+
logger.info(f"Monitoring {len(self.running_jobs)} running job(s)")
|
|
729
|
+
self.monitor_running_jobs()
|
|
730
|
+
|
|
731
|
+
# Send heartbeat periodically
|
|
732
|
+
heartbeat_counter += 1
|
|
733
|
+
if heartbeat_counter >= heartbeat_interval:
|
|
734
|
+
logger.info("Sending heartbeat to Matrice API")
|
|
735
|
+
self.send_heartbeat()
|
|
736
|
+
heartbeat_counter = 0
|
|
737
|
+
|
|
738
|
+
# Log summary
|
|
739
|
+
logger.info(f"Cycle complete - Running: {len(self.running_jobs)}, Pending: {len(pending_actions)}")
|
|
740
|
+
|
|
741
|
+
# Wait before next poll
|
|
742
|
+
time.sleep(self.poll_interval)
|
|
743
|
+
|
|
744
|
+
except KeyboardInterrupt:
|
|
745
|
+
logger.info("Scheduler stopped by user")
|
|
746
|
+
break
|
|
747
|
+
except Exception as e:
|
|
748
|
+
logger.error(f"Error in scheduler loop: {e}", exc_info=True)
|
|
749
|
+
time.sleep(self.poll_interval)
|