matrice-compute 0.1.44__py3-none-any.whl → 0.1.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,749 @@
1
+ """
2
+ Kubernetes Scheduler for bg-job-scheduler
3
+
4
+ This module runs inside a Kubernetes cluster and:
5
+ 1. Authenticates with Matrice API using access key and secret key (via matrice_common.session)
6
+ 2. Polls for assigned actions using /v1/actions/assign_jobs_kubernetes/{cluster_id}
7
+ 3. Creates K8s Jobs for each action using in-cluster authentication
8
+ 4. Monitors job status and updates action records via existing action update API
9
+ 5. Sends heartbeat to report cluster health
10
+
11
+ The K8s scheduler flow:
12
+ 1. Register a cluster in compute_clusters with isKubernetes: true
13
+ 2. When user submits a job with clusterName, processClusterName in be-action:
14
+ - Detects the cluster is K8s (isKubernetes: true)
15
+ - Sets kubernetesClusterId and executionMode: "kubernetes" in actionDetails
16
+ 3. K8s scheduler polls /v1/actions/assign_jobs_kubernetes/{cluster_id}
17
+ 4. Scheduler creates K8s Jobs for each action
18
+
19
+ """
20
+
21
+ import os
22
+ import sys
23
+ import time
24
+ import logging
25
+ from datetime import datetime
26
+ from typing import Optional, Dict, List, Any
27
+
28
+ from kubernetes import client, config
29
+ from kubernetes.client.rest import ApiException
30
+ from matrice_common.session import Session
31
+
32
# Configure process-wide logging once at import time: timestamped lines go
# to stdout where the container runtime collects them.  force=True replaces
# any handlers a library may have installed before this module loaded.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)

# Module-level logger for the scheduler; pinned to INFO independently of
# any later changes to the root logger's level.
logger = logging.getLogger("k8s_scheduler")
logger.setLevel(logging.INFO)
40
+
41
+
42
# Action type to script mapping (matches VM mode's action_instance.py).
# The mapped command is run inside the action container after SDK install;
# unknown action types fall back to "python3 main.py" (see
# K8sScheduler._get_startup_command).
ACTION_SCRIPTS: Dict[str, str] = {
    # Model lifecycle actions (scripts live in the image's workdir)
    "model_train": "python3 train.py",
    "model_eval": "python3 eval.py",
    "model_export": "python3 export.py",
    "deploy_add": "python3 deploy.py",
    # Dataset actions (scripts live under /usr/src/app)
    "data_import": "python3 /usr/src/app/main.py",
    "data_add": "python3 /usr/src/app/main.py",
    "data_split": "python3 /usr/src/app/data_split.py",
    "data_prep": "python3 /usr/src/app/data_preparation.py",
    "dataset_annotation": "python3 /usr/src/app/dataset_annotation.py",
    "dataset_augmentation": "python3 /usr/src/app/data_augmentation.py",
    "dataset_generation": "python3 /usr/src/app/synthetic_dataset_generation.py",
    # Misc actions
    "image_build": "python3 main.py",
    "resource_clone": "python3 main.py",
    "streaming_gateway": "python3 /usr/src/app/streaming_gateway.py",
    "deploy_aggregator": "python3 /usr/src/app/deploy_aggregator.py",
}

# Extra SDK packages installed per action type, on top of the base
# "matrice_common" + "matrice" pair (see _get_startup_command).
ACTION_EXTRA_PACKAGES: Dict[str, List[str]] = {
    "data_import": ["matrice_dataset"],
    "data_add": ["matrice_dataset"],
    "data_split": ["matrice_dataset"],
    "data_prep": ["matrice_dataset"],
    "dataset_annotation": ["matrice_dataset"],
    "dataset_augmentation": ["matrice_dataset"],
    "dataset_generation": ["matrice_dataset"],
    "deploy_add": ["matrice_inference", "matrice_analytics"],
    "streaming_gateway": ["matrice_streaming"],
}
73
+
74
+
75
class K8sScheduler:
    """
    Kubernetes Scheduler that polls for actions and creates K8s Jobs.
    Runs inside the cluster using in-cluster authentication.
    """

    # Action-type constants live at module level: ACTION_SCRIPTS and
    # ACTION_EXTRA_PACKAGES.

    # Jobs this scheduler has launched and is still tracking,
    # keyed by action_id -> K8s Job name.
    running_jobs: Dict[str, str]
86
+
87
+ def __init__(self):
88
+ # Configuration from environment
89
+ self.cluster_id = os.environ.get("CLUSTER_ID")
90
+ self.job_namespace = os.environ.get("JOB_NAMESPACE", "matrice-jobs")
91
+ self.poll_interval = int(os.environ.get("POLL_INTERVAL", "10"))
92
+ self.env = os.environ.get("ENV", "prod")
93
+
94
+ # GPU configuration - whether this cluster has GPU nodes
95
+ is_gpu_str = os.environ.get("IS_GPU", "false").lower()
96
+ self.is_gpu = is_gpu_str in ("true", "1", "yes")
97
+
98
+ # Validate required config
99
+ if not self.cluster_id:
100
+ raise ValueError("CLUSTER_ID environment variable is required")
101
+
102
+ # Matrice credentials (for API authentication and action execution)
103
+ self.matrice_access_key = os.environ.get("MATRICE_ACCESS_KEY_ID")
104
+ self.matrice_secret_key = os.environ.get("MATRICE_SECRET_ACCESS_KEY")
105
+
106
+ if not self.matrice_access_key or not self.matrice_secret_key:
107
+ raise ValueError("MATRICE_ACCESS_KEY_ID and MATRICE_SECRET_ACCESS_KEY environment variables are required")
108
+
109
+ # Initialize Matrice session for API authentication
110
+ self.session = Session(
111
+ account_number="",
112
+ access_key=self.matrice_access_key,
113
+ secret_key=self.matrice_secret_key,
114
+ )
115
+ self.rpc = self.session.rpc
116
+
117
+ # Initialize Kubernetes client (in-cluster auth)
118
+ self._init_k8s_client()
119
+
120
+ # Track running jobs
121
+ self.running_jobs = {} # action_id -> job_name
122
+
123
+ # Scheduler start time
124
+ self.start_time = datetime.now()
125
+
126
+ # Ensure Docker Hub image pull secret exists
127
+ self._ensure_docker_hub_secret()
128
+
129
+ logger.info(f"K8s Scheduler initialized for cluster {self.cluster_id}")
130
+ logger.info(f"Job namespace: {self.job_namespace}")
131
+ logger.info(f"Poll interval: {self.poll_interval}s")
132
+ logger.info(f"GPU mode: {self.is_gpu}")
133
+ logger.info(f"Matrice session initialized with access key: {self.matrice_access_key[:8]}...")
134
+
135
+ def _init_k8s_client(self):
136
+ """Initialize Kubernetes client using in-cluster config"""
137
+ try:
138
+ # Try in-cluster config first (when running inside K8s)
139
+ config.load_incluster_config()
140
+ logger.info("Using in-cluster Kubernetes configuration")
141
+ except config.ConfigException:
142
+ # Fall back to kubeconfig for local development
143
+ try:
144
+ config.load_kube_config()
145
+ logger.info("Using kubeconfig for Kubernetes configuration")
146
+ except config.ConfigException as e:
147
+ logger.error(f"Failed to configure Kubernetes client: {e}")
148
+ raise
149
+
150
+ self.batch_v1 = client.BatchV1Api()
151
+ self.core_v1 = client.CoreV1Api()
152
+
153
+ def _ensure_docker_hub_secret(self):
154
+ """
155
+ Fetch Docker Hub credentials from be-compute API and create/update
156
+ the image pull secret in the job namespace.
157
+
158
+ Uses the same public API endpoint as VM mode: /v1/compute/get_docker_hub_credentials
159
+ """
160
+ try:
161
+ # Fetch Docker Hub credentials from be-compute API
162
+ # Use the same public endpoint as VM mode
163
+ path = "/v1/compute/get_docker_hub_credentials"
164
+
165
+ response = self.rpc.get(path=path)
166
+
167
+ if not response or not response.get("success"):
168
+ logger.warning("Failed to fetch Docker Hub credentials from API, jobs may fail to pull private images")
169
+ return
170
+
171
+ creds = response.get("data", {})
172
+ username = creds.get("username")
173
+ password = creds.get("password")
174
+
175
+ if not username or not password:
176
+ logger.warning("Docker Hub credentials incomplete, jobs may fail to pull private images")
177
+ return
178
+
179
+ # Create docker-registry secret in job namespace
180
+ import base64
181
+ import json
182
+
183
+ # Create docker config JSON
184
+ docker_config = {
185
+ "auths": {
186
+ "https://index.docker.io/v1/": {
187
+ "username": username,
188
+ "password": password,
189
+ "auth": base64.b64encode(f"{username}:{password}".encode()).decode()
190
+ }
191
+ }
192
+ }
193
+
194
+ docker_config_json = json.dumps(docker_config)
195
+
196
+ # Create secret object
197
+ secret = client.V1Secret(
198
+ api_version="v1",
199
+ kind="Secret",
200
+ metadata=client.V1ObjectMeta(
201
+ name="matrice-registry",
202
+ namespace=self.job_namespace
203
+ ),
204
+ type="kubernetes.io/dockerconfigjson",
205
+ data={
206
+ ".dockerconfigjson": base64.b64encode(docker_config_json.encode()).decode()
207
+ }
208
+ )
209
+
210
+ # Try to create or update the secret
211
+ try:
212
+ self.core_v1.create_namespaced_secret(self.job_namespace, secret)
213
+ logger.info(f"Created Docker Hub secret 'matrice-registry' in namespace {self.job_namespace}")
214
+ except ApiException as e:
215
+ if e.status == 409: # Already exists
216
+ # Update existing secret
217
+ self.core_v1.replace_namespaced_secret("matrice-registry", self.job_namespace, secret)
218
+ logger.info(f"Updated Docker Hub secret 'matrice-registry' in namespace {self.job_namespace}")
219
+ else:
220
+ raise
221
+
222
+ except Exception as e:
223
+ logger.error(f"Error creating Docker Hub secret: {e}")
224
+ logger.warning("Jobs requiring private Docker images may fail to start")
225
+
226
+ def _get_startup_command(self, action_type: str, action_id: str) -> List[str]:
227
+ """
228
+ Build the startup command for the container.
229
+
230
+ This mirrors VM mode's get_base_docker_cmd() logic:
231
+ 1. Install matrice SDK packages
232
+ 2. Run the appropriate script for the action type
233
+
234
+ Args:
235
+ action_type: The type of action (model_train, data_import, etc.)
236
+ action_id: The action record ID
237
+
238
+ Returns:
239
+ List of command arguments for the container
240
+ """
241
+ # Determine PyPI index based on environment
242
+ pypi_index = (
243
+ "https://test.pypi.org/simple/"
244
+ if self.env in ["dev", "staging"]
245
+ else "https://pypi.org/simple/"
246
+ )
247
+
248
+ # Base packages
249
+ packages = ["matrice_common", "matrice"]
250
+
251
+ # Add extra packages for specific action types
252
+ extra_pkgs = ACTION_EXTRA_PACKAGES.get(action_type, [])
253
+ packages.extend(extra_pkgs)
254
+
255
+ # Build pip install command
256
+ if self.env == "dev":
257
+ packages = [f"{pkg}>=1.0.0" for pkg in packages]
258
+ pip_cmd = f"pip install --pre --upgrade --force-reinstall --index-url {pypi_index} {' '.join(packages)}"
259
+ else:
260
+ pip_cmd = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(packages)}"
261
+
262
+ # Get the script for this action type
263
+ script = ACTION_SCRIPTS.get(action_type, "python3 main.py")
264
+
265
+ # Build full command
266
+ # Format: pip install SDK && run script with action_id
267
+ full_command = f"{pip_cmd} && {script} {action_id}"
268
+
269
+ # Use /bin/bash to match VM mode behavior
270
+ return ["/bin/bash", "-c", full_command]
271
+
272
+ def poll_pending_actions(self) -> List[Dict[str, Any]]:
273
+ """
274
+ Poll for actions assigned to this Kubernetes cluster.
275
+
276
+ Uses the new K8s-specific endpoint:
277
+ - processClusterName in be-action detects K8s clusters and sets kubernetesClusterId
278
+ - Scheduler calls /v1/actions/assign_jobs_kubernetes/{cluster_id} to fetch assigned actions
279
+ """
280
+ try:
281
+ # Use the K8s-specific endpoint
282
+ path = f"/v1/actions/assign_jobs_kubernetes/{self.cluster_id}"
283
+ response = self.rpc.get(path=path)
284
+
285
+ if response and response.get("success"):
286
+ actions = response.get("data", [])
287
+ if actions:
288
+ logger.info(f"Found {len(actions)} assigned actions for cluster {self.cluster_id}")
289
+ return actions if actions else []
290
+ else:
291
+ error_msg = response.get("message", "Unknown error") if response else "No response"
292
+ logger.warning(f"Failed to poll actions: {error_msg}")
293
+ return []
294
+
295
+ except Exception as e:
296
+ logger.error(f"Error polling for pending actions: {e}")
297
+ return []
298
+
299
+ def update_action_status(self, action_id: str, step_code: str, status: str,
300
+ description: str, extra_details: Optional[Dict] = None):
301
+ """
302
+ Update action status using the existing action update endpoint.
303
+
304
+ Uses the standard action record update API that accepts:
305
+ - stepCode: The step code for the action
306
+ - status: Status (OK, ERROR, etc.)
307
+ - statusDescription: Human-readable description
308
+
309
+ Extra details are merged into the action record's actionDetails.
310
+ """
311
+ try:
312
+ # Use RPC client for authenticated API calls
313
+ path = "/v1/actions"
314
+ payload: Dict[str, Any] = {
315
+ "_id": action_id,
316
+ "stepCode": step_code,
317
+ "status": status,
318
+ "statusDescription": description,
319
+ }
320
+
321
+ # Merge extra details into actionDetails
322
+ if extra_details:
323
+ payload["actionDetails"] = extra_details
324
+
325
+ response = self.rpc.put(path=path, payload=payload)
326
+
327
+ if response and response.get("success"):
328
+ logger.debug(f"Updated action {action_id}: stepCode={step_code}, status={status}")
329
+ else:
330
+ error_msg = response.get("message", "Unknown error") if response else "No response"
331
+ logger.warning(f"Failed to update action {action_id}: {error_msg}")
332
+
333
+ except Exception as e:
334
+ logger.error(f"Error updating action status: {e}")
335
+
336
+ def create_k8s_job(self, action: Dict[str, Any]) -> Optional[str]:
337
+ """Create a Kubernetes Job for the given action"""
338
+ action_id = action.get("_id", action.get("id", ""))
339
+ action_details = action.get("actionDetails", {})
340
+ action_type = action.get("action", "unknown")
341
+
342
+ # Get service ID for job naming
343
+ service_id = action_details.get("serviceId", action_id)
344
+
345
+ # Generate job name
346
+ job_name = f"action-{action_type}-{service_id[:8]}".lower().replace("_", "-")
347
+
348
+ # Get configuration from action details
349
+ docker_image = action_details.get("docker")
350
+ if not docker_image:
351
+ logger.error(f"No docker image specified for action {action_id}")
352
+ self.update_action_status(
353
+ action_id, "ERROR", "ERROR",
354
+ "No docker image specified for action"
355
+ )
356
+ return None
357
+
358
+ namespace = action_details.get("kubernetesNamespace", self.job_namespace)
359
+ cpu_request = action_details.get("cpuRequest", "500m")
360
+ memory_request = action_details.get("memoryRequest", "512Mi")
361
+ cpu_limit = action_details.get("cpuLimit", "2000m")
362
+ memory_limit = action_details.get("memoryLimit", "4Gi")
363
+ gpu_required = action_details.get("gpuRequired", False)
364
+ gpu_count = action_details.get("gpuCount", 1)
365
+ gpu_resource_key = action_details.get("gpuResourceKey", "nvidia.com/gpu")
366
+ gpu_memory_limit = action_details.get("gpuMemoryLimit", "")
367
+ gpu_node_selector = action_details.get("gpuNodeSelector", "")
368
+ registry_secret = action_details.get("registrySecret", "matrice-registry")
369
+
370
+ # Build environment variables
371
+ env_vars = [
372
+ client.V1EnvVar(name="ENV", value=self.env),
373
+ client.V1EnvVar(name="ACTION_ID", value=action_id),
374
+ client.V1EnvVar(name="EXECUTION_MODE", value="kubernetes"),
375
+ client.V1EnvVar(name="KUBERNETES_CLUSTER_ID", value=self.cluster_id),
376
+ ]
377
+
378
+ # Add Matrice credentials if available
379
+ if self.matrice_access_key:
380
+ env_vars.append(client.V1EnvVar(name="MATRICE_ACCESS_KEY_ID", value=self.matrice_access_key))
381
+ if self.matrice_secret_key:
382
+ env_vars.append(client.V1EnvVar(name="MATRICE_SECRET_ACCESS_KEY", value=self.matrice_secret_key))
383
+
384
+ # Add custom env vars from action
385
+ custom_env = action_details.get("envVars", {})
386
+ for key, value in custom_env.items():
387
+ env_vars.append(client.V1EnvVar(name=key, value=str(value)))
388
+
389
+ # Build resource requirements
390
+ resources = client.V1ResourceRequirements(
391
+ requests={
392
+ "cpu": cpu_request,
393
+ "memory": memory_request,
394
+ },
395
+ limits={
396
+ "cpu": cpu_limit,
397
+ "memory": memory_limit,
398
+ }
399
+ )
400
+
401
+ # Add GPU resources if required
402
+ if gpu_required:
403
+ resources.requests[gpu_resource_key] = str(gpu_count)
404
+ resources.limits[gpu_resource_key] = str(gpu_count)
405
+ if gpu_memory_limit:
406
+ resources.limits["nvidia.com/gpu-memory"] = gpu_memory_limit
407
+
408
+ # Build container with args only (don't override command/entrypoint from Dockerfile)
409
+ # The action images have their own ENTRYPOINT (e.g., "./main" for Go images)
410
+ # and expect the service_id as an argument
411
+ container = client.V1Container(
412
+ name="action-worker",
413
+ image=docker_image,
414
+ image_pull_policy="Always",
415
+ args=[service_id], # Pass service_id as argument to the container entrypoint
416
+ env=env_vars,
417
+ resources=resources,
418
+ )
419
+
420
+ # Build pod spec
421
+ pod_spec = client.V1PodSpec(
422
+ restart_policy="Never",
423
+ containers=[container],
424
+ # Add tolerations for control-plane taint (common in single-node clusters)
425
+ tolerations=[
426
+ client.V1Toleration(
427
+ key="node-role.kubernetes.io/control-plane",
428
+ operator="Exists",
429
+ effect="NoSchedule"
430
+ )
431
+ ]
432
+ )
433
+
434
+ # Add image pull secret if specified
435
+ if registry_secret:
436
+ pod_spec.image_pull_secrets = [
437
+ client.V1LocalObjectReference(name=registry_secret)
438
+ ]
439
+
440
+ # Add node selector for GPU
441
+ if gpu_required and gpu_node_selector:
442
+ pod_spec.node_selector = {gpu_node_selector: "true"}
443
+
444
+ # Build job spec
445
+ job_spec = client.V1JobSpec(
446
+ backoff_limit=2,
447
+ ttl_seconds_after_finished=3600, # Clean up after 1 hour
448
+ template=client.V1PodTemplateSpec(
449
+ metadata=client.V1ObjectMeta(
450
+ labels={
451
+ "app": "matrice-action",
452
+ "action-id": action_id,
453
+ "action-type": action_type,
454
+ }
455
+ ),
456
+ spec=pod_spec,
457
+ ),
458
+ )
459
+
460
+ # Build job
461
+ job = client.V1Job(
462
+ api_version="batch/v1",
463
+ kind="Job",
464
+ metadata=client.V1ObjectMeta(
465
+ name=job_name,
466
+ namespace=namespace,
467
+ labels={
468
+ "app": "matrice-action",
469
+ "action-id": action_id,
470
+ "action-type": action_type,
471
+ "managed-by": "matrice-scheduler",
472
+ }
473
+ ),
474
+ spec=job_spec,
475
+ )
476
+
477
+ # Ensure namespace exists
478
+ self._ensure_namespace(namespace)
479
+
480
+ # Create the job
481
+ try:
482
+ self.batch_v1.create_namespaced_job(namespace=namespace, body=job)
483
+ logger.info(f"Created K8s job {job_name} for action {action_id}")
484
+
485
+ # Update action status
486
+ self.update_action_status(
487
+ action_id,
488
+ "K8S_JOB_CREATED",
489
+ "OK",
490
+ f"Kubernetes job {job_name} created",
491
+ {
492
+ "kubernetesJobName": job_name,
493
+ "kubernetesNamespace": namespace,
494
+ "jobCreatedAt": datetime.now().isoformat(),
495
+ }
496
+ )
497
+
498
+ # Track the job
499
+ self.running_jobs[action_id] = job_name
500
+
501
+ return job_name
502
+
503
+ except ApiException as e:
504
+ if e.status == 409: # Already exists
505
+ logger.warning(f"Job {job_name} already exists")
506
+ self.running_jobs[action_id] = job_name
507
+ return job_name
508
+ else:
509
+ logger.error(f"Failed to create K8s job: {e}")
510
+ self.update_action_status(
511
+ action_id, "ERROR", "ERROR",
512
+ f"Failed to create Kubernetes job: {e.reason}"
513
+ )
514
+ return None
515
+
516
+ def _ensure_namespace(self, namespace: str):
517
+ """Ensure the namespace exists"""
518
+ try:
519
+ self.core_v1.read_namespace(namespace)
520
+ except ApiException as e:
521
+ if e.status == 404:
522
+ # Create namespace
523
+ ns = client.V1Namespace(
524
+ metadata=client.V1ObjectMeta(
525
+ name=namespace,
526
+ labels={"managed-by": "matrice-scheduler"}
527
+ )
528
+ )
529
+ try:
530
+ self.core_v1.create_namespace(ns)
531
+ logger.info(f"Created namespace {namespace}")
532
+ except ApiException as create_err:
533
+ if create_err.status != 409: # Not already exists
534
+ raise
535
+
536
+ def check_job_status(self, action_id: str, job_name: str, namespace: str) -> Optional[str]:
537
+ """Check the status of a K8s job and return status if completed. Also monitors resource usage."""
538
+ try:
539
+ job = self.batch_v1.read_namespaced_job(job_name, namespace)
540
+
541
+ # Get pod info for detailed logging and resource monitoring
542
+ pod_selector = f"job-name={job_name}"
543
+ pods = self.core_v1.list_namespaced_pod(
544
+ namespace=namespace,
545
+ label_selector=pod_selector
546
+ )
547
+
548
+ # Log detailed status
549
+ if pods.items:
550
+ pod = pods.items[0]
551
+ pod_name = pod.metadata.name
552
+ pod_phase = pod.status.phase
553
+
554
+ logger.info(
555
+ f"Job status - Action: {action_id}, Job: {job_name}, "
556
+ f"Pod: {pod_name}, Phase: {pod_phase}, "
557
+ f"Active: {job.status.active}, Succeeded: {job.status.succeeded}, Failed: {job.status.failed}"
558
+ )
559
+
560
+ # Get pod resource usage metrics if available
561
+ try:
562
+ resource_info = self._get_pod_resource_usage(namespace, pod_name)
563
+ if resource_info:
564
+ logger.info(f"Resource usage for {pod_name}: {resource_info}")
565
+
566
+ # Update action with resource info
567
+ self.update_action_status(
568
+ action_id,
569
+ "K8S_JOB_RUNNING",
570
+ "OK",
571
+ f"Job running - Pod: {pod_name}, Phase: {pod_phase}",
572
+ {
573
+ "podName": pod_name,
574
+ "podPhase": pod_phase,
575
+ "resourceUsage": resource_info
576
+ }
577
+ )
578
+ except Exception as e:
579
+ logger.debug(f"Could not fetch resource metrics: {e}")
580
+
581
+ if job.status.succeeded:
582
+ return "COMPLETED"
583
+ elif job.status.failed:
584
+ return "FAILED"
585
+ elif job.status.active:
586
+ return "RUNNING"
587
+ else:
588
+ return "PENDING"
589
+
590
+ except ApiException as e:
591
+ if e.status == 404:
592
+ return "NOT_FOUND"
593
+ logger.error(f"Error checking job status for {job_name}: {e}")
594
+ return None
595
+
596
+ def _get_pod_resource_usage(self, namespace: str, pod_name: str) -> Optional[Dict[str, Dict[str, str]]]:
597
+ """Get current resource usage for a pod"""
598
+ try:
599
+ # Read pod to get resource requests/limits
600
+ pod = self.core_v1.read_namespaced_pod(pod_name, namespace)
601
+
602
+ resource_info: Dict[str, Dict[str, str]] = {
603
+ "requests": {},
604
+ "limits": {}
605
+ }
606
+
607
+ for container in pod.spec.containers:
608
+ if container.resources:
609
+ if container.resources.requests:
610
+ resource_info["requests"] = {
611
+ "cpu": str(container.resources.requests.get("cpu", "unknown") if isinstance(container.resources.requests, dict) else "unknown"),
612
+ "memory": str(container.resources.requests.get("memory", "unknown") if isinstance(container.resources.requests, dict) else "unknown"),
613
+ "gpu": str(container.resources.requests.get("nvidia.com/gpu", "0") if isinstance(container.resources.requests, dict) else "0")
614
+ }
615
+ if container.resources.limits:
616
+ resource_info["limits"] = {
617
+ "cpu": str(container.resources.limits.get("cpu", "unknown") if isinstance(container.resources.limits, dict) else "unknown"),
618
+ "memory": str(container.resources.limits.get("memory", "unknown") if isinstance(container.resources.limits, dict) else "unknown"),
619
+ "gpu": str(container.resources.limits.get("nvidia.com/gpu", "0") if isinstance(container.resources.limits, dict) else "0"),
620
+ "gpuMemory": str(container.resources.limits.get("nvidia.com/gpu-memory", "") if isinstance(container.resources.limits, dict) else "")
621
+ }
622
+
623
+ return resource_info
624
+ except Exception as e:
625
+ logger.debug(f"Error getting pod resource usage: {e}")
626
+ return None
627
+
628
+ def monitor_running_jobs(self):
629
+ """Monitor running jobs and update action statuses"""
630
+ for action_id, job_name in list(self.running_jobs.items()):
631
+ namespace = self.job_namespace
632
+ status = self.check_job_status(action_id, job_name, namespace)
633
+
634
+ if status == "COMPLETED":
635
+ logger.info(f"Job {job_name} completed successfully")
636
+ self.update_action_status(
637
+ action_id, "COMPLETED", "OK",
638
+ "Kubernetes job completed successfully"
639
+ )
640
+ del self.running_jobs[action_id]
641
+
642
+ elif status == "FAILED":
643
+ logger.warning(f"Job {job_name} failed")
644
+ self.update_action_status(
645
+ action_id, "ERROR", "ERROR",
646
+ "Kubernetes job failed"
647
+ )
648
+ del self.running_jobs[action_id]
649
+
650
+ elif status == "NOT_FOUND":
651
+ logger.warning(f"Job {job_name} not found, removing from tracking")
652
+ del self.running_jobs[action_id]
653
+
654
+ def send_heartbeat(self):
655
+ """Send heartbeat to Matrice API with cluster health info"""
656
+ try:
657
+ # Get cluster health info
658
+ nodes_ready = 0
659
+ nodes_total = 0
660
+ gpus_available = 0
661
+ gpus_total = 0
662
+
663
+ try:
664
+ nodes = self.core_v1.list_node()
665
+ for node in nodes.items:
666
+ nodes_total += 1
667
+ for condition in node.status.conditions:
668
+ if condition.type == "Ready" and condition.status == "True":
669
+ nodes_ready += 1
670
+
671
+ # Check for GPU resources
672
+ allocatable = node.status.allocatable or {}
673
+ for key, value in allocatable.items():
674
+ if "gpu" in key.lower():
675
+ gpus_total += int(value)
676
+ except Exception as e:
677
+ logger.warning(f"Error getting node info: {e}")
678
+
679
+ # Get job counts
680
+ running_jobs = 0
681
+ pending_jobs = 0
682
+ try:
683
+ jobs = self.batch_v1.list_namespaced_job(self.job_namespace)
684
+ for job in jobs.items:
685
+ if job.status.active:
686
+ running_jobs += 1
687
+ elif not job.status.succeeded and not job.status.failed:
688
+ pending_jobs += 1
689
+ except Exception as e:
690
+ logger.warning(f"Error getting job info: {e}")
691
+
692
+ # Send heartbeat - for now just log the status
693
+ # TODO: Add K8s cluster heartbeat endpoint if needed
694
+ logger.debug(f"Cluster {self.cluster_id} status: nodes={nodes_ready}/{nodes_total}, "
695
+ f"gpus={gpus_available}/{gpus_total}, jobs={running_jobs} running, {pending_jobs} pending")
696
+
697
+ except Exception as e:
698
+ logger.error(f"Error sending heartbeat: {e}")
699
+
700
+ def start(self):
701
+ """Main scheduler loop - matches InstanceManager.start() pattern"""
702
+ logger.info(f"Starting K8s Scheduler for cluster {self.cluster_id}")
703
+
704
+ heartbeat_counter = 0
705
+ heartbeat_interval = 6 # Send heartbeat every 6 poll cycles (60s if poll_interval=10)
706
+
707
+ while True:
708
+ try:
709
+ # Poll for pending actions
710
+ logger.info(f"Polling for pending actions for cluster {self.cluster_id} (Running jobs: {len(self.running_jobs)})")
711
+ pending_actions = self.poll_pending_actions()
712
+
713
+ if pending_actions:
714
+ logger.info(f"Found {len(pending_actions)} pending action(s)")
715
+
716
+ # Create jobs for pending actions
717
+ for action in pending_actions:
718
+ action_id = action.get("_id", action.get("id", ""))
719
+ action_type = action.get("action", "unknown")
720
+ if action_id not in self.running_jobs:
721
+ logger.info(f"Creating job for action {action_id} (type: {action_type})")
722
+ self.create_k8s_job(action)
723
+ else:
724
+ logger.debug(f"Skipping action {action_id} - already running")
725
+
726
+ # Monitor running jobs
727
+ if self.running_jobs:
728
+ logger.info(f"Monitoring {len(self.running_jobs)} running job(s)")
729
+ self.monitor_running_jobs()
730
+
731
+ # Send heartbeat periodically
732
+ heartbeat_counter += 1
733
+ if heartbeat_counter >= heartbeat_interval:
734
+ logger.info("Sending heartbeat to Matrice API")
735
+ self.send_heartbeat()
736
+ heartbeat_counter = 0
737
+
738
+ # Log summary
739
+ logger.info(f"Cycle complete - Running: {len(self.running_jobs)}, Pending: {len(pending_actions)}")
740
+
741
+ # Wait before next poll
742
+ time.sleep(self.poll_interval)
743
+
744
+ except KeyboardInterrupt:
745
+ logger.info("Scheduler stopped by user")
746
+ break
747
+ except Exception as e:
748
+ logger.error(f"Error in scheduler loop: {e}", exc_info=True)
749
+ time.sleep(self.poll_interval)