anyscale 0.26.47__py3-none-any.whl → 0.26.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. anyscale/__init__.py +0 -7
  2. anyscale/_private/anyscale_client/README.md +115 -0
  3. anyscale/_private/anyscale_client/anyscale_client.py +12 -213
  4. anyscale/_private/anyscale_client/common.py +0 -55
  5. anyscale/_private/anyscale_client/fake_anyscale_client.py +19 -46
  6. anyscale/_private/docgen/__main__.py +32 -47
  7. anyscale/_private/docgen/generator.py +32 -16
  8. anyscale/_private/docgen/generator_legacy.py +58 -6
  9. anyscale/_private/docgen/models.md +3 -2
  10. anyscale/_private/workload/workload_config.py +16 -8
  11. anyscale/_private/workload/workload_sdk.py +24 -7
  12. anyscale/client/README.md +10 -2
  13. anyscale/client/openapi_client/__init__.py +6 -2
  14. anyscale/client/openapi_client/api/default_api.py +558 -8
  15. anyscale/client/openapi_client/models/__init__.py +6 -2
  16. anyscale/client/openapi_client/models/{alert_type.py → alert_issue_type.py} +8 -20
  17. anyscale/client/openapi_client/models/baseimagesenum.py +1 -2
  18. anyscale/client/openapi_client/models/cloud.py +31 -3
  19. anyscale/client/openapi_client/models/cloud_deployment.py +30 -3
  20. anyscale/client/openapi_client/models/cloud_with_cloud_resource.py +29 -1
  21. anyscale/client/openapi_client/models/cloud_with_cloud_resource_gcp.py +29 -1
  22. anyscale/client/openapi_client/models/dataset_metrics.py +6 -6
  23. anyscale/client/openapi_client/models/dataset_state.py +2 -1
  24. anyscale/client/openapi_client/models/decorated_cloud_deployment.py +481 -0
  25. anyscale/client/openapi_client/models/decoratedclouddeployment_response.py +121 -0
  26. anyscale/client/openapi_client/models/describe_system_workload_response.py +32 -6
  27. anyscale/client/openapi_client/models/experimental_workspace.py +29 -1
  28. anyscale/client/openapi_client/models/experimental_workspaces_sort_field.py +2 -1
  29. anyscale/client/openapi_client/models/metrics_query_response.py +121 -0
  30. anyscale/client/openapi_client/models/{clouddeployment_response.py → metricsqueryresponse_response.py} +11 -11
  31. anyscale/client/openapi_client/models/operator_metrics.py +8 -9
  32. anyscale/client/openapi_client/models/operator_status.py +102 -0
  33. anyscale/client/openapi_client/models/organization_usage_alert.py +20 -20
  34. anyscale/client/openapi_client/models/supportedbaseimagesenum.py +1 -2
  35. anyscale/cloud/models.py +330 -0
  36. anyscale/commands/cloud_commands.py +136 -44
  37. anyscale/commands/command_examples.py +54 -134
  38. anyscale/commands/compute_config_commands.py +7 -11
  39. anyscale/compute_config/__init__.py +2 -16
  40. anyscale/compute_config/_private/compute_config_sdk.py +27 -17
  41. anyscale/compute_config/commands.py +14 -44
  42. anyscale/compute_config/models.py +49 -26
  43. anyscale/controllers/cloud_controller.py +289 -171
  44. anyscale/controllers/cloud_file_storage_utils.py +204 -0
  45. anyscale/controllers/kubernetes_verifier.py +1570 -0
  46. anyscale/job/_private/job_sdk.py +17 -8
  47. anyscale/job/models.py +1 -1
  48. anyscale/scripts.py +0 -2
  49. anyscale/sdk/anyscale_client/models/baseimagesenum.py +1 -2
  50. anyscale/sdk/anyscale_client/models/cloud.py +31 -3
  51. anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +1 -2
  52. anyscale/shared_anyscale_utils/headers.py +3 -0
  53. anyscale/shared_anyscale_utils/utils/id_gen.py +1 -0
  54. anyscale/version.py +1 -1
  55. anyscale/workspace/models.py +14 -7
  56. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/METADATA +1 -1
  57. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/RECORD +62 -73
  58. anyscale/commands/llm/dataset_commands.py +0 -269
  59. anyscale/commands/llm/group.py +0 -15
  60. anyscale/commands/llm/models_commands.py +0 -123
  61. anyscale/controllers/llm/__init__.py +0 -0
  62. anyscale/controllers/llm/models_controller.py +0 -144
  63. anyscale/llm/__init__.py +0 -2
  64. anyscale/llm/dataset/__init__.py +0 -2
  65. anyscale/llm/dataset/_private/__init__.py +0 -0
  66. anyscale/llm/dataset/_private/docs.py +0 -63
  67. anyscale/llm/dataset/_private/models.py +0 -71
  68. anyscale/llm/dataset/_private/sdk.py +0 -147
  69. anyscale/llm/model/__init__.py +0 -2
  70. anyscale/llm/model/_private/models_sdk.py +0 -62
  71. anyscale/llm/model/commands.py +0 -93
  72. anyscale/llm/model/models.py +0 -171
  73. anyscale/llm/model/sdk.py +0 -62
  74. anyscale/llm/sdk.py +0 -27
  75. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/WHEEL +0 -0
  76. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/entry_points.txt +0 -0
  77. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/licenses/LICENSE +0 -0
  78. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/licenses/NOTICE +0 -0
  79. {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/top_level.txt +0 -0
anyscale/controllers/kubernetes_verifier.py (new file; entry 45 in the file list above)
@@ -0,0 +1,1570 @@
+ """
+ Kubernetes Cloud Deployment Verifier
+
+ Handles verification of Kubernetes-based cloud deployments including:
+ - Operator pod health and connectivity
+ - File storage (CSI drivers, PVCs, NFS)
+ - Network connectivity
+ - Gateway support
+ - Nginx ingress controller
+ """
+
+
+ from contextlib import contextmanager, suppress
+ from dataclasses import dataclass
+ import json
+ import os
+ import shutil
+ import signal
+ import socket
+ import subprocess
+ import time
+ from typing import Dict, List, Optional
+
+ import click
+ import requests
+
+ from anyscale.cli_logger import BlockLogger
+ from anyscale.client.openapi_client.models.cloud_deployment import CloudDeployment
+ from anyscale.client.openapi_client.models.cloud_providers import CloudProviders
+ from anyscale.client.openapi_client.models.file_storage import FileStorage
+ from anyscale.client.openapi_client.models.kubernetes_config import (
+     KubernetesConfig as OpenAPIKubernetesConfig,
+ )
+ from anyscale.controllers.cloud_file_storage_utils import verify_file_storage_exists
+
+
+ # =============================================================================
+ # CONSTANTS
+ # =============================================================================
+
+ # Operator configuration
+ OPERATOR_HEALTH_PORT = 2113
+ OPERATOR_CONFIG_ENDPOINT = "/config"
+ OPERATOR_HEALTH_ENDPOINT = "/healthz/run"
+ DEFAULT_OPERATOR_NAMESPACE = "anyscale-operator"
+
+ # Network and timing configuration
+ PORT_FORWARD_WAIT_TIME = 3  # seconds to wait for port forward to establish
+ HTTP_REQUEST_TIMEOUT = 10  # seconds for HTTP requests to operator
+ PORT_FORWARD_TERMINATION_TIMEOUT = 5  # seconds to wait for graceful termination
+
+ OPERATOR_LABEL_SELECTOR = "app=anyscale-operator"
+
+ # Gateway resource types to check
+ GATEWAY_RESOURCE_TYPES = [
+     "gateway.gateway",  # Gateway API v1
+     "gateways.gateway.networking.k8s.io",  # Full API path
+     "gateway",  # Short name
+     "gw",  # Common alias
+ ]
+
+ # NGINX ingress controller configurations
+ NGINX_INGRESS_CONFIGS = [
+     {"namespace": "ingress-nginx", "label": "app.kubernetes.io/name=ingress-nginx"},
+     {"namespace": "nginx-ingress", "label": "app=nginx-ingress"},
+     {"namespace": "kube-system", "label": "app.kubernetes.io/name=ingress-nginx"},
+     {"namespace": "default", "label": "app=nginx-ingress"},
+ ]
+
+ # Ingress controller name patterns for fallback search
+ INGRESS_CONTROLLER_KEYWORDS = [
+     "ingress",
+     "haproxy",
+     "traefik",
+     "contour",
+     "ambassador",
+     "istio-gateway",
+     "nginx",
+ ]
+
+ # kubectl binary search paths
+ KUBECTL_COMMON_PATHS = [
+     "/usr/local/bin/kubectl",
+     "/usr/bin/kubectl",
+     "/bin/kubectl",
+     "/opt/homebrew/bin/kubectl",  # macOS homebrew
+     "~/.local/bin/kubectl",  # User local install
+ ]
+
+ # Status and result strings
+ PASSED_STATUS = "PASSED"
+ FAILED_STATUS = "FAILED"
+ RUNNING_STATUS = "Running"
+
+ # Verification component names (for consistent reporting)
+ class VerificationComponents:
+     OPERATOR_POD_INSTALLED = "Operator Pod Installed"
+     OPERATOR_HEALTH = "Operator Health"
+     OPERATOR_IDENTITY = "Operator Identity"
+     FILE_STORAGE = "File Storage"
+     GATEWAY_SUPPORT = "Gateway Support"
+     NGINX_INGRESS = "NGINX Ingress"
+
+
+ # =============================================================================
+ # EXCEPTIONS
+ # =============================================================================
+
+
+ class KubernetesVerificationError(Exception):
+     """Base exception for all Kubernetes verification errors."""
+
+
+ class KubectlError(KubernetesVerificationError):
+     """Raised when kubectl commands fail."""
+
+     def __init__(
+         self, message: str, command: Optional[str] = None, stderr: Optional[str] = None
+     ):
+         super().__init__(message)
+         self.command = command
+         self.stderr = stderr
+
+
+ class KubectlNotFoundError(KubernetesVerificationError):
+     """Raised when kubectl binary cannot be found."""
+
+
+ class OperatorPodNotFoundError(KubernetesVerificationError):
+     """Raised when the Anyscale operator pod cannot be found."""
+
+
+ class OperatorConnectionError(KubernetesVerificationError):
+     """Raised when connection to the operator fails."""
+
+     def __init__(
+         self,
+         message: str,
+         pod_name: Optional[str] = None,
+         endpoint: Optional[str] = None,
+     ):
+         super().__init__(message)
+         self.pod_name = pod_name
+         self.endpoint = endpoint
+
+
+ class PortForwardError(KubernetesVerificationError):
+     """Raised when port forwarding to a pod fails."""
+
+     def __init__(
+         self, message: str, pod_name: Optional[str] = None, port: Optional[int] = None
+     ):
+         super().__init__(message)
+         self.pod_name = pod_name
+         self.port = port
+
+
+ class IdentityVerificationError(KubernetesVerificationError):
+     """Raised when operator identity verification fails."""
+
+     def __init__(
+         self,
+         message: str,
+         expected_identity: Optional[str] = None,
+         actual_identity: Optional[str] = None,
+     ):
+         super().__init__(message)
+         self.expected_identity = expected_identity
+         self.actual_identity = actual_identity
+
+
+ class FileStorageVerificationError(KubernetesVerificationError):
+     """Raised when file storage verification fails."""
+
+
+ class GatewayVerificationError(KubernetesVerificationError):
+     """Raised when gateway verification fails."""
+
+     def __init__(self, message: str, gateway_name: Optional[str] = None):
+         super().__init__(message)
+         self.gateway_name = gateway_name
+
+
+ class ResourceNotFoundError(KubernetesVerificationError):
+     """Raised when a required Kubernetes resource is not found."""
+
+     def __init__(
+         self,
+         message: str,
+         resource_type: Optional[str] = None,
+         resource_name: Optional[str] = None,
+         namespace: Optional[str] = None,
+     ):
+         super().__init__(message)
+         self.resource_type = resource_type
+         self.resource_name = resource_name
+         self.namespace = namespace
+
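
All of these exceptions derive from KubernetesVerificationError, so callers can handle any verification failure through the base class while still branching on the structured subclasses. A minimal sketch (run_verification is a hypothetical caller, not part of this module):

    # Hypothetical caller: branch on structured subclasses, fall back to the base.
    try:
        run_verification()  # placeholder for code that raises the errors above
    except KubectlNotFoundError:
        print("kubectl is not installed or not on PATH")
    except KubectlError as e:
        print(f"kubectl failed: {e} (command={e.command!r}, stderr={e.stderr!r})")
    except KubernetesVerificationError as e:
        print(f"verification failed: {e}")
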
+
+ # =============================================================================
+ # DATA MODELS
+ # =============================================================================
+
+
+ @dataclass
+ class VerificationResults:
+     """Tracks the results of all verification steps."""
+
+     operator_pod_installed: bool = False
+     operator_health: bool = False
+     operator_identity: bool = False
+     file_storage: bool = False
+     gateway_support: bool = False
+     nginx_ingress: bool = False
+
+     def to_dict(self) -> Dict[str, bool]:
+         """Convert to dictionary format matching original implementation."""
+         return {
+             VerificationComponents.OPERATOR_POD_INSTALLED: self.operator_pod_installed,
+             VerificationComponents.OPERATOR_HEALTH: self.operator_health,
+             VerificationComponents.OPERATOR_IDENTITY: self.operator_identity,
+             VerificationComponents.FILE_STORAGE: self.file_storage,
+             VerificationComponents.GATEWAY_SUPPORT: self.gateway_support,
+             VerificationComponents.NGINX_INGRESS: self.nginx_ingress,
+         }
+
+     @property
+     def overall_success(self) -> bool:
+         """Return True if all verification steps passed."""
+         return all(
+             [
+                 self.operator_pod_installed,
+                 self.operator_health,
+                 self.operator_identity,
+                 self.file_storage,
+                 self.gateway_support,
+                 self.nginx_ingress,
+             ]
+         )
+
+
+ @dataclass
+ class KubernetesConfig:
+     """Configuration for Kubernetes cluster access."""
+
+     context: str
+     operator_namespace: str
+
+     def __post_init__(self):
+         """Validate configuration after initialization."""
+         if not self.context:
+             raise ValueError("Kubernetes context cannot be empty")
+         if not self.operator_namespace:
+             raise ValueError("Operator namespace cannot be empty")
+
+
+ @dataclass
+ class OperatorHealthData:
+     """Data retrieved from operator health endpoint."""
+
+     status_code: int
+     response_text: Optional[str] = None
+
+     @property
+     def is_healthy(self) -> bool:
+         """Return True if operator is healthy."""
+         return self.status_code == 200
+
+
+ @dataclass
+ class OperatorConfigData:
+     """Data retrieved from operator config endpoint."""
+
+     status_code: int
+     response_text: str
+     config_data: Optional[Dict] = None
+     config_error: Optional[str] = None
+
+     @property
+     def is_valid(self) -> bool:
+         """Return True if config data is valid."""
+         return self.status_code == 200 and self.config_data is not None
+
+
+ @dataclass
+ class OperatorData:
+     """Combined data from operator health and config endpoints."""
+
+     health: OperatorHealthData
+     config: OperatorConfigData
+
+     @classmethod
+     def from_dict(cls, data: Dict) -> "OperatorData":
+         """Create OperatorData from dictionary format used in original code."""
+         health = OperatorHealthData(
+             status_code=data["health_status"], response_text=data.get("health_response")
+         )
+
+         config = OperatorConfigData(
+             status_code=data["config_status"],
+             response_text=data["config_response"],
+             config_data=data.get("config_data"),
+             config_error=data.get("config_error"),
+         )
+
+         return cls(health=health, config=config)
+
+
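A sketch of how the operator dataclasses fit together, using the exact dictionary keys that OperatorData.from_dict reads (the payload values are invented):

    # Invented payload in the dict shape accepted by OperatorData.from_dict.
    raw = {
        "health_status": 200,
        "config_status": 200,
        "config_response": '{"iamIdentity": "arn:aws:iam::123456789012:role/op"}',
        "config_data": {"iamIdentity": "arn:aws:iam::123456789012:role/op"},
    }
    operator_data = OperatorData.from_dict(raw)
    assert operator_data.health.is_healthy  # health_status == 200
    assert operator_data.config.is_valid    # 200 and config_data is present
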
+ @dataclass
+ class GatewayConfig:
+     """Gateway configuration from operator."""
+
+     enabled: bool = False
+     name: Optional[str] = None
+
+     @classmethod
+     def from_operator_config(cls, config_data: Optional[Dict]) -> "GatewayConfig":
+         """Extract gateway config from operator configuration."""
+         if not config_data:
+             return cls()
+
+         gateway_config = config_data.get("gateway", {})
+         if not gateway_config:
+             return cls()
+
+         return cls(
+             enabled=gateway_config.get("enable", False), name=gateway_config.get("name")
+         )
+
+     @property
+     def requires_verification(self) -> bool:
+         """Return True if gateway verification is required."""
+         return self.enabled and self.name is not None
+
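
Concretely, an operator config shaped like the payload below (field names exactly as from_operator_config reads them; the gateway name is invented) is what turns gateway verification on:

    cfg = GatewayConfig.from_operator_config(
        {"gateway": {"enable": True, "name": "anyscale-gateway"}}  # sample payload
    )
    assert cfg.enabled and cfg.requires_verification
    # No config, or a missing/empty "gateway" block, requires no verification:
    assert not GatewayConfig.from_operator_config(None).requires_verification
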
+
+ # =============================================================================
+ # KUBECTL OPERATIONS
+ # =============================================================================
+
+
+ class KubectlOperations:
+     """Utility class for executing kubectl commands with consistent error handling."""
+
+     def __init__(self, context: str, logger: BlockLogger):
+         self.context = context
+         self.log = logger
+         self._kubectl_path: Optional[str] = None
+
+     def get_resource(
+         self, resource_type: str, name: str, namespace: Optional[str] = None
+     ) -> Dict:
+         """Get a single Kubernetes resource by name."""
+         cmd_args = ["get", resource_type, name, "--context", self.context, "-o", "json"]
+         if namespace:
+             cmd_args.extend(["-n", namespace])
+
+         try:
+             result = self._run_kubectl_command(cmd_args)
+             return json.loads(result.stdout)
+         except subprocess.CalledProcessError as e:
+             if "not found" in e.stderr.lower():
+                 raise ResourceNotFoundError(
+                     f"{resource_type} '{name}' not found",
+                     resource_type=resource_type,
+                     resource_name=name,
+                     namespace=namespace,
+                 )
+             raise KubectlError(
+                 f"Failed to get {resource_type} '{name}': {e.stderr}",
+                 command=" ".join(cmd_args),
+                 stderr=e.stderr,
+             )
+         except json.JSONDecodeError as e:
+             raise KubectlError(
+                 f"Invalid JSON response from kubectl: {e}", command=" ".join(cmd_args)
+             )
+
+     def list_resources(
+         self,
+         resource_type: str,
+         namespace: Optional[str] = None,
+         label_selector: Optional[str] = None,
+         all_namespaces: bool = False,
+     ) -> List[Dict]:
+         """List Kubernetes resources with optional filtering."""
+         cmd_args = ["get", resource_type, "--context", self.context, "-o", "json"]
+
+         if all_namespaces:
+             cmd_args.append("--all-namespaces")
+         elif namespace:
+             cmd_args.extend(["-n", namespace])
+
+         if label_selector:
+             cmd_args.extend(["-l", label_selector])
+
+         try:
+             result = self._run_kubectl_command(cmd_args)
+             data = json.loads(result.stdout)
+             return data.get("items", [])
+         except subprocess.CalledProcessError as e:
+             raise KubectlError(
+                 f"Failed to list {resource_type}: {e.stderr}",
+                 command=" ".join(cmd_args),
+                 stderr=e.stderr,
+             )
+         except json.JSONDecodeError as e:
+             raise KubectlError(
+                 f"Invalid JSON response from kubectl: {e}", command=" ".join(cmd_args)
+             )
+
+     def get_resource_field(
+         self,
+         resource_type: str,
+         name: str,
+         jsonpath: str,
+         namespace: Optional[str] = None,
+     ) -> str:
+         """Get a specific field from a Kubernetes resource using jsonpath."""
+         cmd_args = [
+             "get",
+             resource_type,
+             name,
+             "--context",
+             self.context,
+             "-o",
+             f"jsonpath={jsonpath}",
+         ]
+         if namespace:
+             cmd_args.extend(["-n", namespace])
+
+         try:
+             result = self._run_kubectl_command(cmd_args)
+             return result.stdout.strip()
+         except subprocess.CalledProcessError as e:
+             if "not found" in e.stderr.lower():
+                 raise ResourceNotFoundError(
+                     f"{resource_type} '{name}' not found",
+                     resource_type=resource_type,
+                     resource_name=name,
+                     namespace=namespace,
+                 )
+             raise KubectlError(
+                 f"Failed to get field from {resource_type} '{name}': {e.stderr}",
+                 command=" ".join(cmd_args),
+                 stderr=e.stderr,
+             )
+
+     def get_available_contexts(self) -> List[str]:
+         """Get list of available kubectl contexts."""
+         try:
+             result = self._run_kubectl_command(["config", "get-contexts", "-o", "name"])
+             contexts = [
+                 ctx.strip() for ctx in result.stdout.strip().split("\n") if ctx.strip()
+             ]
+             return contexts
+         except subprocess.CalledProcessError as e:
+             raise KubectlError(
+                 f"Failed to get kubectl contexts: {e.stderr}",
+                 command="kubectl config get-contexts -o name",
+                 stderr=e.stderr,
+             )
+
+     def get_current_context(self) -> Optional[str]:
+         """Get the current kubectl context."""
+         try:
+             result = self._run_kubectl_command(["config", "current-context"])
+             return result.stdout.strip()
+         except subprocess.CalledProcessError as e:
+             if "current-context is not set" in e.stderr.lower():
+                 return None
+             raise KubectlError(
+                 f"Failed to get current context: {e.stderr}",
+                 command="kubectl config current-context",
+                 stderr=e.stderr,
+             )
+
+     def start_port_forward(
+         self, pod_name: str, local_port: int, remote_port: int, namespace: str
+     ) -> subprocess.Popen:
+         """Start port forwarding to a pod."""
+         cmd_args = [
+             "port-forward",
+             "--context",
+             self.context,
+             "-n",
+             namespace,
+             pod_name,
+             f"{local_port}:{remote_port}",
+         ]
+
+         try:
+             cmd = self._get_kubectl_cmd(cmd_args)
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 preexec_fn=os.setsid,  # Create new process group for cleanup
+             )
+             return process
+         except (subprocess.CalledProcessError, OSError) as e:
+             raise KubectlError(
+                 f"Failed to start port forward to {pod_name}: {e}",
+                 command=" ".join(cmd_args),
+             )
+
+     def check_kubectl_available(self) -> bool:
+         """Check if kubectl command is available."""
+         try:
+             self._run_kubectl_command(["version", "--client"])
+             return True
+         except (subprocess.CalledProcessError, FileNotFoundError, KubectlNotFoundError):
+             return False
+
+     def get_pod_status(self, pod_name: str, namespace: str) -> str:
+         """
+         Get pod status phase in specific namespace.
+
+         Args:
+             pod_name: Name of the pod
+             namespace: Namespace containing the pod
+
+         Returns:
+             Pod status phase (e.g., "Running", "Pending") or "unknown" if cannot be determined
+         """
+         try:
+             return self.get_resource_field(
+                 "pod", pod_name, "{.status.phase}", namespace=namespace
+             )
+         except (KubectlError, ResourceNotFoundError):
+             # Return "unknown" if status cannot be determined
+             return "unknown"
+
+     def is_pod_running(self, pod_name: str, namespace: str) -> bool:
+         """
+         Check if pod is in running state.
+
+         Args:
+             pod_name: Name of the pod
+             namespace: Namespace containing the pod
+
+         Returns:
+             True if pod is running, False otherwise
+         """
+         try:
+             status = self.get_resource_field(
+                 "pod", pod_name, "{.status.phase}", namespace=namespace
+             )
+             return status == RUNNING_STATUS
+         except (KubectlError, ResourceNotFoundError):
+             # Return False if status check fails
+             return False
+
+     def _run_kubectl_command(self, args: List[str]) -> subprocess.CompletedProcess:
+         """Execute a kubectl command with the given arguments."""
+         cmd = self._get_kubectl_cmd(args)
+         return subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+     def _get_kubectl_cmd(self, args: List[str]) -> List[str]:
+         """Get kubectl command with proper binary path."""
+         kubectl_path = self._find_kubectl_binary()
+         if not kubectl_path:
+             raise KubectlNotFoundError(
+                 "kubectl command not found. Please install kubectl and ensure it's in your PATH."
+             )
+         return [kubectl_path] + args
+
+     def _find_kubectl_binary(self) -> Optional[str]:
+         """Find kubectl binary in common locations."""
+         if self._kubectl_path:
+             return self._kubectl_path
+
+         # Try to find kubectl using shutil.which first (respects PATH)
+         kubectl_path = shutil.which("kubectl")
+         if kubectl_path:
+             self._kubectl_path = kubectl_path
+             return kubectl_path
+
+         # Try common installation locations
+         for path in KUBECTL_COMMON_PATHS:
+             expanded_path = os.path.expanduser(path)
+             if os.path.isfile(expanded_path) and os.access(expanded_path, os.X_OK):
+                 self._kubectl_path = expanded_path
+                 return expanded_path
+
+         return None
+
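
A minimal usage sketch of the wrapper above; the context and namespace strings are placeholders, and BlockLogger is assumed here to be constructible with no arguments:

    ops = KubectlOperations("my-cluster-context", BlockLogger())
    pods = ops.list_resources(
        "pods", namespace="anyscale-operator", label_selector=OPERATOR_LABEL_SELECTOR
    )
    if pods and ops.is_pod_running(pods[0]["metadata"]["name"], "anyscale-operator"):
        print("operator pod is up")
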
+
+ # =============================================================================
+ # OPERATOR VERIFIER
+ # =============================================================================
+
+
+ class OperatorVerifier:
+     """Handles verification of Anyscale operator pod, health, and identity."""
+
+     def __init__(
+         self,
+         kubectl_ops: KubectlOperations,
+         k8s_config: KubernetesConfig,
+         logger: BlockLogger,
+     ):
+         self.kubectl = kubectl_ops
+         self.config = k8s_config
+         self.log = logger
+
+     def find_operator_pod(self) -> str:
+         """Find and verify operator pod is running."""
+         try:
+             pods = self.kubectl.list_resources(
+                 "pods",
+                 namespace=self.config.operator_namespace,
+                 label_selector=OPERATOR_LABEL_SELECTOR,
+             )
+         except KubectlError as e:
+             raise OperatorPodNotFoundError(f"Failed to list operator pods: {e}")
+
+         if not pods:
+             raise OperatorPodNotFoundError(
+                 "No Anyscale operator pods found. Expected pods with labels like "
+                 "'app=anyscale-operator'"
+             )
+
+         operator_pod = pods[0]["metadata"]["name"]
+
+         if not self.kubectl.is_pod_running(
+             operator_pod, self.config.operator_namespace
+         ):
+             raise OperatorPodNotFoundError(
+                 f"Operator pod '{operator_pod}' is not running"
+             )
+
+         return operator_pod
+
+     def get_operator_data(self, pod_name: str) -> OperatorData:
+         """Port forward to operator and fetch both health and config data."""
+         try:
+             with self._port_forward_to_operator(pod_name) as local_port:
+                 # Fetch health data
+                 health_data = self._fetch_health_data(local_port)
+
+                 # Fetch config data
+                 config_data = self._fetch_config_data(local_port)
+
+                 return OperatorData(health=health_data, config=config_data)
+
+         except requests.RequestException as e:
+             raise OperatorConnectionError(
+                 f"Cannot connect to operator endpoints: {e}", pod_name=pod_name
+             )
+         except RuntimeError as e:
+             raise PortForwardError(
+                 f"Port forwarding failed: {e}",
+                 pod_name=pod_name,
+                 port=OPERATOR_HEALTH_PORT,
+             )
+
+     def verify_operator_health(self, operator_data: OperatorData) -> bool:
+         """Verify operator health using pre-fetched data."""
+         if operator_data.health.is_healthy:
+             return True
+         else:
+             self.log.error(
+                 f"Health check failed - HTTP {operator_data.health.status_code}"
+             )
+             if operator_data.health.response_text:
+                 self.log.error(f"Response: {operator_data.health.response_text}")
+             return False
+
+     def verify_operator_identity(
+         self,
+         operator_data: OperatorData,
+         kubernetes_config: OpenAPIKubernetesConfig,
+         cloud_provider: Optional[CloudProviders],
+     ) -> bool:
+         """Verify operator identity using pre-fetched config data."""
+         # Validate kubernetes_config contents
+         expected_identity = kubernetes_config.anyscale_operator_iam_identity
+         if not expected_identity:
+             self.log.error(
+                 "Missing 'anyscale_operator_iam_identity' in kubernetes config"
+             )
+             return False
+
+         # Validate config response
+         if not operator_data.config.is_valid:
+             self.log.error(
+                 f"Config endpoint returned HTTP {operator_data.config.status_code}"
+             )
+             if operator_data.config.response_text:
+                 self.log.error(f"Response: {operator_data.config.response_text}")
+             return False
+
+         # Extract actual identity from config
+         if operator_data.config.config_data is None:
+             self.log.error("Operator config data is None")
+             return False
+
+         actual_identity = operator_data.config.config_data.get("iamIdentity")
+         if not actual_identity:
+             self.log.error("Operator config missing 'iamIdentity' field")
+             return False
+
+         # Perform identity comparison
+         if self._evaluate_identity_match(
+             expected_identity, actual_identity, cloud_provider
+         ):
+             self.log.info(
+                 f"AWS identity match: Role matches (Expected: {expected_identity})"
+             )
+             self.log.info("Expected IAM role matches actual assumed role")
+             return True
+         else:
+             self.log.error("Operator identity mismatch")
+             self.log.error(f"Expected: {expected_identity}")
+             self.log.error(f"Actual: {actual_identity}")
+             return False
+
+     @contextmanager
+     def _port_forward_to_operator(self, pod_name: str):
+         """Context manager that port forwards to operator pod."""
+         port_forward_process = None
+         local_port = None
+         try:
+             # Get a free port for port forwarding
+             local_port = self._get_free_port()
+             self.log.info(f"Using local port {local_port} for port forwarding")
+
+             # Start port forwarding to the pod
+             self.log.info(
+                 f"Starting port forward to pod {pod_name} on port {local_port}:{OPERATOR_HEALTH_PORT}..."
+             )
+
+             port_forward_process = self.kubectl.start_port_forward(
+                 pod_name,
+                 local_port,
+                 OPERATOR_HEALTH_PORT,
+                 self.config.operator_namespace,
+             )
+
+             # Wait for port forward to establish
+             self.log.info("Waiting for port forward to establish...")
+             time.sleep(PORT_FORWARD_WAIT_TIME)
+
+             # Check if port forward process is still running
+             if port_forward_process.poll() is not None:
+                 stderr = (
+                     port_forward_process.stderr.read().decode()
+                     if port_forward_process.stderr
+                     else ""
+                 )
+                 raise RuntimeError(f"Port forward failed to start: {stderr}")
+
+             # Yield the local port to the calling function
+             yield local_port
+
+         finally:
+             # Clean up port forward process
+             if port_forward_process and port_forward_process.poll() is None:
+                 try:
+                     # Kill the entire process group to ensure cleanup
+                     os.killpg(os.getpgid(port_forward_process.pid), signal.SIGTERM)
+                     port_forward_process.wait(timeout=PORT_FORWARD_TERMINATION_TIMEOUT)
+                 except (ProcessLookupError, subprocess.TimeoutExpired):
+                     # Force kill if graceful termination fails
+                     with suppress(ProcessLookupError):
+                         os.killpg(os.getpgid(port_forward_process.pid), signal.SIGKILL)
+                 except (OSError, ValueError) as e:
+                     self.log.warning(f"Port forward cleanup warning: {e}")
+
+     def _get_free_port(self) -> int:
+         """Get a random free port on localhost."""
+         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+             s.bind(("", 0))
+             s.listen(1)
+             port = s.getsockname()[1]
+             return port
+
+     def _fetch_health_data(self, local_port: int) -> OperatorHealthData:
+         """Fetch health data from operator."""
+         response = requests.get(
+             f"http://localhost:{local_port}{OPERATOR_HEALTH_ENDPOINT}",
+             timeout=HTTP_REQUEST_TIMEOUT,
+         )
+
+         return OperatorHealthData(
+             status_code=response.status_code,
+             response_text=response.text if response.status_code != 200 else None,
+         )
+
+     def _fetch_config_data(self, local_port: int) -> OperatorConfigData:
+         """Fetch config data from operator."""
+         response = requests.get(
+             f"http://localhost:{local_port}{OPERATOR_CONFIG_ENDPOINT}",
+             timeout=HTTP_REQUEST_TIMEOUT,
+         )
+
+         config_data = None
+         config_error = None
+
+         if response.status_code == 200:
+             try:
+                 config_data = response.json()
+             except json.JSONDecodeError as e:
+                 config_error = str(e)
+
+         return OperatorConfigData(
+             status_code=response.status_code,
+             response_text=response.text,
+             config_data=config_data,
+             config_error=config_error,
+         )
+
+     def _evaluate_identity_match(
+         self,
+         expected_identity: str,
+         actual_identity: str,
+         cloud_provider: Optional[CloudProviders],
+     ) -> bool:
+         """Evaluate if the operator identity matches expected identity based on cloud provider."""
+         if not expected_identity or not actual_identity:
+             return False
+
+         # Convert to string for comparison, default to AWS
+         cloud_provider_str = str(cloud_provider) if cloud_provider else "AWS"
+
+         # Handle cloud provider specific identity comparison
+         if cloud_provider_str == "AWS":
+             return self._evaluate_aws_identity(expected_identity, actual_identity)
+         elif cloud_provider_str == "GCP":
+             return self._evaluate_gcp_identity(expected_identity, actual_identity)
+         elif cloud_provider_str == "AZURE":
+             return self._evaluate_azure_identity(expected_identity, actual_identity)
+         else:
+             # For unknown providers, fall back to exact string comparison
+             self.log.warning(
+                 f"Unknown cloud provider '{cloud_provider}', using exact string comparison"
+             )
+             return expected_identity == actual_identity
+
+     def _evaluate_aws_identity(
+         self, expected_identity: str, actual_identity: str
+     ) -> bool:
+         """Evaluate AWS IAM identity comparison."""
+         try:
+             # If they're exactly equal, that's fine
+             if expected_identity == actual_identity:
+                 return True
+
+             # Check if actual is an assumed role version of expected role
+             if self._is_aws_assumed_role(actual_identity):
+                 # Extract the role name from both ARNs
+                 expected_role = self._extract_aws_role_name(expected_identity)
+                 actual_role = self._extract_aws_role_name_from_assumed_role(
+                     actual_identity
+                 )
+
+                 if expected_role and actual_role and expected_role == actual_role:
+                     # Also check account ID matches
+                     expected_account = self._extract_aws_account_id(expected_identity)
+                     actual_account = self._extract_aws_account_id(actual_identity)
+
+                     if expected_account == actual_account:
+                         self.log.info(
+                             f"AWS identity match: Role '{expected_role}' (account: {expected_account})"
+                         )
+                         return True
+
+             return False
+
+         except (ValueError, IndexError, AttributeError) as e:
+             self.log.error(f"Error evaluating AWS identity: {e}")
+             return False
+
+     def _evaluate_gcp_identity(
+         self, expected_identity: str, actual_identity: str
+     ) -> bool:
+         """Evaluate GCP identity comparison."""
+         return expected_identity == actual_identity
+
+     def _evaluate_azure_identity(
+         self, expected_identity: str, actual_identity: str
+     ) -> bool:
+         """Evaluate Azure identity comparison."""
+         return expected_identity == actual_identity
+
+     def _is_aws_assumed_role(self, arn: str) -> bool:
+         """Check if ARN is an assumed role ARN."""
+         return arn.startswith("arn:aws:sts:") and ":assumed-role/" in arn
+
+     def _extract_aws_role_name(self, role_arn: str) -> Optional[str]:
+         """Extract role name from IAM role ARN."""
+         try:
+             if ":role/" in role_arn:
+                 return role_arn.split(":role/")[-1]
+             return None
+         except (ValueError, IndexError):
+             return None
+
+     def _extract_aws_role_name_from_assumed_role(
+         self, assumed_role_arn: str
+     ) -> Optional[str]:
+         """Extract role name from assumed role ARN."""
+         try:
+             if ":assumed-role/" in assumed_role_arn:
+                 parts = assumed_role_arn.split(":assumed-role/")[-1].split("/")
+                 if len(parts) >= 1:
+                     return parts[0]  # Role name is first part after assumed-role/
+             return None
+         except (ValueError, IndexError):
+             return None
+
+     def _extract_aws_account_id(self, arn: str) -> Optional[str]:
+         """Extract AWS account ID from any ARN."""
+         try:
+             # ARN format: arn:partition:service:region:account-id:resource
+             parts = arn.split(":")
+             if len(parts) >= 5:
+                 return parts[4]
+             return None
+         except (ValueError, IndexError):
+             return None
+
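
To make the AWS matching rule concrete: an expected IAM role ARN is treated as equivalent to the STS assumed-role ARN it produces, because the role name and account ID agree (sample ARNs invented):

    expected = "arn:aws:iam::123456789012:role/anyscale-operator"
    actual = "arn:aws:sts::123456789012:assumed-role/anyscale-operator/session-1"
    # _is_aws_assumed_role(actual)                     -> True
    # _extract_aws_role_name(expected)                 -> "anyscale-operator"
    # _extract_aws_role_name_from_assumed_role(actual) -> "anyscale-operator"
    # _extract_aws_account_id(...)                     -> "123456789012" for both,
    # so _evaluate_aws_identity(expected, actual) returns True.
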
+
+ # =============================================================================
+ # STORAGE VERIFIER
+ # =============================================================================
+
+
+ class StorageVerifier:
+     """Handles verification of file storage components for Kubernetes deployments."""
+
+     def __init__(
+         self,
+         kubectl_ops: KubectlOperations,
+         k8s_config: KubernetesConfig,
+         logger: BlockLogger,
+     ):
+         self.kubectl = kubectl_ops
+         self.config = k8s_config
+         self.log = logger
+
+     def verify_file_storage(
+         self, file_storage: FileStorage, cloud_deployment: CloudDeployment
+     ) -> bool:
+         """Verify file storage configuration (non-functional checks only)."""
+         self.log.info("Verifying file storage configuration...")
+         verification_results = []
+
+         if getattr(file_storage, "csi_ephemeral_volume_driver", None):
+             driver_name = file_storage.csi_ephemeral_volume_driver
+             if driver_name:
+                 self.log.info(f"Checking CSI driver: {driver_name}")
+                 result = self._verify_csi_driver(driver_name)
+                 verification_results.append(("CSI driver", result))
+
+         if getattr(file_storage, "persistent_volume_claim", None):
+             pvc_name = file_storage.persistent_volume_claim
+             if pvc_name:
+                 self.log.info(f"Checking PVC: {pvc_name}")
+                 result = self._verify_pvc(pvc_name)
+                 verification_results.append(("PVC", result))
+
+         if getattr(file_storage, "file_storage_id", None):
+             self.log.info("Checking NFS file storage exists via cloud provider APIs...")
+             try:
+                 nfs_exists = verify_file_storage_exists(
+                     file_storage, cloud_deployment, logger=self.log
+                 )
+                 verification_results.append(("NFS", nfs_exists))
+             except (ValueError, KeyError, TypeError, ImportError) as e:
+                 self.log.error(
+                     f"Cloud provider API error while verifying file storage: {e}"
+                 )
+                 raise RuntimeError(
+                     f"Cloud provider API error while verifying file storage: {e}"
+                 ) from e
+
+         # Return overall success
+         if verification_results:
+             return all(result for _, result in verification_results)
+         else:
+             self.log.info("INFO: No file storage components found to verify")
+             return True
+
+     def _verify_csi_driver(self, driver_name: str) -> bool:
+         """Check if CSI driver exists on cluster."""
+         try:
+             driver_info = self.kubectl.get_resource("csidriver", driver_name)
+
+             # Parse driver details for logging
+             driver_spec = driver_info.get("spec", {})
+             self.log.info(f"CSI driver '{driver_name}' is available")
+             self.log.info(
+                 f"Attach required: {driver_spec.get('attachRequired', 'unknown')}"
+             )
+             self.log.info(
+                 f"Pod info on mount: {driver_spec.get('podInfoOnMount', 'unknown')}"
+             )
+             return True
+
+         except ResourceNotFoundError:
+             self.log.error(f"CSI driver '{driver_name}' not found")
+             self.log.error("Available CSI drivers:")
+             self._list_available_csi_drivers()
+             return False
+
+         except Exception as e:  # noqa: BLE001
+             self.log.error(f"Failed to query CSI driver: {e}")
+             raise RuntimeError(
+                 f"kubectl error while verifying CSI driver '{driver_name}': {e}"
+             ) from e
+
+     def _verify_pvc(self, pvc_name: str) -> bool:
+         """Check if PVC exists and is bound in operator namespace."""
+         try:
+             pvc_data = self.kubectl.get_resource(
+                 "pvc", pvc_name, namespace=self.config.operator_namespace
+             )
+
+             status = pvc_data.get("status", {})
+             phase = status.get("phase")
+             capacity = status.get("capacity", {})
+             storage_class = pvc_data.get("spec", {}).get("storageClassName")
+
+             if phase == "Bound":
+                 self.log.info(f"PVC '{pvc_name}' is bound")
+                 self.log.info(f"Capacity: {capacity.get('storage', 'unknown')}")
+                 self.log.info(f"Storage class: {storage_class or 'default'}")
+                 return True
+             else:
+                 self.log.error(
+                     f"FAILED: PVC '{pvc_name}' is not bound (status: {phase})"
+                 )
+                 return False
+
+         except ResourceNotFoundError:
+             self.log.error(
+                 f"FAILED: PVC '{pvc_name}' not found in namespace '{self.config.operator_namespace}'"
+             )
+             self.log.error("Available PVCs in namespace:")
+             self._list_available_pvcs()
+             return False
+
+         except Exception as e:  # noqa: BLE001
+             self.log.error(f"FAILED: Failed to check PVC '{pvc_name}': {e}")
+             raise RuntimeError(
+                 f"kubectl error while verifying PVC '{pvc_name}': {e}"
+             ) from e
+
+     def _list_available_csi_drivers(self) -> None:
+         """List available CSI drivers for troubleshooting."""
+         try:
+             drivers = self.kubectl.list_resources("csidrivers")
+             if drivers:
+                 for driver in drivers:
+                     name = driver.get("metadata", {}).get("name", "unknown")
+                     self.log.error(f" - {name}")
+             else:
+                 self.log.error(" (no CSI drivers found in cluster)")
+         except Exception:  # noqa: BLE001
+             self.log.error(" (failed to list CSI drivers)")
+
+     def _list_available_pvcs(self) -> None:
+         """List available PVCs for troubleshooting."""
+         try:
+             pvcs = self.kubectl.list_resources(
+                 "pvcs", namespace=self.config.operator_namespace
+             )
+             if pvcs:
+                 for pvc in pvcs:
+                     name = pvc.get("metadata", {}).get("name", "unknown")
+                     self.log.error(f" - {name}")
+             else:
+                 self.log.error(
+                     f" (no PVCs found in namespace '{self.config.operator_namespace}')"
+                 )
+         except Exception:  # noqa: BLE001
+             self.log.error(" (failed to list PVCs)")
+
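
verify_file_storage only exercises whichever of the three storage fields are set on the FileStorage model. A sketch, assuming the generated OpenAPI model accepts its fields as keyword arguments and that ops, k8s_config, and cloud_deployment exist as in the surrounding code:

    # Hypothetical: a deployment whose shared storage is a pre-created PVC.
    fs = FileStorage(persistent_volume_claim="anyscale-shared-pvc")
    verifier = StorageVerifier(ops, k8s_config, BlockLogger())
    ok = verifier.verify_file_storage(fs, cloud_deployment)  # checks only the PVC path
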
+
+ # =============================================================================
+ # GATEWAY VERIFIER
+ # =============================================================================
+
+
+ class GatewayVerifier:
+     """Handles verification of gateway and ingress components for Kubernetes deployments."""
+
+     def __init__(
+         self,
+         kubectl_ops: KubectlOperations,
+         k8s_config: KubernetesConfig,
+         logger: BlockLogger,
+     ):
+         self.kubectl = kubectl_ops
+         self.config = k8s_config
+         self.log = logger
+
+     def verify_gateway_support(self, operator_data: OperatorData) -> bool:
+         """Verify gateway support using pre-fetched config data."""
+         if not operator_data.config.is_valid:
+             self.log.warning(
+                 "Could not retrieve operator configuration - skipping gateway verification"
+             )
+             return True
+
+         # Extract gateway configuration from operator data
+         gateway_config = GatewayConfig.from_operator_config(
+             operator_data.config.config_data
+         )
+
+         if not gateway_config.enabled:
+             self.log.info(
+                 "Gateway support is not enabled - skipping gateway verification"
+             )
+             return True
+
+         if not gateway_config.requires_verification:
+             self.log.error(
+                 "Gateway is enabled but no gateway name found in operator configuration"
+             )
+             return False
+
+         # Verify gateway exists in cluster
+         assert (
+             gateway_config.name is not None
+         )  # guaranteed by requires_verification check
+         return self._verify_gateway_exists(gateway_config.name)
+
+     def verify_nginx_ingress(self) -> bool:
+         """Check for NGINX ingress controller (warning only)."""
+         try:
+             self.log.info("Checking for NGINX ingress controller...")
+
+             # Try different NGINX ingress controller configurations
+             for config_dict in NGINX_INGRESS_CONFIGS:
+                 nginx_pod = self._find_nginx_pod(
+                     config_dict["namespace"], config_dict["label"]
+                 )
+                 if nginx_pod:
+                     if self.kubectl.is_pod_running(nginx_pod, config_dict["namespace"]):
+                         self.log.info(
+                             f"PASSED: Found running NGINX ingress controller: {nginx_pod} "
+                             f"(namespace: {config_dict['namespace']})"
+                         )
+                         return True
+                     else:
+                         pod_status = self.kubectl.get_pod_status(
+                             nginx_pod, config_dict["namespace"]
+                         )
+                         self.log.warning(
+                             f"WARNING: Found NGINX ingress controller '{nginx_pod}' "
+                             f"but it's not running (status: {pod_status})"
+                         )
+
+             # Try fallback search by name patterns
+             if self._find_nginx_by_name_pattern():
+                 return True
+
+             # No NGINX ingress controller found
+             self.log.warning("No NGINX ingress controller found")
+             self.log.warning("This may impact ingress routing capabilities")
+             self.log.warning("Available ingress controllers:")
+             self._list_available_ingress_controllers()
+             return False
+
+         except (KubectlError, ResourceNotFoundError) as e:
+             self.log.warning(f"WARNING: Could not verify NGINX ingress controller: {e}")
+             raise RuntimeError(
+                 f"kubectl error during NGINX ingress verification: {e}"
+             ) from e
+
+     def _verify_gateway_exists(self, gateway_name: str) -> bool:
+         """Verify that the specified gateway exists in the cluster."""
+         try:
+             # Try to find gateway in common Gateway API resource types
+             for resource_type in GATEWAY_RESOURCE_TYPES:
+                 if self._check_gateway_resource(resource_type, gateway_name):
+                     return True
+
+             # If not found in operator namespace, try cluster-wide search
+             self.log.info(
+                 f"Gateway '{gateway_name}' not found in operator namespace, "
+                 "searching cluster-wide..."
+             )
+             for resource_type in GATEWAY_RESOURCE_TYPES:
+                 if self._check_gateway_resource_cluster_wide(
+                     resource_type, gateway_name
+                 ):
+                     return True
+
+             self.log.error(f"FAILED: Gateway '{gateway_name}' not found in cluster")
+             self.log.error("Available gateways:")
+             self._list_available_gateways()
+             return False
+
+         except (KubectlError, ResourceNotFoundError) as e:
+             self.log.error(f"FAILED: Failed to verify gateway '{gateway_name}': {e}")
+             raise RuntimeError(
+                 f"kubectl error while verifying gateway '{gateway_name}': {e}"
+             ) from e
+
+     def _check_gateway_resource(self, resource_type: str, gateway_name: str) -> bool:
+         """Check for gateway resource in operator namespace."""
+         try:
+             gateway_data = self.kubectl.get_resource(
+                 resource_type, gateway_name, namespace=self.config.operator_namespace
+             )
+
+             self.log.info(
+                 f"PASSED: Gateway '{gateway_name}' found in namespace '{self.config.operator_namespace}'"
+             )
+
+             # Log gateway status if available
+             status = gateway_data.get("status", {})
+             conditions = status.get("conditions", [])
+             for condition in conditions:
+                 if (
+                     condition.get("type") == "Ready"
+                     and condition.get("status") == "True"
+                 ):
+                     self.log.info(" Status: Ready")
+                     break
+
+             return True
+
+         except ResourceNotFoundError:
+             return False
+
+     def _check_gateway_resource_cluster_wide(
+         self, resource_type: str, gateway_name: str
+     ) -> bool:
+         """Check for gateway resource cluster-wide."""
+         try:
+             gateways = self.kubectl.list_resources(resource_type, all_namespaces=True)
+
+             for gateway in gateways:
+                 if gateway.get("metadata", {}).get("name") == gateway_name:
+                     namespace = gateway.get("metadata", {}).get("namespace", "unknown")
+                     self.log.info(
+                         f"PASSED: Gateway '{gateway_name}' found in namespace '{namespace}'"
+                     )
+                     return True
+
+             return False
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for fallback case
+             return False
+
+     def _find_nginx_pod(self, namespace: str, label_selector: str) -> Optional[str]:
+         """Find NGINX ingress pod by label selector in specific namespace."""
+         try:
+             pods = self.kubectl.list_resources(
+                 "pods", namespace=namespace, label_selector=label_selector
+             )
+
+             if pods:
+                 return pods[0]["metadata"]["name"]
+             return None
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for fallback pod discovery
+             return None
+
+     def _find_nginx_by_name_pattern(self) -> bool:
+         """Find NGINX ingress controller by name pattern across all namespaces."""
+         try:
+             pods = self.kubectl.list_resources("pods", all_namespaces=True)
+
+             # Look for pods with names containing NGINX and ingress keywords
+             for pod in pods:
+                 metadata = pod.get("metadata", {})
+                 name = metadata.get("name", "").lower()
+                 namespace = metadata.get("namespace", "")
+                 status_phase = pod.get("status", {}).get("phase", "")
+
+                 if "nginx" in name and "ingress" in name:
+                     if status_phase == RUNNING_STATUS:
+                         self.log.info(
+                             f"PASSED: Found NGINX ingress controller by name pattern: "
+                             f"{metadata['name']} (namespace: {namespace})"
+                         )
+                         return True
+                     else:
+                         self.log.warning(
+                             f"WARNING: Found NGINX ingress controller '{metadata['name']}' "
+                             f"but it's not running (status: {status_phase})"
+                         )
+
+             return False
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for fallback case
+             return False
+
+     def _list_available_gateways(self) -> None:
+         """List available gateways for troubleshooting."""
+         try:
+             for resource_type in GATEWAY_RESOURCE_TYPES:
+                 gateways = self.kubectl.list_resources(
+                     resource_type, all_namespaces=True
+                 )
+
+                 if gateways:
+                     self.log.error(f"Available {resource_type}:")
+                     for gw in gateways:
+                         name = gw.get("metadata", {}).get("name", "unknown")
+                         self.log.error(f" - {name}")
+                     return
+
+             self.log.error(" (no gateways found in cluster)")
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for troubleshooting helper
+             self.log.error(" (failed to list gateways)")
+
+     def _list_available_ingress_controllers(self) -> None:
+         """List available ingress controllers for troubleshooting."""
+         try:
+             pods = self.kubectl.list_resources("pods", all_namespaces=True)
+
+             ingress_controllers = []
+             for pod in pods:
+                 metadata = pod.get("metadata", {})
+                 name = metadata.get("name", "").lower()
+                 namespace = metadata.get("namespace", "")
+
+                 # Look for common ingress controller name patterns
+                 if any(keyword in name for keyword in INGRESS_CONTROLLER_KEYWORDS):
+                     ingress_controllers.append(
+                         f"{metadata['name']} (namespace: {namespace})"
+                     )
+
+             if ingress_controllers:
+                 for controller in ingress_controllers:
+                     self.log.warning(f" - {controller}")
+             else:
+                 self.log.warning(" (no ingress controllers found)")
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for troubleshooting helper
+             self.log.warning(" (failed to list ingress controllers)")
+
+
+ # =============================================================================
+ # MAIN VERIFIER CLASS
+ # =============================================================================
+
+
+ class KubernetesCloudDeploymentVerifier:
+     """Verifies Kubernetes-based cloud deployments with comprehensive checks"""
+
+     def __init__(self, logger: BlockLogger, api_client):
+         self.log = logger
+         self.api_client = api_client
+         self.k8s_config: Optional[KubernetesConfig] = None
+         self.results = VerificationResults()
+
+     def verify(self, cloud_deployment: CloudDeployment) -> bool:
+         """
+         Main verification workflow for Kubernetes cloud deployments.
+
+         Performs comprehensive checks including operator health, identity verification,
+         file storage, networking, and gateway configuration.
+
+         Args:
+             cloud_deployment: The cloud deployment configuration
+         """
+         deployment_name = cloud_deployment.name or cloud_deployment.cloud_deployment_id
+         self.log.info(f"Starting Kubernetes verification for: {deployment_name}")
+
+         if cloud_deployment.file_storage is not None and isinstance(
+             cloud_deployment.file_storage, dict
+         ):
+             cloud_deployment.file_storage = FileStorage(**cloud_deployment.file_storage)
+
+         try:
+             return self._run_verification_steps(cloud_deployment)
+
+         except click.ClickException:
+             # Re-raise ClickExceptions as they contain user-friendly messages
+             raise
+         except requests.RequestException as e:
+             self.log.error(f"Network error during verification: {e}")
+             return False
+         except (subprocess.CalledProcessError, OSError) as e:
+             self.log.error(f"System error during verification: {e}")
+             return False
+         except (KeyError, ValueError, json.JSONDecodeError) as e:
+             self.log.error(f"Data parsing error during verification: {e}")
+             return False
+
+     def _passed_or_failed_str_from_bool(self, is_passing: bool) -> str:
+         """Return PASSED or FAILED string for verification results, matching VM verification format."""
+         return PASSED_STATUS if is_passing else FAILED_STATUS
+
+     @contextmanager
+     def _verification_step(self, step_name: str):
+         """Context manager for verification steps that indents detailed output."""
+         self.log.info(f"{step_name}...")
+         with self.log.indent():
+             yield
+
+     def _run_verification_steps(self, cloud_deployment: CloudDeployment) -> bool:
+         """Execute the verification steps in sequence."""
+         # Step 1: Configure kubectl
+         with self._verification_step("Configuring kubectl access"):
+             self._get_kubectl_config()
+
+         # k8s_config is guaranteed to be set by _get_kubectl_config()
+         assert self.k8s_config is not None
+
+         # Initialize utility classes
+         kubectl_ops = KubectlOperations(self.k8s_config.context, self.log)
+         operator_verifier = OperatorVerifier(kubectl_ops, self.k8s_config, self.log)
+         storage_verifier = StorageVerifier(kubectl_ops, self.k8s_config, self.log)
+         gateway_verifier = GatewayVerifier(kubectl_ops, self.k8s_config, self.log)
+
+         # Step 2: Find and verify operator pod
+         with self._verification_step("Finding operator pod"):
+             try:
+                 operator_pod = operator_verifier.find_operator_pod()
+                 self.results.operator_pod_installed = True
+             except OperatorPodNotFoundError as e:
+                 self.log.error(
+                     "Failed to find operator pod, please make sure the operator is running"
+                 )
+                 self.log.error(f"Error: {e}")
+                 return False
+
+         # Step 3: Port forward and fetch operator data (health + config)
+         with self._verification_step("Verifying operator status"):
+             try:
+                 operator_data = operator_verifier.get_operator_data(operator_pod)
+             except (OperatorConnectionError, PortForwardError) as e:
+                 self.log.error(
+                     "Failed to connect to operator, please make sure the operator is running version >= 0.7.0 and has status reporting enabled"
+                 )
+                 self.log.error(f"Error: {e}")
+                 return False
+
+             self.log.info("Verifying operator health...")
+             self.results.operator_health = operator_verifier.verify_operator_health(
+                 operator_data
+             )
+             self.log.info(
+                 f"Operator Health: {self._passed_or_failed_str_from_bool(self.results.operator_health)}"
+             )
+
+             self.log.info("Verifying operator identity...")
+             if cloud_deployment.kubernetes_config is None:
+                 self.log.error(
+                     "Kubernetes configuration is missing from cloud deployment"
+                 )
+                 self.results.operator_identity = False
+             else:
+                 self.results.operator_identity = operator_verifier.verify_operator_identity(
+                     operator_data,
+                     cloud_deployment.kubernetes_config,
+                     cloud_deployment.provider,
+                 )
+             self.log.info(
+                 f"Operator Identity: {self._passed_or_failed_str_from_bool(self.results.operator_identity)}"
+             )
+
+         # Step 4: Check file storage
+         with self._verification_step("Checking file storage"):
+             if cloud_deployment.file_storage is None:
+                 self.log.info(
+                     "INFO: No file storage configured - skipping file storage verification"
+                 )
+                 self.results.file_storage = True
+             else:
+                 self.results.file_storage = storage_verifier.verify_file_storage(
+                     cloud_deployment.file_storage, cloud_deployment
+                 )
+             self.log.info(
+                 f"File Storage: {self._passed_or_failed_str_from_bool(self.results.file_storage)}"
+             )
+
+         # Step 5: Verify gateway support
+         with self._verification_step("Verifying gateway support"):
+             self.results.gateway_support = gateway_verifier.verify_gateway_support(
+                 operator_data
+             )
+             self.log.info(
+                 f"Gateway Support: {self._passed_or_failed_str_from_bool(self.results.gateway_support)}"
+             )
+
+         # Step 6: Check NGINX ingress (warning only)
+         with self._verification_step("Checking NGINX ingress controller"):
+             self.results.nginx_ingress = gateway_verifier.verify_nginx_ingress()
+             self.log.info(
+                 f"NGINX Ingress: {self._passed_or_failed_str_from_bool(self.results.nginx_ingress)}"
+             )
+
+         self._show_verification_summary()
+
+         if self.results.overall_success:
+             self.log.info(
+                 "Kubernetes cloud deployment verification completed successfully"
+             )
+         else:
+             self.log.error("Kubernetes cloud deployment verification failed")
+
+         return self.results.overall_success
+
+     def _show_verification_summary(self):
+         """Show verification results summary in the same format as VM verification."""
+         verification_result_summary = ["Verification result:"]
+
+         for component, result in self.results.to_dict().items():
+             verification_result_summary.append(
+                 f"{component}: {self._passed_or_failed_str_from_bool(result)}"
+             )
+
+         self.log.info("\n".join(verification_result_summary))
+
+     def _get_kubectl_config(self):
+         """Get kubectl context and operator namespace from user"""
+         # Check if kubectl is available
+         temp_kubectl = KubectlOperations("", self.log)
+         if not temp_kubectl.check_kubectl_available():
+             raise click.ClickException(
+                 "kubectl command not found. Please install kubectl and ensure it's in your PATH."
+             )
+
+         # Get available contexts
+         contexts = temp_kubectl.get_available_contexts()
+         if not contexts:
+             raise click.ClickException(
+                 "No kubectl contexts found. Please configure kubectl to access your Kubernetes cluster."
+             )
+
+         # Prompt for context selection
+         if len(contexts) > 1:
+             self.log.info("Available kubectl contexts:")
+             for i, ctx in enumerate(contexts):
+                 current_marker = (
+                     " (current)" if ctx == temp_kubectl.get_current_context() else ""
+                 )
+                 self.log.info(f" {i+1}. {ctx}{current_marker}")
+
+             choice = click.prompt(
+                 "Select context number",
+                 type=click.IntRange(1, len(contexts)),
+                 default=1,
+             )
+             kubectl_context = contexts[choice - 1]
+         else:
+             kubectl_context = contexts[0]
+             self.log.info(f"Using kubectl context: {kubectl_context}")
+
+         # Prompt for operator namespace
+         operator_namespace = click.prompt(
+             "Enter the Anyscale operator namespace",
+             default=DEFAULT_OPERATOR_NAMESPACE,
+             type=str,
+             show_default=True,
+         )
+
+         self.k8s_config = KubernetesConfig(
+             context=kubectl_context, operator_namespace=operator_namespace
+         )
+
+         self.log.info(
+             f"Configured: context='{self.k8s_config.context}', "
+             f"namespace='{self.k8s_config.operator_namespace}'"
+         )
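
End-to-end, the verifier is driven by a single call. A minimal sketch (api_client and deployment come from the surrounding CLI context; the variable names are placeholders):

    verifier = KubernetesCloudDeploymentVerifier(BlockLogger(), api_client)
    succeeded = verifier.verify(deployment)  # deployment: a CloudDeployment
    # verify() prompts for a kubectl context and operator namespace, runs the six
    # checks, logs a PASSED/FAILED summary, and returns the overall result.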