anyscale 0.26.47__py3-none-any.whl → 0.26.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. anyscale/__init__.py +0 -7
  2. anyscale/_private/anyscale_client/anyscale_client.py +1 -208
  3. anyscale/_private/anyscale_client/common.py +0 -55
  4. anyscale/_private/anyscale_client/fake_anyscale_client.py +19 -46
  5. anyscale/_private/docgen/__main__.py +24 -45
  6. anyscale/_private/docgen/generator.py +32 -16
  7. anyscale/_private/docgen/generator_legacy.py +58 -6
  8. anyscale/_private/docgen/models.md +3 -2
  9. anyscale/_private/workload/workload_config.py +16 -8
  10. anyscale/_private/workload/workload_sdk.py +22 -5
  11. anyscale/client/README.md +4 -1
  12. anyscale/client/openapi_client/__init__.py +2 -1
  13. anyscale/client/openapi_client/api/default_api.py +253 -4
  14. anyscale/client/openapi_client/models/__init__.py +2 -1
  15. anyscale/client/openapi_client/models/{alert_type.py → alert_issue_type.py} +8 -20
  16. anyscale/client/openapi_client/models/baseimagesenum.py +1 -2
  17. anyscale/client/openapi_client/models/cloud.py +31 -3
  18. anyscale/client/openapi_client/models/cloud_deployment.py +30 -3
  19. anyscale/client/openapi_client/models/cloud_with_cloud_resource.py +29 -1
  20. anyscale/client/openapi_client/models/cloud_with_cloud_resource_gcp.py +29 -1
  21. anyscale/client/openapi_client/models/dataset_metrics.py +6 -6
  22. anyscale/client/openapi_client/models/dataset_state.py +2 -1
  23. anyscale/client/openapi_client/models/describe_system_workload_response.py +32 -6
  24. anyscale/client/openapi_client/models/experimental_workspace.py +29 -1
  25. anyscale/client/openapi_client/models/experimental_workspaces_sort_field.py +2 -1
  26. anyscale/client/openapi_client/models/operator_metrics.py +8 -9
  27. anyscale/client/openapi_client/models/operator_status.py +102 -0
  28. anyscale/client/openapi_client/models/organization_usage_alert.py +20 -20
  29. anyscale/client/openapi_client/models/supportedbaseimagesenum.py +1 -2
  30. anyscale/cloud/models.py +330 -0
  31. anyscale/commands/cloud_commands.py +132 -43
  32. anyscale/commands/command_examples.py +54 -134
  33. anyscale/commands/compute_config_commands.py +7 -11
  34. anyscale/compute_config/__init__.py +2 -16
  35. anyscale/compute_config/_private/compute_config_sdk.py +27 -17
  36. anyscale/compute_config/commands.py +14 -44
  37. anyscale/compute_config/models.py +49 -26
  38. anyscale/controllers/cloud_controller.py +289 -171
  39. anyscale/controllers/cloud_file_storage_utils.py +204 -0
  40. anyscale/controllers/kubernetes_verifier.py +1567 -0
  41. anyscale/job/_private/job_sdk.py +17 -8
  42. anyscale/job/models.py +1 -1
  43. anyscale/scripts.py +0 -2
  44. anyscale/sdk/anyscale_client/models/baseimagesenum.py +1 -2
  45. anyscale/sdk/anyscale_client/models/cloud.py +31 -3
  46. anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +1 -2
  47. anyscale/shared_anyscale_utils/utils/id_gen.py +1 -0
  48. anyscale/version.py +1 -1
  49. anyscale/workspace/models.py +14 -7
  50. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/METADATA +1 -1
  51. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/RECORD +56 -70
  52. anyscale/commands/llm/dataset_commands.py +0 -269
  53. anyscale/commands/llm/group.py +0 -15
  54. anyscale/commands/llm/models_commands.py +0 -123
  55. anyscale/controllers/llm/__init__.py +0 -0
  56. anyscale/controllers/llm/models_controller.py +0 -144
  57. anyscale/llm/__init__.py +0 -2
  58. anyscale/llm/dataset/__init__.py +0 -2
  59. anyscale/llm/dataset/_private/__init__.py +0 -0
  60. anyscale/llm/dataset/_private/docs.py +0 -63
  61. anyscale/llm/dataset/_private/models.py +0 -71
  62. anyscale/llm/dataset/_private/sdk.py +0 -147
  63. anyscale/llm/model/__init__.py +0 -2
  64. anyscale/llm/model/_private/models_sdk.py +0 -62
  65. anyscale/llm/model/commands.py +0 -93
  66. anyscale/llm/model/models.py +0 -171
  67. anyscale/llm/model/sdk.py +0 -62
  68. anyscale/llm/sdk.py +0 -27
  69. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/WHEEL +0 -0
  70. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/entry_points.txt +0 -0
  71. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/licenses/LICENSE +0 -0
  72. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/licenses/NOTICE +0 -0
  73. {anyscale-0.26.47.dist-info → anyscale-0.26.48.dist-info}/top_level.txt +0 -0
anyscale/controllers/kubernetes_verifier.py (new file)
@@ -0,0 +1,1567 @@
+ """
+ Kubernetes Cloud Deployment Verifier
+
+ Handles verification of Kubernetes-based cloud deployments including:
+ - Operator pod health and connectivity
+ - File storage (CSI drivers, PVCs, NFS)
+ - Network connectivity
+ - Gateway support
+ - Nginx ingress controller
+ """
+
+
+ from contextlib import contextmanager, suppress
+ from dataclasses import dataclass
+ import json
+ import os
+ import shutil
+ import signal
+ import socket
+ import subprocess
+ import time
+ from typing import Dict, List, Optional
+
+ import click
+ import requests
+
+ from anyscale.cli_logger import BlockLogger
+ from anyscale.client.openapi_client.models.cloud_deployment import CloudDeployment
+ from anyscale.client.openapi_client.models.cloud_providers import CloudProviders
+ from anyscale.client.openapi_client.models.file_storage import FileStorage
+ from anyscale.controllers.cloud_file_storage_utils import verify_file_storage_exists
+
+
34
+ # =============================================================================
+ # CONSTANTS
+ # =============================================================================
+
+ # Operator configuration
+ OPERATOR_HEALTH_PORT = 2113
+ OPERATOR_CONFIG_ENDPOINT = "/config"
+ OPERATOR_HEALTH_ENDPOINT = "/healthz/run"
+ DEFAULT_OPERATOR_NAMESPACE = "anyscale-operator"
+
+ # Network and timing configuration
+ PORT_FORWARD_WAIT_TIME = 3  # seconds to wait for port forward to establish
+ HTTP_REQUEST_TIMEOUT = 10  # seconds for HTTP requests to operator
+ PORT_FORWARD_TERMINATION_TIMEOUT = 5  # seconds to wait for graceful termination
+
+ OPERATOR_LABEL_SELECTOR = "app=anyscale-operator"
+
+ # Gateway resource types to check
+ GATEWAY_RESOURCE_TYPES = [
+     "gateway.gateway",  # Gateway API v1
+     "gateways.gateway.networking.k8s.io",  # Full API path
+     "gateway",  # Short name
+     "gw",  # Common alias
+ ]
+
+ # NGINX ingress controller configurations
+ NGINX_INGRESS_CONFIGS = [
+     {"namespace": "ingress-nginx", "label": "app.kubernetes.io/name=ingress-nginx"},
+     {"namespace": "nginx-ingress", "label": "app=nginx-ingress"},
+     {"namespace": "kube-system", "label": "app.kubernetes.io/name=ingress-nginx"},
+     {"namespace": "default", "label": "app=nginx-ingress"},
+ ]
+
+ # Ingress controller name patterns for fallback search
+ INGRESS_CONTROLLER_KEYWORDS = [
+     "ingress",
+     "haproxy",
+     "traefik",
+     "contour",
+     "ambassador",
+     "istio-gateway",
+     "nginx",
+ ]
+
+ # kubectl binary search paths
+ KUBECTL_COMMON_PATHS = [
+     "/usr/local/bin/kubectl",
+     "/usr/bin/kubectl",
+     "/bin/kubectl",
+     "/opt/homebrew/bin/kubectl",  # macOS homebrew
+     "~/.local/bin/kubectl",  # User local install
+ ]
+
+ # Status and result strings
+ PASSED_STATUS = "PASSED"
+ FAILED_STATUS = "FAILED"
+ RUNNING_STATUS = "Running"
+
+ # Verification component names (for consistent reporting)
+ class VerificationComponents:
+     OPERATOR_POD_INSTALLED = "Operator Pod Installed"
+     OPERATOR_HEALTH = "Operator Health"
+     OPERATOR_IDENTITY = "Operator Identity"
+     FILE_STORAGE = "File Storage"
+     GATEWAY_SUPPORT = "Gateway Support"
+     NGINX_INGRESS = "NGINX Ingress"
+
+
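For illustration, each NGINX_INGRESS_CONFIGS entry above corresponds to a namespaced, label-filtered pod lookup; a minimal sketch (not part of the package diff) of the equivalent kubectl invocations:

    for cfg in NGINX_INGRESS_CONFIGS:
        # e.g. kubectl get pods -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx -o json
        print(f"kubectl get pods -n {cfg['namespace']} -l {cfg['label']} -o json")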
102
+ # =============================================================================
+ # EXCEPTIONS
+ # =============================================================================
+
+
+ class KubernetesVerificationError(Exception):
+     """Base exception for all Kubernetes verification errors."""
+
+
+ class KubectlError(KubernetesVerificationError):
+     """Raised when kubectl commands fail."""
+
+     def __init__(
+         self, message: str, command: Optional[str] = None, stderr: Optional[str] = None
+     ):
+         super().__init__(message)
+         self.command = command
+         self.stderr = stderr
+
+
+ class KubectlNotFoundError(KubernetesVerificationError):
+     """Raised when kubectl binary cannot be found."""
+
+
+ class OperatorPodNotFoundError(KubernetesVerificationError):
+     """Raised when the Anyscale operator pod cannot be found."""
+
+
+ class OperatorConnectionError(KubernetesVerificationError):
+     """Raised when connection to the operator fails."""
+
+     def __init__(
+         self,
+         message: str,
+         pod_name: Optional[str] = None,
+         endpoint: Optional[str] = None,
+     ):
+         super().__init__(message)
+         self.pod_name = pod_name
+         self.endpoint = endpoint
+
+
+ class PortForwardError(KubernetesVerificationError):
+     """Raised when port forwarding to a pod fails."""
+
+     def __init__(
+         self, message: str, pod_name: Optional[str] = None, port: Optional[int] = None
+     ):
+         super().__init__(message)
+         self.pod_name = pod_name
+         self.port = port
+
+
+ class IdentityVerificationError(KubernetesVerificationError):
+     """Raised when operator identity verification fails."""
+
+     def __init__(
+         self,
+         message: str,
+         expected_identity: Optional[str] = None,
+         actual_identity: Optional[str] = None,
+     ):
+         super().__init__(message)
+         self.expected_identity = expected_identity
+         self.actual_identity = actual_identity
+
+
+ class FileStorageVerificationError(KubernetesVerificationError):
+     """Raised when file storage verification fails."""
+
+
+ class GatewayVerificationError(KubernetesVerificationError):
+     """Raised when gateway verification fails."""
+
+     def __init__(self, message: str, gateway_name: Optional[str] = None):
+         super().__init__(message)
+         self.gateway_name = gateway_name
+
+
+ class ResourceNotFoundError(KubernetesVerificationError):
+     """Raised when a required Kubernetes resource is not found."""
+
+     def __init__(
+         self,
+         message: str,
+         resource_type: Optional[str] = None,
+         resource_name: Optional[str] = None,
+         namespace: Optional[str] = None,
+     ):
+         super().__init__(message)
+         self.resource_type = resource_type
+         self.resource_name = resource_name
+         self.namespace = namespace
+
+
197
+ # =============================================================================
+ # DATA MODELS
+ # =============================================================================
+
+
+ @dataclass
+ class VerificationResults:
+     """Tracks the results of all verification steps."""
+
+     operator_pod_installed: bool = False
+     operator_health: bool = False
+     operator_identity: bool = False
+     file_storage: bool = False
+     gateway_support: bool = False
+     nginx_ingress: bool = False
+
+     def to_dict(self) -> Dict[str, bool]:
+         """Convert to dictionary format matching original implementation."""
+         return {
+             VerificationComponents.OPERATOR_POD_INSTALLED: self.operator_pod_installed,
+             VerificationComponents.OPERATOR_HEALTH: self.operator_health,
+             VerificationComponents.OPERATOR_IDENTITY: self.operator_identity,
+             VerificationComponents.FILE_STORAGE: self.file_storage,
+             VerificationComponents.GATEWAY_SUPPORT: self.gateway_support,
+             VerificationComponents.NGINX_INGRESS: self.nginx_ingress,
+         }
+
+     @property
+     def overall_success(self) -> bool:
+         """Return True if all verification steps passed."""
+         return all(
+             [
+                 self.operator_pod_installed,
+                 self.operator_health,
+                 self.operator_identity,
+                 self.file_storage,
+                 self.gateway_support,
+                 self.nginx_ingress,
+             ]
+         )
+
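For illustration, a minimal sketch (not part of the package diff) of how VerificationResults aggregates step outcomes:

    results = VerificationResults(operator_pod_installed=True, operator_health=True)
    results.operator_identity = True
    # Remaining fields default to False, so the overall result is not yet a pass
    print(results.overall_success)  # False
    print(results.to_dict()["Operator Health"])  # True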
238
+
+ @dataclass
+ class KubernetesConfig:
+     """Configuration for Kubernetes cluster access."""
+
+     context: str
+     operator_namespace: str
+
+     def __post_init__(self):
+         """Validate configuration after initialization."""
+         if not self.context:
+             raise ValueError("Kubernetes context cannot be empty")
+         if not self.operator_namespace:
+             raise ValueError("Operator namespace cannot be empty")
+
+
+ @dataclass
+ class OperatorHealthData:
+     """Data retrieved from operator health endpoint."""
+
+     status_code: int
+     response_text: Optional[str] = None
+
+     @property
+     def is_healthy(self) -> bool:
+         """Return True if operator is healthy."""
+         return self.status_code == 200
+
+
+ @dataclass
+ class OperatorConfigData:
+     """Data retrieved from operator config endpoint."""
+
+     status_code: int
+     response_text: str
+     config_data: Optional[Dict] = None
+     config_error: Optional[str] = None
+
+     @property
+     def is_valid(self) -> bool:
+         """Return True if config data is valid."""
+         return self.status_code == 200 and self.config_data is not None
+
+
+ @dataclass
+ class OperatorData:
+     """Combined data from operator health and config endpoints."""
+
+     health: OperatorHealthData
+     config: OperatorConfigData
+
+     @classmethod
+     def from_dict(cls, data: Dict) -> "OperatorData":
+         """Create OperatorData from dictionary format used in original code."""
+         health = OperatorHealthData(
+             status_code=data["health_status"], response_text=data.get("health_response")
+         )
+
+         config = OperatorConfigData(
+             status_code=data["config_status"],
+             response_text=data["config_response"],
+             config_data=data.get("config_data"),
+             config_error=data.get("config_error"),
+         )
+
+         return cls(health=health, config=config)
+
+
+ @dataclass
+ class GatewayConfig:
+     """Gateway configuration from operator."""
+
+     enabled: bool = False
+     name: Optional[str] = None
+
+     @classmethod
+     def from_operator_config(cls, config_data: Optional[Dict]) -> "GatewayConfig":
+         """Extract gateway config from operator configuration."""
+         if not config_data:
+             return cls()
+
+         gateway_config = config_data.get("gateway", {})
+         if not gateway_config:
+             return cls()
+
+         return cls(
+             enabled=gateway_config.get("enable", False), name=gateway_config.get("name")
+         )
+
+     @property
+     def requires_verification(self) -> bool:
+         """Return True if gateway verification is required."""
+         return self.enabled and self.name is not None
+
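For illustration, a minimal sketch (not part of the package diff) of GatewayConfig parsing an operator /config payload; the sample payload and gateway name are made up:

    sample = {"gateway": {"enable": True, "name": "anyscale-gateway"}}  # hypothetical payload
    gw = GatewayConfig.from_operator_config(sample)
    assert gw.enabled and gw.requires_verification
    # A missing or empty payload yields a disabled config
    assert GatewayConfig.from_operator_config(None).requires_verification is False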
332
+
+ # =============================================================================
+ # KUBECTL OPERATIONS
+ # =============================================================================
+
+
+ class KubectlOperations:
+     """Utility class for executing kubectl commands with consistent error handling."""
+
+     def __init__(self, context: str, logger: BlockLogger):
+         self.context = context
+         self.log = logger
+         self._kubectl_path: Optional[str] = None
+
+     def get_resource(
+         self, resource_type: str, name: str, namespace: Optional[str] = None
+     ) -> Dict:
+         """Get a single Kubernetes resource by name."""
+         cmd_args = ["get", resource_type, name, "--context", self.context, "-o", "json"]
+         if namespace:
+             cmd_args.extend(["-n", namespace])
+
+         try:
+             result = self._run_kubectl_command(cmd_args)
+             return json.loads(result.stdout)
+         except subprocess.CalledProcessError as e:
+             if "not found" in e.stderr.lower():
+                 raise ResourceNotFoundError(
+                     f"{resource_type} '{name}' not found",
+                     resource_type=resource_type,
+                     resource_name=name,
+                     namespace=namespace,
+                 )
+             raise KubectlError(
+                 f"Failed to get {resource_type} '{name}': {e.stderr}",
+                 command=" ".join(cmd_args),
+                 stderr=e.stderr,
+             )
+         except json.JSONDecodeError as e:
+             raise KubectlError(
+                 f"Invalid JSON response from kubectl: {e}", command=" ".join(cmd_args)
+             )
+
+     def list_resources(
+         self,
+         resource_type: str,
+         namespace: Optional[str] = None,
+         label_selector: Optional[str] = None,
+         all_namespaces: bool = False,
+     ) -> List[Dict]:
+         """List Kubernetes resources with optional filtering."""
+         cmd_args = ["get", resource_type, "--context", self.context, "-o", "json"]
+
+         if all_namespaces:
+             cmd_args.append("--all-namespaces")
+         elif namespace:
+             cmd_args.extend(["-n", namespace])
+
+         if label_selector:
+             cmd_args.extend(["-l", label_selector])
+
+         try:
+             result = self._run_kubectl_command(cmd_args)
+             data = json.loads(result.stdout)
+             return data.get("items", [])
+         except subprocess.CalledProcessError as e:
+             raise KubectlError(
+                 f"Failed to list {resource_type}: {e.stderr}",
+                 command=" ".join(cmd_args),
+                 stderr=e.stderr,
+             )
+         except json.JSONDecodeError as e:
+             raise KubectlError(
+                 f"Invalid JSON response from kubectl: {e}", command=" ".join(cmd_args)
+             )
+
+     def get_resource_field(
+         self,
+         resource_type: str,
+         name: str,
+         jsonpath: str,
+         namespace: Optional[str] = None,
+     ) -> str:
+         """Get a specific field from a Kubernetes resource using jsonpath."""
+         cmd_args = [
+             "get",
+             resource_type,
+             name,
+             "--context",
+             self.context,
+             "-o",
+             f"jsonpath={jsonpath}",
+         ]
+         if namespace:
+             cmd_args.extend(["-n", namespace])
+
+         try:
+             result = self._run_kubectl_command(cmd_args)
+             return result.stdout.strip()
+         except subprocess.CalledProcessError as e:
+             if "not found" in e.stderr.lower():
+                 raise ResourceNotFoundError(
+                     f"{resource_type} '{name}' not found",
+                     resource_type=resource_type,
+                     resource_name=name,
+                     namespace=namespace,
+                 )
+             raise KubectlError(
+                 f"Failed to get field from {resource_type} '{name}': {e.stderr}",
+                 command=" ".join(cmd_args),
+                 stderr=e.stderr,
+             )
+
+     def get_available_contexts(self) -> List[str]:
+         """Get list of available kubectl contexts."""
+         try:
+             result = self._run_kubectl_command(["config", "get-contexts", "-o", "name"])
+             contexts = [
+                 ctx.strip() for ctx in result.stdout.strip().split("\n") if ctx.strip()
+             ]
+             return contexts
+         except subprocess.CalledProcessError as e:
+             raise KubectlError(
+                 f"Failed to get kubectl contexts: {e.stderr}",
+                 command="kubectl config get-contexts -o name",
+                 stderr=e.stderr,
+             )
+
+     def get_current_context(self) -> Optional[str]:
+         """Get the current kubectl context."""
+         try:
+             result = self._run_kubectl_command(["config", "current-context"])
+             return result.stdout.strip()
+         except subprocess.CalledProcessError as e:
+             if "current-context is not set" in e.stderr.lower():
+                 return None
+             raise KubectlError(
+                 f"Failed to get current context: {e.stderr}",
+                 command="kubectl config current-context",
+                 stderr=e.stderr,
+             )
+
+     def start_port_forward(
+         self, pod_name: str, local_port: int, remote_port: int, namespace: str
+     ) -> subprocess.Popen:
+         """Start port forwarding to a pod."""
+         cmd_args = [
+             "port-forward",
+             "--context",
+             self.context,
+             "-n",
+             namespace,
+             pod_name,
+             f"{local_port}:{remote_port}",
+         ]
+
+         try:
+             cmd = self._get_kubectl_cmd(cmd_args)
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 preexec_fn=os.setsid,  # Create new process group for cleanup
+             )
+             return process
+         except (subprocess.CalledProcessError, OSError) as e:
+             raise KubectlError(
+                 f"Failed to start port forward to {pod_name}: {e}",
+                 command=" ".join(cmd_args),
+             )
+
+     def check_kubectl_available(self) -> bool:
+         """Check if kubectl command is available."""
+         try:
+             self._run_kubectl_command(["version", "--client"])
+             return True
+         except (subprocess.CalledProcessError, FileNotFoundError, KubectlNotFoundError):
+             return False
+
+     def get_pod_status(self, pod_name: str, namespace: str) -> str:
+         """
+         Get pod status phase in specific namespace.
+
+         Args:
+             pod_name: Name of the pod
+             namespace: Namespace containing the pod
+
+         Returns:
+             Pod status phase (e.g., "Running", "Pending") or "unknown" if cannot be determined
+         """
+         try:
+             return self.get_resource_field(
+                 "pod", pod_name, "{.status.phase}", namespace=namespace
+             )
+         except (KubectlError, ResourceNotFoundError):
+             # Return "unknown" if status cannot be determined
+             return "unknown"
+
+     def is_pod_running(self, pod_name: str, namespace: str) -> bool:
+         """
+         Check if pod is in running state.
+
+         Args:
+             pod_name: Name of the pod
+             namespace: Namespace containing the pod
+
+         Returns:
+             True if pod is running, False otherwise
+         """
+         try:
+             status = self.get_resource_field(
+                 "pod", pod_name, "{.status.phase}", namespace=namespace
+             )
+             return status == RUNNING_STATUS
+         except (KubectlError, ResourceNotFoundError):
+             # Return False if status check fails
+             return False
+
+     def _run_kubectl_command(self, args: List[str]) -> subprocess.CompletedProcess:
+         """Execute a kubectl command with the given arguments."""
+         cmd = self._get_kubectl_cmd(args)
+         return subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+     def _get_kubectl_cmd(self, args: List[str]) -> List[str]:
+         """Get kubectl command with proper binary path."""
+         kubectl_path = self._find_kubectl_binary()
+         if not kubectl_path:
+             raise KubectlNotFoundError(
+                 "kubectl command not found. Please install kubectl and ensure it's in your PATH."
+             )
+         return [kubectl_path] + args
+
+     def _find_kubectl_binary(self) -> Optional[str]:
+         """Find kubectl binary in common locations."""
+         if self._kubectl_path:
+             return self._kubectl_path
+
+         # Try to find kubectl using shutil.which first (respects PATH)
+         kubectl_path = shutil.which("kubectl")
+         if kubectl_path:
+             self._kubectl_path = kubectl_path
+             return kubectl_path
+
+         # Try common installation locations
+         for path in KUBECTL_COMMON_PATHS:
+             expanded_path = os.path.expanduser(path)
+             if os.path.isfile(expanded_path) and os.access(expanded_path, os.X_OK):
+                 self._kubectl_path = expanded_path
+                 return expanded_path
+
+         return None
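For illustration, a minimal sketch (not part of the package diff) of driving KubectlOperations directly; the context name is made up, and BlockLogger is assumed to be constructible with no arguments:

    log = BlockLogger()  # assumption: default construction works
    ops = KubectlOperations(context="my-cluster", logger=log)  # hypothetical context
    if ops.check_kubectl_available():
        pods = ops.list_resources(
            "pods", namespace="anyscale-operator", label_selector=OPERATOR_LABEL_SELECTOR
        )
        print([p["metadata"]["name"] for p in pods])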
583
+
+
+ # =============================================================================
+ # OPERATOR VERIFIER
+ # =============================================================================
+
+
+ class OperatorVerifier:
+     """Handles verification of Anyscale operator pod, health, and identity."""
+
+     def __init__(
+         self,
+         kubectl_ops: KubectlOperations,
+         k8s_config: KubernetesConfig,
+         logger: BlockLogger,
+     ):
+         self.kubectl = kubectl_ops
+         self.config = k8s_config
+         self.log = logger
+
+     def find_operator_pod(self) -> str:
+         """Find and verify operator pod is running."""
+         try:
+             pods = self.kubectl.list_resources(
+                 "pods",
+                 namespace=self.config.operator_namespace,
+                 label_selector=OPERATOR_LABEL_SELECTOR,
+             )
+         except KubectlError as e:
+             raise OperatorPodNotFoundError(f"Failed to list operator pods: {e}")
+
+         if not pods:
+             raise OperatorPodNotFoundError(
+                 "No Anyscale operator pods found. Expected pods with labels like "
+                 "'app=anyscale-operator'"
+             )
+
+         operator_pod = pods[0]["metadata"]["name"]
+
+         if not self.kubectl.is_pod_running(
+             operator_pod, self.config.operator_namespace
+         ):
+             raise OperatorPodNotFoundError(
+                 f"Operator pod '{operator_pod}' is not running"
+             )
+
+         return operator_pod
+
+     def get_operator_data(self, pod_name: str) -> OperatorData:
+         """Port forward to operator and fetch both health and config data."""
+         try:
+             with self._port_forward_to_operator(pod_name) as local_port:
+                 # Fetch health data
+                 health_data = self._fetch_health_data(local_port)
+
+                 # Fetch config data
+                 config_data = self._fetch_config_data(local_port)
+
+                 return OperatorData(health=health_data, config=config_data)
+
+         except requests.RequestException as e:
+             raise OperatorConnectionError(
+                 f"Cannot connect to operator endpoints: {e}", pod_name=pod_name
+             )
+         except RuntimeError as e:
+             raise PortForwardError(
+                 f"Port forwarding failed: {e}",
+                 pod_name=pod_name,
+                 port=OPERATOR_HEALTH_PORT,
+             )
+
+     def verify_operator_health(self, operator_data: OperatorData) -> bool:
+         """Verify operator health using pre-fetched data."""
+         if operator_data.health.is_healthy:
+             return True
+         else:
+             self.log.error(
+                 f"Health check failed - HTTP {operator_data.health.status_code}"
+             )
+             if operator_data.health.response_text:
+                 self.log.error(f"Response: {operator_data.health.response_text}")
+             return False
+
+     def verify_operator_identity(
+         self,
+         operator_data: OperatorData,
+         kubernetes_config: Dict,
+         cloud_provider: Optional[CloudProviders],
+     ) -> bool:
+         """Verify operator identity using pre-fetched config data."""
+         # Validate kubernetes_config contents
+         expected_identity = kubernetes_config.get("anyscale_operator_iam_identity")
+         if not expected_identity:
+             self.log.error(
+                 "Missing 'anyscale_operator_iam_identity' in kubernetes config"
+             )
+             return False
+
+         # Validate config response
+         if not operator_data.config.is_valid:
+             self.log.error(
+                 f"Config endpoint returned HTTP {operator_data.config.status_code}"
+             )
+             if operator_data.config.response_text:
+                 self.log.error(f"Response: {operator_data.config.response_text}")
+             return False
+
+         # Extract actual identity from config
+         if operator_data.config.config_data is None:
+             self.log.error("Operator config data is None")
+             return False
+
+         actual_identity = operator_data.config.config_data.get("iamIdentity")
+         if not actual_identity:
+             self.log.error("Operator config missing 'iamIdentity' field")
+             return False
+
+         # Perform identity comparison
+         if self._evaluate_identity_match(
+             expected_identity, actual_identity, cloud_provider
+         ):
+             self.log.info(
+                 f"Operator identity match (expected: {expected_identity})"
+             )
+             return True
+         else:
+             self.log.error("Operator identity mismatch")
+             self.log.error(f"Expected: {expected_identity}")
+             self.log.error(f"Actual: {actual_identity}")
+             return False
+
+     @contextmanager
+     def _port_forward_to_operator(self, pod_name: str):
+         """Context manager that port forwards to operator pod."""
+         port_forward_process = None
+         local_port = None
+         try:
+             # Get a free port for port forwarding
+             local_port = self._get_free_port()
+             self.log.info(f"Using local port {local_port} for port forwarding")
+
+             # Start port forwarding to the pod
+             self.log.info(
+                 f"Starting port forward to pod {pod_name} on port {local_port}:{OPERATOR_HEALTH_PORT}..."
+             )
+
+             port_forward_process = self.kubectl.start_port_forward(
+                 pod_name,
+                 local_port,
+                 OPERATOR_HEALTH_PORT,
+                 self.config.operator_namespace,
+             )
+
+             # Wait for port forward to establish
+             self.log.info("Waiting for port forward to establish...")
+             time.sleep(PORT_FORWARD_WAIT_TIME)
+
+             # Check if port forward process is still running
+             if port_forward_process.poll() is not None:
+                 stderr = (
+                     port_forward_process.stderr.read().decode()
+                     if port_forward_process.stderr
+                     else ""
+                 )
+                 raise RuntimeError(f"Port forward failed to start: {stderr}")
+
+             # Yield the local port to the calling function
+             yield local_port
+
+         finally:
+             # Clean up port forward process
+             if port_forward_process and port_forward_process.poll() is None:
+                 try:
+                     # Kill the entire process group to ensure cleanup
+                     os.killpg(os.getpgid(port_forward_process.pid), signal.SIGTERM)
+                     port_forward_process.wait(timeout=PORT_FORWARD_TERMINATION_TIMEOUT)
+                 except (ProcessLookupError, subprocess.TimeoutExpired):
+                     # Force kill if graceful termination fails
+                     with suppress(ProcessLookupError):
+                         os.killpg(os.getpgid(port_forward_process.pid), signal.SIGKILL)
+                 except (OSError, ValueError) as e:
+                     self.log.warning(f"Port forward cleanup warning: {e}")
+
+     def _get_free_port(self) -> int:
+         """Get a random free port on localhost."""
+         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+             s.bind(("", 0))
+             s.listen(1)
+             port = s.getsockname()[1]
+             return port
+
+     def _fetch_health_data(self, local_port: int) -> OperatorHealthData:
+         """Fetch health data from operator."""
+         response = requests.get(
+             f"http://localhost:{local_port}{OPERATOR_HEALTH_ENDPOINT}",
+             timeout=HTTP_REQUEST_TIMEOUT,
+         )
+
+         return OperatorHealthData(
+             status_code=response.status_code,
+             response_text=response.text if response.status_code != 200 else None,
+         )
+
+     def _fetch_config_data(self, local_port: int) -> OperatorConfigData:
+         """Fetch config data from operator."""
+         response = requests.get(
+             f"http://localhost:{local_port}{OPERATOR_CONFIG_ENDPOINT}",
+             timeout=HTTP_REQUEST_TIMEOUT,
+         )
+
+         config_data = None
+         config_error = None
+
+         if response.status_code == 200:
+             try:
+                 config_data = response.json()
+             except json.JSONDecodeError as e:
+                 config_error = str(e)
+
+         return OperatorConfigData(
+             status_code=response.status_code,
+             response_text=response.text,
+             config_data=config_data,
+             config_error=config_error,
+         )
+
+     def _evaluate_identity_match(
+         self,
+         expected_identity: str,
+         actual_identity: str,
+         cloud_provider: Optional[CloudProviders],
+     ) -> bool:
+         """Evaluate if the operator identity matches expected identity based on cloud provider."""
+         if not expected_identity or not actual_identity:
+             return False
+
+         # Convert to string for comparison, default to AWS
+         cloud_provider_str = str(cloud_provider) if cloud_provider else "AWS"
+
+         # Handle cloud provider specific identity comparison
+         if cloud_provider_str == "AWS":
+             return self._evaluate_aws_identity(expected_identity, actual_identity)
+         elif cloud_provider_str == "GCP":
+             return self._evaluate_gcp_identity(expected_identity, actual_identity)
+         elif cloud_provider_str == "AZURE":
+             return self._evaluate_azure_identity(expected_identity, actual_identity)
+         else:
+             # For unknown providers, fall back to exact string comparison
+             self.log.warning(
+                 f"Unknown cloud provider '{cloud_provider}', using exact string comparison"
+             )
+             return expected_identity == actual_identity
+
+     def _evaluate_aws_identity(
+         self, expected_identity: str, actual_identity: str
+     ) -> bool:
+         """Evaluate AWS IAM identity comparison."""
+         try:
+             # If they're exactly equal, that's fine
+             if expected_identity == actual_identity:
+                 return True
+
+             # Check if actual is an assumed role version of expected role
+             if self._is_aws_assumed_role(actual_identity):
+                 # Extract the role name from both ARNs
+                 expected_role = self._extract_aws_role_name(expected_identity)
+                 actual_role = self._extract_aws_role_name_from_assumed_role(
+                     actual_identity
+                 )
+
+                 if expected_role and actual_role and expected_role == actual_role:
+                     # Also check account ID matches
+                     expected_account = self._extract_aws_account_id(expected_identity)
+                     actual_account = self._extract_aws_account_id(actual_identity)
+
+                     if expected_account == actual_account:
+                         self.log.info(
+                             f"AWS identity match: Role '{expected_role}' (account: {expected_account})"
+                         )
+                         return True
+
+             return False
+
+         except (ValueError, IndexError, AttributeError) as e:
+             self.log.error(f"Error evaluating AWS identity: {e}")
+             return False
+
+     def _evaluate_gcp_identity(
+         self, expected_identity: str, actual_identity: str
+     ) -> bool:
+         """Evaluate GCP identity comparison."""
+         return expected_identity == actual_identity
+
+     def _evaluate_azure_identity(
+         self, expected_identity: str, actual_identity: str
+     ) -> bool:
+         """Evaluate Azure identity comparison."""
+         return expected_identity == actual_identity
+
+     def _is_aws_assumed_role(self, arn: str) -> bool:
+         """Check if ARN is an assumed role ARN."""
+         return arn.startswith("arn:aws:sts:") and ":assumed-role/" in arn
+
+     def _extract_aws_role_name(self, role_arn: str) -> Optional[str]:
+         """Extract role name from IAM role ARN."""
+         try:
+             if ":role/" in role_arn:
+                 return role_arn.split(":role/")[-1]
+             return None
+         except (ValueError, IndexError):
+             return None
+
+     def _extract_aws_role_name_from_assumed_role(
+         self, assumed_role_arn: str
+     ) -> Optional[str]:
+         """Extract role name from assumed role ARN."""
+         try:
+             if ":assumed-role/" in assumed_role_arn:
+                 parts = assumed_role_arn.split(":assumed-role/")[-1].split("/")
+                 if len(parts) >= 1:
+                     return parts[0]  # Role name is first part after assumed-role/
+             return None
+         except (ValueError, IndexError):
+             return None
+
+     def _extract_aws_account_id(self, arn: str) -> Optional[str]:
+         """Extract AWS account ID from any ARN."""
+         try:
+             # ARN format: arn:partition:service:region:account-id:resource
+             parts = arn.split(":")
+             if len(parts) >= 5:
+                 return parts[4]
+             return None
+         except (ValueError, IndexError):
+             return None
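For illustration, the AWS rule accepts an STS assumed-role ARN whose role name and account ID match the expected IAM role ARN; a minimal sketch (not part of the package diff) with made-up ARNs, mirroring the _is_aws_assumed_role / _extract_aws_* helpers:

    expected = "arn:aws:iam::123456789012:role/anyscale-operator"
    actual = "arn:aws:sts::123456789012:assumed-role/anyscale-operator/pod-session"
    assert actual.startswith("arn:aws:sts:") and ":assumed-role/" in actual
    # Role names match: "anyscale-operator" == "anyscale-operator"
    assert expected.split(":role/")[-1] == actual.split(":assumed-role/")[-1].split("/")[0]
    # Account IDs match (field 4 of the colon-separated ARN)
    assert expected.split(":")[4] == actual.split(":")[4] == "123456789012"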
919
+
+
+ # =============================================================================
+ # STORAGE VERIFIER
+ # =============================================================================
+
+
+ class StorageVerifier:
+     """Handles verification of file storage components for Kubernetes deployments."""
+
+     def __init__(
+         self,
+         kubectl_ops: KubectlOperations,
+         k8s_config: KubernetesConfig,
+         logger: BlockLogger,
+     ):
+         self.kubectl = kubectl_ops
+         self.config = k8s_config
+         self.log = logger
+
+     def verify_file_storage(
+         self, file_storage: FileStorage, cloud_deployment: CloudDeployment
+     ) -> bool:
+         """Verify file storage configuration (non-functional checks only)."""
+         self.log.info("Verifying file storage configuration...")
+         verification_results = []
+
+         if getattr(file_storage, "csi_ephemeral_volume_driver", None):
+             driver_name = file_storage.csi_ephemeral_volume_driver
+             if driver_name:
+                 self.log.info(f"Checking CSI driver: {driver_name}")
+                 result = self._verify_csi_driver(driver_name)
+                 verification_results.append(("CSI driver", result))
+
+         if getattr(file_storage, "persistent_volume_claim", None):
+             pvc_name = file_storage.persistent_volume_claim
+             if pvc_name:
+                 self.log.info(f"Checking PVC: {pvc_name}")
+                 result = self._verify_pvc(pvc_name)
+                 verification_results.append(("PVC", result))
+
+         if getattr(file_storage, "file_storage_id", None):
+             self.log.info("Checking NFS file storage exists via cloud provider APIs...")
+             try:
+                 nfs_exists = verify_file_storage_exists(
+                     file_storage, cloud_deployment, logger=self.log
+                 )
+                 verification_results.append(("NFS", nfs_exists))
+             except (ValueError, KeyError, TypeError, ImportError) as e:
+                 self.log.error(
+                     f"Cloud provider API error while verifying file storage: {e}"
+                 )
+                 raise RuntimeError(
+                     f"Cloud provider API error while verifying file storage: {e}"
+                 ) from e
+
+         # Return overall success
+         if verification_results:
+             return all(result for _, result in verification_results)
+         else:
+             self.log.info("INFO: No file storage components found to verify")
+             return True
+
+     def _verify_csi_driver(self, driver_name: str) -> bool:
+         """Check if CSI driver exists on cluster."""
+         try:
+             driver_info = self.kubectl.get_resource("csidriver", driver_name)
+
+             # Parse driver details for logging
+             driver_spec = driver_info.get("spec", {})
+             self.log.info(f"CSI driver '{driver_name}' is available")
+             self.log.info(
+                 f"Attach required: {driver_spec.get('attachRequired', 'unknown')}"
+             )
+             self.log.info(
+                 f"Pod info on mount: {driver_spec.get('podInfoOnMount', 'unknown')}"
+             )
+             return True
+
+         except ResourceNotFoundError:
+             self.log.error(f"CSI driver '{driver_name}' not found")
+             self.log.error("Available CSI drivers:")
+             self._list_available_csi_drivers()
+             return False
+
+         except Exception as e:  # noqa: BLE001
+             self.log.error(f"Failed to query CSI driver: {e}")
+             raise RuntimeError(
+                 f"kubectl error while verifying CSI driver '{driver_name}': {e}"
+             ) from e
+
+     def _verify_pvc(self, pvc_name: str) -> bool:
+         """Check if PVC exists and is bound in operator namespace."""
+         try:
+             pvc_data = self.kubectl.get_resource(
+                 "pvc", pvc_name, namespace=self.config.operator_namespace
+             )
+
+             status = pvc_data.get("status", {})
+             phase = status.get("phase")
+             capacity = status.get("capacity", {})
+             storage_class = pvc_data.get("spec", {}).get("storageClassName")
+
+             if phase == "Bound":
+                 self.log.info(f"PVC '{pvc_name}' is bound")
+                 self.log.info(f"Capacity: {capacity.get('storage', 'unknown')}")
+                 self.log.info(f"Storage class: {storage_class or 'default'}")
+                 return True
+             else:
+                 self.log.error(
+                     f"FAILED: PVC '{pvc_name}' is not bound (status: {phase})"
+                 )
+                 return False
+
+         except ResourceNotFoundError:
+             self.log.error(
+                 f"FAILED: PVC '{pvc_name}' not found in namespace '{self.config.operator_namespace}'"
+             )
+             self.log.error("Available PVCs in namespace:")
+             self._list_available_pvcs()
+             return False
+
+         except Exception as e:  # noqa: BLE001
+             self.log.error(f"FAILED: Failed to check PVC '{pvc_name}': {e}")
+             raise RuntimeError(
+                 f"kubectl error while verifying PVC '{pvc_name}': {e}"
+             ) from e
+
+     def _list_available_csi_drivers(self) -> None:
+         """List available CSI drivers for troubleshooting."""
+         try:
+             drivers = self.kubectl.list_resources("csidrivers")
+             if drivers:
+                 for driver in drivers:
+                     name = driver.get("metadata", {}).get("name", "unknown")
+                     self.log.error(f" - {name}")
+             else:
+                 self.log.error(" (no CSI drivers found in cluster)")
+         except Exception:  # noqa: BLE001
+             self.log.error(" (failed to list CSI drivers)")
+
+     def _list_available_pvcs(self) -> None:
+         """List available PVCs for troubleshooting."""
+         try:
+             pvcs = self.kubectl.list_resources(
+                 "pvcs", namespace=self.config.operator_namespace
+             )
+             if pvcs:
+                 for pvc in pvcs:
+                     name = pvc.get("metadata", {}).get("name", "unknown")
+                     self.log.error(f" - {name}")
+             else:
+                 self.log.error(
+                     f" (no PVCs found in namespace '{self.config.operator_namespace}')"
+                 )
+         except Exception:  # noqa: BLE001
+             self.log.error(" (failed to list PVCs)")
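For illustration, which storage checks run depends only on which FileStorage fields are set; a minimal sketch (not part of the package diff) using a hypothetical stand-in object and driver name:

    from types import SimpleNamespace

    fs = SimpleNamespace(
        csi_ephemeral_volume_driver="efs.csi.aws.com",  # hypothetical; triggers the CSI check
        persistent_volume_claim=None,  # PVC check skipped
        file_storage_id=None,  # NFS existence check skipped
    )
    active = [
        label
        for label, attr in [
            ("CSI driver", "csi_ephemeral_volume_driver"),
            ("PVC", "persistent_volume_claim"),
            ("NFS", "file_storage_id"),
        ]
        if getattr(fs, attr, None)
    ]
    print(active)  # ['CSI driver']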
1076
+
+
+ # =============================================================================
+ # GATEWAY VERIFIER
+ # =============================================================================
+
+
+ class GatewayVerifier:
+     """Handles verification of gateway and ingress components for Kubernetes deployments."""
+
+     def __init__(
+         self,
+         kubectl_ops: KubectlOperations,
+         k8s_config: KubernetesConfig,
+         logger: BlockLogger,
+     ):
+         self.kubectl = kubectl_ops
+         self.config = k8s_config
+         self.log = logger
+
+     def verify_gateway_support(self, operator_data: OperatorData) -> bool:
+         """Verify gateway support using pre-fetched config data."""
+         if not operator_data.config.is_valid:
+             self.log.warning(
+                 "Could not retrieve operator configuration - skipping gateway verification"
+             )
+             return True
+
+         # Extract gateway configuration from operator data
+         gateway_config = GatewayConfig.from_operator_config(
+             operator_data.config.config_data
+         )
+
+         if not gateway_config.enabled:
+             self.log.info(
+                 "Gateway support is not enabled - skipping gateway verification"
+             )
+             return True
+
+         if not gateway_config.requires_verification:
+             self.log.error(
+                 "Gateway is enabled but no gateway name found in operator configuration"
+             )
+             return False
+
+         # Verify gateway exists in cluster
+         assert (
+             gateway_config.name is not None
+         )  # guaranteed by requires_verification check
+         return self._verify_gateway_exists(gateway_config.name)
+
+     def verify_nginx_ingress(self) -> bool:
+         """Check for NGINX ingress controller (warning only)."""
+         try:
+             self.log.info("Checking for NGINX ingress controller...")
+
+             # Try different NGINX ingress controller configurations
+             for config_dict in NGINX_INGRESS_CONFIGS:
+                 nginx_pod = self._find_nginx_pod(
+                     config_dict["namespace"], config_dict["label"]
+                 )
+                 if nginx_pod:
+                     if self.kubectl.is_pod_running(nginx_pod, config_dict["namespace"]):
+                         self.log.info(
+                             f"PASSED: Found running NGINX ingress controller: {nginx_pod} "
+                             f"(namespace: {config_dict['namespace']})"
+                         )
+                         return True
+                     else:
+                         pod_status = self.kubectl.get_pod_status(
+                             nginx_pod, config_dict["namespace"]
+                         )
+                         self.log.warning(
+                             f"WARNING: Found NGINX ingress controller '{nginx_pod}' "
+                             f"but it's not running (status: {pod_status})"
+                         )
+
+             # Try fallback search by name patterns
+             if self._find_nginx_by_name_pattern():
+                 return True
+
+             # No NGINX ingress controller found
+             self.log.warning("No NGINX ingress controller found")
+             self.log.warning("This may impact ingress routing capabilities")
+             self.log.warning("Available ingress controllers:")
+             self._list_available_ingress_controllers()
+             return False
+
+         except (KubectlError, ResourceNotFoundError) as e:
+             self.log.warning(f"WARNING: Could not verify NGINX ingress controller: {e}")
+             raise RuntimeError(
+                 f"kubectl error during NGINX ingress verification: {e}"
+             ) from e
+
+     def _verify_gateway_exists(self, gateway_name: str) -> bool:
+         """Verify that the specified gateway exists in the cluster."""
+         try:
+             # Try to find gateway in common Gateway API resource types
+             for resource_type in GATEWAY_RESOURCE_TYPES:
+                 if self._check_gateway_resource(resource_type, gateway_name):
+                     return True
+
+             # If not found in operator namespace, try cluster-wide search
+             self.log.info(
+                 f"Gateway '{gateway_name}' not found in operator namespace, "
+                 "searching cluster-wide..."
+             )
+             for resource_type in GATEWAY_RESOURCE_TYPES:
+                 if self._check_gateway_resource_cluster_wide(
+                     resource_type, gateway_name
+                 ):
+                     return True
+
+             self.log.error(f"FAILED: Gateway '{gateway_name}' not found in cluster")
+             self.log.error("Available gateways:")
+             self._list_available_gateways()
+             return False
+
+         except (KubectlError, ResourceNotFoundError) as e:
+             self.log.error(f"FAILED: Failed to verify gateway '{gateway_name}': {e}")
+             raise RuntimeError(
+                 f"kubectl error while verifying gateway '{gateway_name}': {e}"
+             ) from e
+
+     def _check_gateway_resource(self, resource_type: str, gateway_name: str) -> bool:
+         """Check for gateway resource in operator namespace."""
+         try:
+             gateway_data = self.kubectl.get_resource(
+                 resource_type, gateway_name, namespace=self.config.operator_namespace
+             )
+
+             self.log.info(
+                 f"PASSED: Gateway '{gateway_name}' found in namespace '{self.config.operator_namespace}'"
+             )
+
+             # Log gateway status if available
+             status = gateway_data.get("status", {})
+             conditions = status.get("conditions", [])
+             for condition in conditions:
+                 if (
+                     condition.get("type") == "Ready"
+                     and condition.get("status") == "True"
+                 ):
+                     self.log.info(" Status: Ready")
+                     break
+
+             return True
+
+         except ResourceNotFoundError:
+             return False
+
+     def _check_gateway_resource_cluster_wide(
+         self, resource_type: str, gateway_name: str
+     ) -> bool:
+         """Check for gateway resource cluster-wide."""
+         try:
+             gateways = self.kubectl.list_resources(resource_type, all_namespaces=True)
+
+             for gateway in gateways:
+                 if gateway.get("metadata", {}).get("name") == gateway_name:
+                     namespace = gateway.get("metadata", {}).get("namespace", "unknown")
+                     self.log.info(
+                         f"PASSED: Gateway '{gateway_name}' found in namespace '{namespace}'"
+                     )
+                     return True
+
+             return False
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for fallback case
+             return False
+
+     def _find_nginx_pod(self, namespace: str, label_selector: str) -> Optional[str]:
+         """Find NGINX ingress pod by label selector in specific namespace."""
+         try:
+             pods = self.kubectl.list_resources(
+                 "pods", namespace=namespace, label_selector=label_selector
+             )
+
+             if pods:
+                 return pods[0]["metadata"]["name"]
+             return None
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for fallback pod discovery
+             return None
+
+     def _find_nginx_by_name_pattern(self) -> bool:
+         """Find NGINX ingress controller by name pattern across all namespaces."""
+         try:
+             pods = self.kubectl.list_resources("pods", all_namespaces=True)
+
+             # Look for pods with names containing NGINX and ingress keywords
+             for pod in pods:
+                 metadata = pod.get("metadata", {})
+                 name = metadata.get("name", "").lower()
+                 namespace = metadata.get("namespace", "")
+                 status_phase = pod.get("status", {}).get("phase", "")
+
+                 if "nginx" in name and "ingress" in name:
+                     if status_phase == RUNNING_STATUS:
+                         self.log.info(
+                             f"PASSED: Found NGINX ingress controller by name pattern: "
+                             f"{metadata['name']} (namespace: {namespace})"
+                         )
+                         return True
+                     else:
+                         self.log.warning(
+                             f"WARNING: Found NGINX ingress controller '{metadata['name']}' "
+                             f"but it's not running (status: {status_phase})"
+                         )
+
+             return False
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for fallback case
+             return False
+
+     def _list_available_gateways(self) -> None:
+         """List available gateways for troubleshooting."""
+         try:
+             for resource_type in GATEWAY_RESOURCE_TYPES:
+                 gateways = self.kubectl.list_resources(
+                     resource_type, all_namespaces=True
+                 )
+
+                 if gateways:
+                     self.log.error(f"Available {resource_type}:")
+                     for gw in gateways:
+                         name = gw.get("metadata", {}).get("name", "unknown")
+                         self.log.error(f" - {name}")
+                     return
+
+             self.log.error(" (no gateways found in cluster)")
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for troubleshooting helper
+             self.log.error(" (failed to list gateways)")
+
+     def _list_available_ingress_controllers(self) -> None:
+         """List available ingress controllers for troubleshooting."""
+         try:
+             pods = self.kubectl.list_resources("pods", all_namespaces=True)
+
+             ingress_controllers = []
+             for pod in pods:
+                 metadata = pod.get("metadata", {})
+                 name = metadata.get("name", "").lower()
+                 namespace = metadata.get("namespace", "")
+
+                 # Look for common ingress controller name patterns
+                 if any(keyword in name for keyword in INGRESS_CONTROLLER_KEYWORDS):
+                     ingress_controllers.append(
+                         f"{metadata['name']} (namespace: {namespace})"
+                     )
+
+             if ingress_controllers:
+                 for controller in ingress_controllers:
+                     self.log.warning(f" - {controller}")
+             else:
+                 self.log.warning(" (no ingress controllers found)")
+
+         except Exception:  # noqa: BLE001
+             # Broad exception handling for troubleshooting helper
+             self.log.warning(" (failed to list ingress controllers)")
1341
+
+
+ # =============================================================================
+ # MAIN VERIFIER CLASS
+ # =============================================================================
+
+
+ class KubernetesCloudDeploymentVerifier:
+     """Verifies Kubernetes-based cloud deployments with comprehensive checks"""
+
+     def __init__(self, logger: BlockLogger, api_client):
+         self.log = logger
+         self.api_client = api_client
+         self.k8s_config: Optional[KubernetesConfig] = None
+         self.results = VerificationResults()
+
+     def verify(self, cloud_deployment: CloudDeployment) -> bool:
+         """
+         Main verification workflow for Kubernetes cloud deployments.
+
+         Performs comprehensive checks including operator health, identity verification,
+         file storage, networking, and gateway configuration.
+
+         Args:
+             cloud_deployment: The cloud deployment configuration
+         """
+         deployment_name = cloud_deployment.name or cloud_deployment.cloud_deployment_id
+         self.log.info(f"Starting Kubernetes verification for: {deployment_name}")
+
+         if cloud_deployment.file_storage is not None and isinstance(
+             cloud_deployment.file_storage, dict
+         ):
+             cloud_deployment.file_storage = FileStorage(**cloud_deployment.file_storage)
+
+         try:
+             return self._run_verification_steps(cloud_deployment)
+
+         except click.ClickException:
+             # Re-raise ClickExceptions as they contain user-friendly messages
+             raise
+         except requests.RequestException as e:
+             self.log.error(f"Network error during verification: {e}")
+             return False
+         except (subprocess.CalledProcessError, OSError) as e:
+             self.log.error(f"System error during verification: {e}")
+             return False
+         except (KeyError, ValueError, json.JSONDecodeError) as e:
+             self.log.error(f"Data parsing error during verification: {e}")
+             return False
+
+     def _passed_or_failed_str_from_bool(self, is_passing: bool) -> str:
+         """Return PASSED or FAILED string for verification results, matching VM verification format."""
+         return PASSED_STATUS if is_passing else FAILED_STATUS
+
+     @contextmanager
+     def _verification_step(self, step_name: str):
+         """Context manager for verification steps that indents detailed output."""
+         self.log.info(f"{step_name}...")
+         with self.log.indent():
+             yield
+
+     def _run_verification_steps(self, cloud_deployment: CloudDeployment) -> bool:
+         """Execute the verification steps in sequence."""
+         # Step 1: Configure kubectl
+         with self._verification_step("Configuring kubectl access"):
+             self._get_kubectl_config()
+
+         # k8s_config is guaranteed to be set by _get_kubectl_config()
+         assert self.k8s_config is not None
+
+         # Initialize utility classes
+         kubectl_ops = KubectlOperations(self.k8s_config.context, self.log)
+         operator_verifier = OperatorVerifier(kubectl_ops, self.k8s_config, self.log)
+         storage_verifier = StorageVerifier(kubectl_ops, self.k8s_config, self.log)
+         gateway_verifier = GatewayVerifier(kubectl_ops, self.k8s_config, self.log)
+
+         # Step 2: Find and verify operator pod
+         with self._verification_step("Finding operator pod"):
+             try:
+                 operator_pod = operator_verifier.find_operator_pod()
+                 self.results.operator_pod_installed = True
+             except OperatorPodNotFoundError as e:
+                 self.log.error(
+                     "Failed to find operator pod, please make sure the operator is running"
+                 )
+                 self.log.error(f"Error: {e}")
+                 return False
+
+         # Step 3: Port forward and fetch operator data (health + config)
+         with self._verification_step("Verifying operator status"):
+             try:
+                 operator_data = operator_verifier.get_operator_data(operator_pod)
+             except (OperatorConnectionError, PortForwardError) as e:
+                 self.log.error(
+                     "Failed to connect to operator, please make sure the operator is running version >= 0.7.0 and has status reporting enabled"
+                 )
+                 self.log.error(f"Error: {e}")
+                 return False
+
+             self.log.info("Verifying operator health...")
+             self.results.operator_health = operator_verifier.verify_operator_health(
+                 operator_data
+             )
+             self.log.info(
+                 f"Operator Health: {self._passed_or_failed_str_from_bool(self.results.operator_health)}"
+             )
+
+             self.log.info("Verifying operator identity...")
+             if cloud_deployment.kubernetes_config is None:
+                 self.log.error(
+                     "Kubernetes configuration is missing from cloud deployment"
+                 )
+                 self.results.operator_identity = False
+             else:
+                 self.results.operator_identity = operator_verifier.verify_operator_identity(
+                     operator_data,
+                     cloud_deployment.kubernetes_config,
+                     cloud_deployment.provider,
+                 )
+             self.log.info(
+                 f"Operator Identity: {self._passed_or_failed_str_from_bool(self.results.operator_identity)}"
+             )
+
+         # Step 4: Check file storage
+         with self._verification_step("Checking file storage"):
+             if cloud_deployment.file_storage is None:
+                 self.log.info(
+                     "INFO: No file storage configured - skipping file storage verification"
+                 )
+                 self.results.file_storage = True
+             else:
+                 self.results.file_storage = storage_verifier.verify_file_storage(
+                     cloud_deployment.file_storage, cloud_deployment
+                 )
+             self.log.info(
+                 f"File Storage: {self._passed_or_failed_str_from_bool(self.results.file_storage)}"
+             )
+
+         # Step 5: Verify gateway support
+         with self._verification_step("Verifying gateway support"):
+             self.results.gateway_support = gateway_verifier.verify_gateway_support(
+                 operator_data
+             )
+             self.log.info(
+                 f"Gateway Support: {self._passed_or_failed_str_from_bool(self.results.gateway_support)}"
+             )
+
+         # Step 6: Check NGINX ingress (warning only)
+         with self._verification_step("Checking NGINX ingress controller"):
+             self.results.nginx_ingress = gateway_verifier.verify_nginx_ingress()
+             self.log.info(
+                 f"NGINX Ingress: {self._passed_or_failed_str_from_bool(self.results.nginx_ingress)}"
+             )
+
+         self._show_verification_summary()
+
+         if self.results.overall_success:
+             self.log.info(
+                 "Kubernetes cloud deployment verification completed successfully"
+             )
+         else:
+             self.log.error("Kubernetes cloud deployment verification failed")
+
+         return self.results.overall_success
+
+     def _show_verification_summary(self):
+         """Show verification results summary in the same format as VM verification."""
+         verification_result_summary = ["Verification result:"]
+
+         for component, result in self.results.to_dict().items():
+             verification_result_summary.append(
+                 f"{component}: {self._passed_or_failed_str_from_bool(result)}"
+             )
+
+         self.log.info("\n".join(verification_result_summary))
+
+     def _get_kubectl_config(self):
+         """Get kubectl context and operator namespace from user"""
+         # Check if kubectl is available
+         temp_kubectl = KubectlOperations("", self.log)
+         if not temp_kubectl.check_kubectl_available():
+             raise click.ClickException(
+                 "kubectl command not found. Please install kubectl and ensure it's in your PATH."
+             )
+
+         # Get available contexts
+         contexts = temp_kubectl.get_available_contexts()
+         if not contexts:
+             raise click.ClickException(
+                 "No kubectl contexts found. Please configure kubectl to access your Kubernetes cluster."
+             )
+
+         # Prompt for context selection
+         if len(contexts) > 1:
+             self.log.info("Available kubectl contexts:")
+             for i, ctx in enumerate(contexts):
+                 current_marker = (
+                     " (current)" if ctx == temp_kubectl.get_current_context() else ""
+                 )
+                 self.log.info(f" {i+1}. {ctx}{current_marker}")
+
+             choice = click.prompt(
+                 "Select context number",
+                 type=click.IntRange(1, len(contexts)),
+                 default=1,
+             )
+             kubectl_context = contexts[choice - 1]
+         else:
+             kubectl_context = contexts[0]
+             self.log.info(f"Using kubectl context: {kubectl_context}")
+
+         # Prompt for operator namespace
+         operator_namespace = click.prompt(
+             "Enter the Anyscale operator namespace",
+             default=DEFAULT_OPERATOR_NAMESPACE,
+             type=str,
+             show_default=True,
+         )
+
+         self.k8s_config = KubernetesConfig(
+             context=kubectl_context, operator_namespace=operator_namespace
+         )
+
+         self.log.info(
+             f"Configured: context='{self.k8s_config.context}', "
+             f"namespace='{self.k8s_config.operator_namespace}'"
+         )
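For illustration, a minimal sketch (not part of the package diff) of the top-level entry point; the logger and API-client wiring is an assumption based on the constructor signature:

    logger = BlockLogger()  # assumption: default construction works
    api_client = None  # placeholder; the real CLI passes an authenticated client here
    verifier = KubernetesCloudDeploymentVerifier(logger, api_client)
    # cloud_deployment would be a CloudDeployment fetched via the Anyscale API:
    # ok = verifier.verify(cloud_deployment)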
+ )