anyscale 0.26.47-py3-none-any.whl → 0.26.49-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anyscale/__init__.py +0 -7
- anyscale/_private/anyscale_client/README.md +115 -0
- anyscale/_private/anyscale_client/anyscale_client.py +12 -213
- anyscale/_private/anyscale_client/common.py +0 -55
- anyscale/_private/anyscale_client/fake_anyscale_client.py +19 -46
- anyscale/_private/docgen/__main__.py +32 -47
- anyscale/_private/docgen/generator.py +32 -16
- anyscale/_private/docgen/generator_legacy.py +58 -6
- anyscale/_private/docgen/models.md +3 -2
- anyscale/_private/workload/workload_config.py +16 -8
- anyscale/_private/workload/workload_sdk.py +24 -7
- anyscale/client/README.md +10 -2
- anyscale/client/openapi_client/__init__.py +6 -2
- anyscale/client/openapi_client/api/default_api.py +558 -8
- anyscale/client/openapi_client/models/__init__.py +6 -2
- anyscale/client/openapi_client/models/{alert_type.py → alert_issue_type.py} +8 -20
- anyscale/client/openapi_client/models/baseimagesenum.py +1 -2
- anyscale/client/openapi_client/models/cloud.py +31 -3
- anyscale/client/openapi_client/models/cloud_deployment.py +30 -3
- anyscale/client/openapi_client/models/cloud_with_cloud_resource.py +29 -1
- anyscale/client/openapi_client/models/cloud_with_cloud_resource_gcp.py +29 -1
- anyscale/client/openapi_client/models/dataset_metrics.py +6 -6
- anyscale/client/openapi_client/models/dataset_state.py +2 -1
- anyscale/client/openapi_client/models/decorated_cloud_deployment.py +481 -0
- anyscale/client/openapi_client/models/decoratedclouddeployment_response.py +121 -0
- anyscale/client/openapi_client/models/describe_system_workload_response.py +32 -6
- anyscale/client/openapi_client/models/experimental_workspace.py +29 -1
- anyscale/client/openapi_client/models/experimental_workspaces_sort_field.py +2 -1
- anyscale/client/openapi_client/models/metrics_query_response.py +121 -0
- anyscale/client/openapi_client/models/{clouddeployment_response.py → metricsqueryresponse_response.py} +11 -11
- anyscale/client/openapi_client/models/operator_metrics.py +8 -9
- anyscale/client/openapi_client/models/operator_status.py +102 -0
- anyscale/client/openapi_client/models/organization_usage_alert.py +20 -20
- anyscale/client/openapi_client/models/supportedbaseimagesenum.py +1 -2
- anyscale/cloud/models.py +330 -0
- anyscale/commands/cloud_commands.py +136 -44
- anyscale/commands/command_examples.py +54 -134
- anyscale/commands/compute_config_commands.py +7 -11
- anyscale/compute_config/__init__.py +2 -16
- anyscale/compute_config/_private/compute_config_sdk.py +27 -17
- anyscale/compute_config/commands.py +14 -44
- anyscale/compute_config/models.py +49 -26
- anyscale/controllers/cloud_controller.py +289 -171
- anyscale/controllers/cloud_file_storage_utils.py +204 -0
- anyscale/controllers/kubernetes_verifier.py +1570 -0
- anyscale/job/_private/job_sdk.py +17 -8
- anyscale/job/models.py +1 -1
- anyscale/scripts.py +0 -2
- anyscale/sdk/anyscale_client/models/baseimagesenum.py +1 -2
- anyscale/sdk/anyscale_client/models/cloud.py +31 -3
- anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +1 -2
- anyscale/shared_anyscale_utils/headers.py +3 -0
- anyscale/shared_anyscale_utils/utils/id_gen.py +1 -0
- anyscale/version.py +1 -1
- anyscale/workspace/models.py +14 -7
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/METADATA +1 -1
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/RECORD +62 -73
- anyscale/commands/llm/dataset_commands.py +0 -269
- anyscale/commands/llm/group.py +0 -15
- anyscale/commands/llm/models_commands.py +0 -123
- anyscale/controllers/llm/__init__.py +0 -0
- anyscale/controllers/llm/models_controller.py +0 -144
- anyscale/llm/__init__.py +0 -2
- anyscale/llm/dataset/__init__.py +0 -2
- anyscale/llm/dataset/_private/__init__.py +0 -0
- anyscale/llm/dataset/_private/docs.py +0 -63
- anyscale/llm/dataset/_private/models.py +0 -71
- anyscale/llm/dataset/_private/sdk.py +0 -147
- anyscale/llm/model/__init__.py +0 -2
- anyscale/llm/model/_private/models_sdk.py +0 -62
- anyscale/llm/model/commands.py +0 -93
- anyscale/llm/model/models.py +0 -171
- anyscale/llm/model/sdk.py +0 -62
- anyscale/llm/sdk.py +0 -27
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/WHEEL +0 -0
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/entry_points.txt +0 -0
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/licenses/LICENSE +0 -0
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/licenses/NOTICE +0 -0
- {anyscale-0.26.47.dist-info → anyscale-0.26.49.dist-info}/top_level.txt +0 -0
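The most substantial addition in this release is the new `anyscale/controllers/kubernetes_verifier.py` module (full content below), which backs verification of Kubernetes-based cloud deployments. As a rough sketch of how the pieces compose — the actual CLI wiring in `cloud_commands.py`/`cloud_controller.py` is not shown in this diff, so the surrounding setup here is an assumption:

```python
# Hypothetical driver showing how the new verifier class is used.
# BlockLogger, CloudDeployment, and KubernetesCloudDeploymentVerifier come from
# the diff below; `api_client` and `cloud_deployment` stand in for whatever the
# cloud controller already has in hand.
from anyscale.cli_logger import BlockLogger
from anyscale.controllers.kubernetes_verifier import KubernetesCloudDeploymentVerifier


def verify_k8s_deployment(api_client, cloud_deployment) -> bool:
    logger = BlockLogger()
    verifier = KubernetesCloudDeploymentVerifier(logger, api_client)
    # Runs operator, file-storage, gateway, and ingress checks; returns overall pass/fail.
    return verifier.verify(cloud_deployment)
```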
anyscale/controllers/kubernetes_verifier.py
@@ -0,0 +1,1570 @@
|
|
1
|
+
"""
|
2
|
+
Kubernetes Cloud Deployment Verifier
|
3
|
+
|
4
|
+
Handles verification of Kubernetes-based cloud deployments including:
|
5
|
+
- Operator pod health and connectivity
|
6
|
+
- File storage (CSI drivers, PVCs, NFS)
|
7
|
+
- Network connectivity
|
8
|
+
- Gateway support
|
9
|
+
- Nginx ingress controller
|
10
|
+
"""
|
11
|
+
|
12
|
+
|
13
|
+
from contextlib import contextmanager, suppress
|
14
|
+
from dataclasses import dataclass
|
15
|
+
import json
|
16
|
+
import os
|
17
|
+
import shutil
|
18
|
+
import signal
|
19
|
+
import socket
|
20
|
+
import subprocess
|
21
|
+
import time
|
22
|
+
from typing import Dict, List, Optional
|
23
|
+
|
24
|
+
import click
|
25
|
+
import requests
|
26
|
+
|
27
|
+
from anyscale.cli_logger import BlockLogger
|
28
|
+
from anyscale.client.openapi_client.models.cloud_deployment import CloudDeployment
|
29
|
+
from anyscale.client.openapi_client.models.cloud_providers import CloudProviders
|
30
|
+
from anyscale.client.openapi_client.models.file_storage import FileStorage
|
31
|
+
from anyscale.client.openapi_client.models.kubernetes_config import (
|
32
|
+
KubernetesConfig as OpenAPIKubernetesConfig,
|
33
|
+
)
|
34
|
+
from anyscale.controllers.cloud_file_storage_utils import verify_file_storage_exists
|
35
|
+
|
36
|
+
|
37
|
+
# =============================================================================
|
38
|
+
# CONSTANTS
|
39
|
+
# =============================================================================
|
40
|
+
|
41
|
+
# Operator configuration
|
42
|
+
OPERATOR_HEALTH_PORT = 2113
|
43
|
+
OPERATOR_CONFIG_ENDPOINT = "/config"
|
44
|
+
OPERATOR_HEALTH_ENDPOINT = "/healthz/run"
|
45
|
+
DEFAULT_OPERATOR_NAMESPACE = "anyscale-operator"
|
46
|
+
|
47
|
+
# Network and timing configuration
|
48
|
+
PORT_FORWARD_WAIT_TIME = 3 # seconds to wait for port forward to establish
|
49
|
+
HTTP_REQUEST_TIMEOUT = 10 # seconds for HTTP requests to operator
|
50
|
+
PORT_FORWARD_TERMINATION_TIMEOUT = 5 # seconds to wait for graceful termination
|
51
|
+
|
52
|
+
OPERATOR_LABEL_SELECTOR = "app=anyscale-operator"
|
53
|
+
|
54
|
+
# Gateway resource types to check
|
55
|
+
GATEWAY_RESOURCE_TYPES = [
|
56
|
+
"gateway.gateway", # Gateway API v1
|
57
|
+
"gateways.gateway.networking.k8s.io", # Full API path
|
58
|
+
"gateway", # Short name
|
59
|
+
"gw", # Common alias
|
60
|
+
]
|
61
|
+
|
62
|
+
# NGINX ingress controller configurations
|
63
|
+
NGINX_INGRESS_CONFIGS = [
|
64
|
+
{"namespace": "ingress-nginx", "label": "app.kubernetes.io/name=ingress-nginx"},
|
65
|
+
{"namespace": "nginx-ingress", "label": "app=nginx-ingress"},
|
66
|
+
{"namespace": "kube-system", "label": "app.kubernetes.io/name=ingress-nginx"},
|
67
|
+
{"namespace": "default", "label": "app=nginx-ingress"},
|
68
|
+
]
|
69
|
+
|
70
|
+
# Ingress controller name patterns for fallback search
|
71
|
+
INGRESS_CONTROLLER_KEYWORDS = [
|
72
|
+
"ingress",
|
73
|
+
"haproxy",
|
74
|
+
"traefik",
|
75
|
+
"contour",
|
76
|
+
"ambassador",
|
77
|
+
"istio-gateway",
|
78
|
+
"nginx",
|
79
|
+
]
|
80
|
+
|
81
|
+
# kubectl binary search paths
|
82
|
+
KUBECTL_COMMON_PATHS = [
|
83
|
+
"/usr/local/bin/kubectl",
|
84
|
+
"/usr/bin/kubectl",
|
85
|
+
"/bin/kubectl",
|
86
|
+
"/opt/homebrew/bin/kubectl", # macOS homebrew
|
87
|
+
"~/.local/bin/kubectl", # User local install
|
88
|
+
]
|
89
|
+
|
90
|
+
# Status and result strings
|
91
|
+
PASSED_STATUS = "PASSED"
|
92
|
+
FAILED_STATUS = "FAILED"
|
93
|
+
RUNNING_STATUS = "Running"
|
94
|
+
|
95
|
+
# Verification component names (for consistent reporting)
|
96
|
+
class VerificationComponents:
|
97
|
+
OPERATOR_POD_INSTALLED = "Operator Pod Installed"
|
98
|
+
OPERATOR_HEALTH = "Operator Health"
|
99
|
+
OPERATOR_IDENTITY = "Operator Identity"
|
100
|
+
FILE_STORAGE = "File Storage"
|
101
|
+
GATEWAY_SUPPORT = "Gateway Support"
|
102
|
+
NGINX_INGRESS = "NGINX Ingress"
|
103
|
+
|
104
|
+
|
105
|
+
# =============================================================================
|
106
|
+
# EXCEPTIONS
|
107
|
+
# =============================================================================
|
108
|
+
|
109
|
+
|
110
|
+
class KubernetesVerificationError(Exception):
|
111
|
+
"""Base exception for all Kubernetes verification errors."""
|
112
|
+
|
113
|
+
|
114
|
+
class KubectlError(KubernetesVerificationError):
|
115
|
+
"""Raised when kubectl commands fail."""
|
116
|
+
|
117
|
+
def __init__(
|
118
|
+
self, message: str, command: Optional[str] = None, stderr: Optional[str] = None
|
119
|
+
):
|
120
|
+
super().__init__(message)
|
121
|
+
self.command = command
|
122
|
+
self.stderr = stderr
|
123
|
+
|
124
|
+
|
125
|
+
class KubectlNotFoundError(KubernetesVerificationError):
|
126
|
+
"""Raised when kubectl binary cannot be found."""
|
127
|
+
|
128
|
+
|
129
|
+
class OperatorPodNotFoundError(KubernetesVerificationError):
|
130
|
+
"""Raised when the Anyscale operator pod cannot be found."""
|
131
|
+
|
132
|
+
|
133
|
+
class OperatorConnectionError(KubernetesVerificationError):
|
134
|
+
"""Raised when connection to the operator fails."""
|
135
|
+
|
136
|
+
def __init__(
|
137
|
+
self,
|
138
|
+
message: str,
|
139
|
+
pod_name: Optional[str] = None,
|
140
|
+
endpoint: Optional[str] = None,
|
141
|
+
):
|
142
|
+
super().__init__(message)
|
143
|
+
self.pod_name = pod_name
|
144
|
+
self.endpoint = endpoint
|
145
|
+
|
146
|
+
|
147
|
+
class PortForwardError(KubernetesVerificationError):
|
148
|
+
"""Raised when port forwarding to a pod fails."""
|
149
|
+
|
150
|
+
def __init__(
|
151
|
+
self, message: str, pod_name: Optional[str] = None, port: Optional[int] = None
|
152
|
+
):
|
153
|
+
super().__init__(message)
|
154
|
+
self.pod_name = pod_name
|
155
|
+
self.port = port
|
156
|
+
|
157
|
+
|
158
|
+
class IdentityVerificationError(KubernetesVerificationError):
|
159
|
+
"""Raised when operator identity verification fails."""
|
160
|
+
|
161
|
+
def __init__(
|
162
|
+
self,
|
163
|
+
message: str,
|
164
|
+
expected_identity: Optional[str] = None,
|
165
|
+
actual_identity: Optional[str] = None,
|
166
|
+
):
|
167
|
+
super().__init__(message)
|
168
|
+
self.expected_identity = expected_identity
|
169
|
+
self.actual_identity = actual_identity
|
170
|
+
|
171
|
+
|
172
|
+
class FileStorageVerificationError(KubernetesVerificationError):
|
173
|
+
"""Raised when file storage verification fails."""
|
174
|
+
|
175
|
+
|
176
|
+
class GatewayVerificationError(KubernetesVerificationError):
|
177
|
+
"""Raised when gateway verification fails."""
|
178
|
+
|
179
|
+
def __init__(self, message: str, gateway_name: Optional[str] = None):
|
180
|
+
super().__init__(message)
|
181
|
+
self.gateway_name = gateway_name
|
182
|
+
|
183
|
+
|
184
|
+
class ResourceNotFoundError(KubernetesVerificationError):
|
185
|
+
"""Raised when a required Kubernetes resource is not found."""
|
186
|
+
|
187
|
+
def __init__(
|
188
|
+
self,
|
189
|
+
message: str,
|
190
|
+
resource_type: Optional[str] = None,
|
191
|
+
resource_name: Optional[str] = None,
|
192
|
+
namespace: Optional[str] = None,
|
193
|
+
):
|
194
|
+
super().__init__(message)
|
195
|
+
self.resource_type = resource_type
|
196
|
+
self.resource_name = resource_name
|
197
|
+
self.namespace = namespace
|
198
|
+
|
199
|
+
|
200
|
+
# =============================================================================
|
201
|
+
# DATA MODELS
|
202
|
+
# =============================================================================
|
203
|
+
|
204
|
+
|
205
|
+
@dataclass
|
206
|
+
class VerificationResults:
|
207
|
+
"""Tracks the results of all verification steps."""
|
208
|
+
|
209
|
+
operator_pod_installed: bool = False
|
210
|
+
operator_health: bool = False
|
211
|
+
operator_identity: bool = False
|
212
|
+
file_storage: bool = False
|
213
|
+
gateway_support: bool = False
|
214
|
+
nginx_ingress: bool = False
|
215
|
+
|
216
|
+
def to_dict(self) -> Dict[str, bool]:
|
217
|
+
"""Convert to dictionary format matching original implementation."""
|
218
|
+
return {
|
219
|
+
VerificationComponents.OPERATOR_POD_INSTALLED: self.operator_pod_installed,
|
220
|
+
VerificationComponents.OPERATOR_HEALTH: self.operator_health,
|
221
|
+
VerificationComponents.OPERATOR_IDENTITY: self.operator_identity,
|
222
|
+
VerificationComponents.FILE_STORAGE: self.file_storage,
|
223
|
+
VerificationComponents.GATEWAY_SUPPORT: self.gateway_support,
|
224
|
+
VerificationComponents.NGINX_INGRESS: self.nginx_ingress,
|
225
|
+
}
|
226
|
+
|
227
|
+
@property
|
228
|
+
def overall_success(self) -> bool:
|
229
|
+
"""Return True if all verification steps passed."""
|
230
|
+
return all(
|
231
|
+
[
|
232
|
+
self.operator_pod_installed,
|
233
|
+
self.operator_health,
|
234
|
+
self.operator_identity,
|
235
|
+
self.file_storage,
|
236
|
+
self.gateway_support,
|
237
|
+
self.nginx_ingress,
|
238
|
+
]
|
239
|
+
)
|
240
|
+
|
241
|
+
|
242
|
+
@dataclass
|
243
|
+
class KubernetesConfig:
|
244
|
+
"""Configuration for Kubernetes cluster access."""
|
245
|
+
|
246
|
+
context: str
|
247
|
+
operator_namespace: str
|
248
|
+
|
249
|
+
def __post_init__(self):
|
250
|
+
"""Validate configuration after initialization."""
|
251
|
+
if not self.context:
|
252
|
+
raise ValueError("Kubernetes context cannot be empty")
|
253
|
+
if not self.operator_namespace:
|
254
|
+
raise ValueError("Operator namespace cannot be empty")
|
255
|
+
|
256
|
+
|
257
|
+
@dataclass
|
258
|
+
class OperatorHealthData:
|
259
|
+
"""Data retrieved from operator health endpoint."""
|
260
|
+
|
261
|
+
status_code: int
|
262
|
+
response_text: Optional[str] = None
|
263
|
+
|
264
|
+
@property
|
265
|
+
def is_healthy(self) -> bool:
|
266
|
+
"""Return True if operator is healthy."""
|
267
|
+
return self.status_code == 200
|
268
|
+
|
269
|
+
|
270
|
+
@dataclass
|
271
|
+
class OperatorConfigData:
|
272
|
+
"""Data retrieved from operator config endpoint."""
|
273
|
+
|
274
|
+
status_code: int
|
275
|
+
response_text: str
|
276
|
+
config_data: Optional[Dict] = None
|
277
|
+
config_error: Optional[str] = None
|
278
|
+
|
279
|
+
@property
|
280
|
+
def is_valid(self) -> bool:
|
281
|
+
"""Return True if config data is valid."""
|
282
|
+
return self.status_code == 200 and self.config_data is not None
|
283
|
+
|
284
|
+
|
285
|
+
@dataclass
|
286
|
+
class OperatorData:
|
287
|
+
"""Combined data from operator health and config endpoints."""
|
288
|
+
|
289
|
+
health: OperatorHealthData
|
290
|
+
config: OperatorConfigData
|
291
|
+
|
292
|
+
@classmethod
|
293
|
+
def from_dict(cls, data: Dict) -> "OperatorData":
|
294
|
+
"""Create OperatorData from dictionary format used in original code."""
|
295
|
+
health = OperatorHealthData(
|
296
|
+
status_code=data["health_status"], response_text=data.get("health_response")
|
297
|
+
)
|
298
|
+
|
299
|
+
config = OperatorConfigData(
|
300
|
+
status_code=data["config_status"],
|
301
|
+
response_text=data["config_response"],
|
302
|
+
config_data=data.get("config_data"),
|
303
|
+
config_error=data.get("config_error"),
|
304
|
+
)
|
305
|
+
|
306
|
+
return cls(health=health, config=config)
|
307
|
+
|
308
|
+
|
309
|
+
@dataclass
|
310
|
+
class GatewayConfig:
|
311
|
+
"""Gateway configuration from operator."""
|
312
|
+
|
313
|
+
enabled: bool = False
|
314
|
+
name: Optional[str] = None
|
315
|
+
|
316
|
+
@classmethod
|
317
|
+
def from_operator_config(cls, config_data: Optional[Dict]) -> "GatewayConfig":
|
318
|
+
"""Extract gateway config from operator configuration."""
|
319
|
+
if not config_data:
|
320
|
+
return cls()
|
321
|
+
|
322
|
+
gateway_config = config_data.get("gateway", {})
|
323
|
+
if not gateway_config:
|
324
|
+
return cls()
|
325
|
+
|
326
|
+
return cls(
|
327
|
+
enabled=gateway_config.get("enable", False), name=gateway_config.get("name")
|
328
|
+
)
|
329
|
+
|
330
|
+
@property
|
331
|
+
def requires_verification(self) -> bool:
|
332
|
+
"""Return True if gateway verification is required."""
|
333
|
+
return self.enabled and self.name is not None
|
334
|
+
|
335
|
+
|
336
|
+
# =============================================================================
|
337
|
+
# KUBECTL OPERATIONS
|
338
|
+
# =============================================================================
|
339
|
+
|
340
|
+
|
341
|
+
class KubectlOperations:
|
342
|
+
"""Utility class for executing kubectl commands with consistent error handling."""
|
343
|
+
|
344
|
+
def __init__(self, context: str, logger: BlockLogger):
|
345
|
+
self.context = context
|
346
|
+
self.log = logger
|
347
|
+
self._kubectl_path: Optional[str] = None
|
348
|
+
|
349
|
+
def get_resource(
|
350
|
+
self, resource_type: str, name: str, namespace: Optional[str] = None
|
351
|
+
) -> Dict:
|
352
|
+
"""Get a single Kubernetes resource by name."""
|
353
|
+
cmd_args = ["get", resource_type, name, "--context", self.context, "-o", "json"]
|
354
|
+
if namespace:
|
355
|
+
cmd_args.extend(["-n", namespace])
|
356
|
+
|
357
|
+
try:
|
358
|
+
result = self._run_kubectl_command(cmd_args)
|
359
|
+
return json.loads(result.stdout)
|
360
|
+
except subprocess.CalledProcessError as e:
|
361
|
+
if "not found" in e.stderr.lower():
|
362
|
+
raise ResourceNotFoundError(
|
363
|
+
f"{resource_type} '{name}' not found",
|
364
|
+
resource_type=resource_type,
|
365
|
+
resource_name=name,
|
366
|
+
namespace=namespace,
|
367
|
+
)
|
368
|
+
raise KubectlError(
|
369
|
+
f"Failed to get {resource_type} '{name}': {e.stderr}",
|
370
|
+
command=" ".join(cmd_args),
|
371
|
+
stderr=e.stderr,
|
372
|
+
)
|
373
|
+
except json.JSONDecodeError as e:
|
374
|
+
raise KubectlError(
|
375
|
+
f"Invalid JSON response from kubectl: {e}", command=" ".join(cmd_args)
|
376
|
+
)
|
377
|
+
|
378
|
+
def list_resources(
|
379
|
+
self,
|
380
|
+
resource_type: str,
|
381
|
+
namespace: Optional[str] = None,
|
382
|
+
label_selector: Optional[str] = None,
|
383
|
+
all_namespaces: bool = False,
|
384
|
+
) -> List[Dict]:
|
385
|
+
"""List Kubernetes resources with optional filtering."""
|
386
|
+
cmd_args = ["get", resource_type, "--context", self.context, "-o", "json"]
|
387
|
+
|
388
|
+
if all_namespaces:
|
389
|
+
cmd_args.append("--all-namespaces")
|
390
|
+
elif namespace:
|
391
|
+
cmd_args.extend(["-n", namespace])
|
392
|
+
|
393
|
+
if label_selector:
|
394
|
+
cmd_args.extend(["-l", label_selector])
|
395
|
+
|
396
|
+
try:
|
397
|
+
result = self._run_kubectl_command(cmd_args)
|
398
|
+
data = json.loads(result.stdout)
|
399
|
+
return data.get("items", [])
|
400
|
+
except subprocess.CalledProcessError as e:
|
401
|
+
raise KubectlError(
|
402
|
+
f"Failed to list {resource_type}: {e.stderr}",
|
403
|
+
command=" ".join(cmd_args),
|
404
|
+
stderr=e.stderr,
|
405
|
+
)
|
406
|
+
except json.JSONDecodeError as e:
|
407
|
+
raise KubectlError(
|
408
|
+
f"Invalid JSON response from kubectl: {e}", command=" ".join(cmd_args)
|
409
|
+
)
|
410
|
+
|
411
|
+
def get_resource_field(
|
412
|
+
self,
|
413
|
+
resource_type: str,
|
414
|
+
name: str,
|
415
|
+
jsonpath: str,
|
416
|
+
namespace: Optional[str] = None,
|
417
|
+
) -> str:
|
418
|
+
"""Get a specific field from a Kubernetes resource using jsonpath."""
|
419
|
+
cmd_args = [
|
420
|
+
"get",
|
421
|
+
resource_type,
|
422
|
+
name,
|
423
|
+
"--context",
|
424
|
+
self.context,
|
425
|
+
"-o",
|
426
|
+
f"jsonpath={jsonpath}",
|
427
|
+
]
|
428
|
+
if namespace:
|
429
|
+
cmd_args.extend(["-n", namespace])
|
430
|
+
|
431
|
+
try:
|
432
|
+
result = self._run_kubectl_command(cmd_args)
|
433
|
+
return result.stdout.strip()
|
434
|
+
except subprocess.CalledProcessError as e:
|
435
|
+
if "not found" in e.stderr.lower():
|
436
|
+
raise ResourceNotFoundError(
|
437
|
+
f"{resource_type} '{name}' not found",
|
438
|
+
resource_type=resource_type,
|
439
|
+
resource_name=name,
|
440
|
+
namespace=namespace,
|
441
|
+
)
|
442
|
+
raise KubectlError(
|
443
|
+
f"Failed to get field from {resource_type} '{name}': {e.stderr}",
|
444
|
+
command=" ".join(cmd_args),
|
445
|
+
stderr=e.stderr,
|
446
|
+
)
|
447
|
+
|
448
|
+
def get_available_contexts(self) -> List[str]:
|
449
|
+
"""Get list of available kubectl contexts."""
|
450
|
+
try:
|
451
|
+
result = self._run_kubectl_command(["config", "get-contexts", "-o", "name"])
|
452
|
+
contexts = [
|
453
|
+
ctx.strip() for ctx in result.stdout.strip().split("\n") if ctx.strip()
|
454
|
+
]
|
455
|
+
return contexts
|
456
|
+
except subprocess.CalledProcessError as e:
|
457
|
+
raise KubectlError(
|
458
|
+
f"Failed to get kubectl contexts: {e.stderr}",
|
459
|
+
command="kubectl config get-contexts -o name",
|
460
|
+
stderr=e.stderr,
|
461
|
+
)
|
462
|
+
|
463
|
+
def get_current_context(self) -> Optional[str]:
|
464
|
+
"""Get the current kubectl context."""
|
465
|
+
try:
|
466
|
+
result = self._run_kubectl_command(["config", "current-context"])
|
467
|
+
return result.stdout.strip()
|
468
|
+
except subprocess.CalledProcessError as e:
|
469
|
+
if "current-context is not set" in e.stderr.lower():
|
470
|
+
return None
|
471
|
+
raise KubectlError(
|
472
|
+
f"Failed to get current context: {e.stderr}",
|
473
|
+
command="kubectl config current-context",
|
474
|
+
stderr=e.stderr,
|
475
|
+
)
|
476
|
+
|
477
|
+
def start_port_forward(
|
478
|
+
self, pod_name: str, local_port: int, remote_port: int, namespace: str
|
479
|
+
) -> subprocess.Popen:
|
480
|
+
"""Start port forwarding to a pod."""
|
481
|
+
cmd_args = [
|
482
|
+
"port-forward",
|
483
|
+
"--context",
|
484
|
+
self.context,
|
485
|
+
"-n",
|
486
|
+
namespace,
|
487
|
+
pod_name,
|
488
|
+
f"{local_port}:{remote_port}",
|
489
|
+
]
|
490
|
+
|
491
|
+
try:
|
492
|
+
cmd = self._get_kubectl_cmd(cmd_args)
|
493
|
+
process = subprocess.Popen(
|
494
|
+
cmd,
|
495
|
+
stdout=subprocess.PIPE,
|
496
|
+
stderr=subprocess.PIPE,
|
497
|
+
preexec_fn=os.setsid, # Create new process group for cleanup
|
498
|
+
)
|
499
|
+
return process
|
500
|
+
except (subprocess.CalledProcessError, OSError) as e:
|
501
|
+
raise KubectlError(
|
502
|
+
f"Failed to start port forward to {pod_name}: {e}",
|
503
|
+
command=" ".join(cmd_args),
|
504
|
+
)
|
505
|
+
|
506
|
+
def check_kubectl_available(self) -> bool:
|
507
|
+
"""Check if kubectl command is available."""
|
508
|
+
try:
|
509
|
+
self._run_kubectl_command(["version", "--client"])
|
510
|
+
return True
|
511
|
+
except (subprocess.CalledProcessError, FileNotFoundError, KubectlNotFoundError):
|
512
|
+
return False
|
513
|
+
|
514
|
+
def get_pod_status(self, pod_name: str, namespace: str) -> str:
|
515
|
+
"""
|
516
|
+
Get pod status phase in specific namespace.
|
517
|
+
|
518
|
+
Args:
|
519
|
+
pod_name: Name of the pod
|
520
|
+
namespace: Namespace containing the pod
|
521
|
+
|
522
|
+
Returns:
|
523
|
+
Pod status phase (e.g., "Running", "Pending") or "unknown" if cannot be determined
|
524
|
+
"""
|
525
|
+
try:
|
526
|
+
return self.get_resource_field(
|
527
|
+
"pod", pod_name, "{.status.phase}", namespace=namespace
|
528
|
+
)
|
529
|
+
except (KubectlError, ResourceNotFoundError):
|
530
|
+
# Return "unknown" if status cannot be determined
|
531
|
+
return "unknown"
|
532
|
+
|
533
|
+
def is_pod_running(self, pod_name: str, namespace: str) -> bool:
|
534
|
+
"""
|
535
|
+
Check if pod is in running state.
|
536
|
+
|
537
|
+
Args:
|
538
|
+
pod_name: Name of the pod
|
539
|
+
namespace: Namespace containing the pod
|
540
|
+
|
541
|
+
Returns:
|
542
|
+
True if pod is running, False otherwise
|
543
|
+
"""
|
544
|
+
try:
|
545
|
+
status = self.get_resource_field(
|
546
|
+
"pod", pod_name, "{.status.phase}", namespace=namespace
|
547
|
+
)
|
548
|
+
return status == RUNNING_STATUS
|
549
|
+
except (KubectlError, ResourceNotFoundError):
|
550
|
+
# Return False if status check fails
|
551
|
+
return False
|
552
|
+
|
553
|
+
def _run_kubectl_command(self, args: List[str]) -> subprocess.CompletedProcess:
|
554
|
+
"""Execute a kubectl command with the given arguments."""
|
555
|
+
cmd = self._get_kubectl_cmd(args)
|
556
|
+
return subprocess.run(cmd, capture_output=True, text=True, check=True)
|
557
|
+
|
558
|
+
def _get_kubectl_cmd(self, args: List[str]) -> List[str]:
|
559
|
+
"""Get kubectl command with proper binary path."""
|
560
|
+
kubectl_path = self._find_kubectl_binary()
|
561
|
+
if not kubectl_path:
|
562
|
+
raise KubectlNotFoundError(
|
563
|
+
"kubectl command not found. Please install kubectl and ensure it's in your PATH."
|
564
|
+
)
|
565
|
+
return [kubectl_path] + args
|
566
|
+
|
567
|
+
def _find_kubectl_binary(self) -> Optional[str]:
|
568
|
+
"""Find kubectl binary in common locations."""
|
569
|
+
if self._kubectl_path:
|
570
|
+
return self._kubectl_path
|
571
|
+
|
572
|
+
# Try to find kubectl using shutil.which first (respects PATH)
|
573
|
+
kubectl_path = shutil.which("kubectl")
|
574
|
+
if kubectl_path:
|
575
|
+
self._kubectl_path = kubectl_path
|
576
|
+
return kubectl_path
|
577
|
+
|
578
|
+
# Try common installation locations
|
579
|
+
for path in KUBECTL_COMMON_PATHS:
|
580
|
+
expanded_path = os.path.expanduser(path)
|
581
|
+
if os.path.isfile(expanded_path) and os.access(expanded_path, os.X_OK):
|
582
|
+
self._kubectl_path = expanded_path
|
583
|
+
return expanded_path
|
584
|
+
|
585
|
+
return None
|
586
|
+
|
587
|
+
|
588
|
+
# =============================================================================
|
589
|
+
# OPERATOR VERIFIER
|
590
|
+
# =============================================================================
|
591
|
+
|
592
|
+
|
593
|
+
class OperatorVerifier:
|
594
|
+
"""Handles verification of Anyscale operator pod, health, and identity."""
|
595
|
+
|
596
|
+
def __init__(
|
597
|
+
self,
|
598
|
+
kubectl_ops: KubectlOperations,
|
599
|
+
k8s_config: KubernetesConfig,
|
600
|
+
logger: BlockLogger,
|
601
|
+
):
|
602
|
+
self.kubectl = kubectl_ops
|
603
|
+
self.config = k8s_config
|
604
|
+
self.log = logger
|
605
|
+
|
606
|
+
def find_operator_pod(self) -> str:
|
607
|
+
"""Find and verify operator pod is running."""
|
608
|
+
try:
|
609
|
+
pods = self.kubectl.list_resources(
|
610
|
+
"pods",
|
611
|
+
namespace=self.config.operator_namespace,
|
612
|
+
label_selector=OPERATOR_LABEL_SELECTOR,
|
613
|
+
)
|
614
|
+
except KubectlError as e:
|
615
|
+
raise OperatorPodNotFoundError(f"Failed to list operator pods: {e}")
|
616
|
+
|
617
|
+
if not pods:
|
618
|
+
raise OperatorPodNotFoundError(
|
619
|
+
"No Anyscale operator pods found. Expected pods with labels like "
|
620
|
+
"'app=anyscale-operator'"
|
621
|
+
)
|
622
|
+
|
623
|
+
operator_pod = pods[0]["metadata"]["name"]
|
624
|
+
|
625
|
+
if not self.kubectl.is_pod_running(
|
626
|
+
operator_pod, self.config.operator_namespace
|
627
|
+
):
|
628
|
+
raise OperatorPodNotFoundError(
|
629
|
+
f"Operator pod '{operator_pod}' is not running"
|
630
|
+
)
|
631
|
+
|
632
|
+
return operator_pod
|
633
|
+
|
634
|
+
def get_operator_data(self, pod_name: str) -> OperatorData:
|
635
|
+
"""Port forward to operator and fetch both health and config data."""
|
636
|
+
try:
|
637
|
+
with self._port_forward_to_operator(pod_name) as local_port:
|
638
|
+
# Fetch health data
|
639
|
+
health_data = self._fetch_health_data(local_port)
|
640
|
+
|
641
|
+
# Fetch config data
|
642
|
+
config_data = self._fetch_config_data(local_port)
|
643
|
+
|
644
|
+
return OperatorData(health=health_data, config=config_data)
|
645
|
+
|
646
|
+
except requests.RequestException as e:
|
647
|
+
raise OperatorConnectionError(
|
648
|
+
f"Cannot connect to operator endpoints: {e}", pod_name=pod_name
|
649
|
+
)
|
650
|
+
except RuntimeError as e:
|
651
|
+
raise PortForwardError(
|
652
|
+
f"Port forwarding failed: {e}",
|
653
|
+
pod_name=pod_name,
|
654
|
+
port=OPERATOR_HEALTH_PORT,
|
655
|
+
)
|
656
|
+
|
657
|
+
def verify_operator_health(self, operator_data: OperatorData) -> bool:
|
658
|
+
"""Verify operator health using pre-fetched data."""
|
659
|
+
if operator_data.health.is_healthy:
|
660
|
+
return True
|
661
|
+
else:
|
662
|
+
self.log.error(
|
663
|
+
f"Health check failed - HTTP {operator_data.health.status_code}"
|
664
|
+
)
|
665
|
+
if operator_data.health.response_text:
|
666
|
+
self.log.error(f"Response: {operator_data.health.response_text}")
|
667
|
+
return False
|
668
|
+
|
669
|
+
def verify_operator_identity(
|
670
|
+
self,
|
671
|
+
operator_data: OperatorData,
|
672
|
+
kubernetes_config: OpenAPIKubernetesConfig,
|
673
|
+
cloud_provider: Optional[CloudProviders],
|
674
|
+
) -> bool:
|
675
|
+
"""Verify operator identity using pre-fetched config data."""
|
676
|
+
# Validate kubernetes_config contents
|
677
|
+
expected_identity = kubernetes_config.anyscale_operator_iam_identity
|
678
|
+
if not expected_identity:
|
679
|
+
self.log.error(
|
680
|
+
"Missing 'anyscale_operator_iam_identity' in kubernetes config"
|
681
|
+
)
|
682
|
+
return False
|
683
|
+
|
684
|
+
# Validate config response
|
685
|
+
if not operator_data.config.is_valid:
|
686
|
+
self.log.error(
|
687
|
+
f"Config endpoint returned HTTP {operator_data.config.status_code}"
|
688
|
+
)
|
689
|
+
if operator_data.config.response_text:
|
690
|
+
self.log.error(f"Response: {operator_data.config.response_text}")
|
691
|
+
return False
|
692
|
+
|
693
|
+
# Extract actual identity from config
|
694
|
+
if operator_data.config.config_data is None:
|
695
|
+
self.log.error("Operator config data is None")
|
696
|
+
return False
|
697
|
+
|
698
|
+
actual_identity = operator_data.config.config_data.get("iamIdentity")
|
699
|
+
if not actual_identity:
|
700
|
+
self.log.error("Operator config missing 'iamIdentity' field")
|
701
|
+
return False
|
702
|
+
|
703
|
+
# Perform identity comparison
|
704
|
+
if self._evaluate_identity_match(
|
705
|
+
expected_identity, actual_identity, cloud_provider
|
706
|
+
):
|
707
|
+
self.log.info(
|
708
|
+
f"AWS identity match: Role matches (Expected: {expected_identity})"
|
709
|
+
)
|
710
|
+
self.log.info("Expected IAM role matches actual assumed role")
|
711
|
+
return True
|
712
|
+
else:
|
713
|
+
self.log.error("Operator identity mismatch")
|
714
|
+
self.log.error(f"Expected: {expected_identity}")
|
715
|
+
self.log.error(f"Actual: {actual_identity}")
|
716
|
+
return False
|
717
|
+
|
718
|
+
@contextmanager
|
719
|
+
def _port_forward_to_operator(self, pod_name: str):
|
720
|
+
"""Context manager that port forwards to operator pod."""
|
721
|
+
port_forward_process = None
|
722
|
+
local_port = None
|
723
|
+
try:
|
724
|
+
# Get a free port for port forwarding
|
725
|
+
local_port = self._get_free_port()
|
726
|
+
self.log.info(f"Using local port {local_port} for port forwarding")
|
727
|
+
|
728
|
+
# Start port forwarding to the pod
|
729
|
+
self.log.info(
|
730
|
+
f"Starting port forward to pod {pod_name} on port {local_port}:{OPERATOR_HEALTH_PORT}..."
|
731
|
+
)
|
732
|
+
|
733
|
+
port_forward_process = self.kubectl.start_port_forward(
|
734
|
+
pod_name,
|
735
|
+
local_port,
|
736
|
+
OPERATOR_HEALTH_PORT,
|
737
|
+
self.config.operator_namespace,
|
738
|
+
)
|
739
|
+
|
740
|
+
# Wait for port forward to establish
|
741
|
+
self.log.info("Waiting for port forward to establish...")
|
742
|
+
time.sleep(PORT_FORWARD_WAIT_TIME)
|
743
|
+
|
744
|
+
# Check if port forward process is still running
|
745
|
+
if port_forward_process.poll() is not None:
|
746
|
+
stderr = (
|
747
|
+
port_forward_process.stderr.read().decode()
|
748
|
+
if port_forward_process.stderr
|
749
|
+
else ""
|
750
|
+
)
|
751
|
+
raise RuntimeError(f"Port forward failed to start: {stderr}")
|
752
|
+
|
753
|
+
# Yield the local port to the calling function
|
754
|
+
yield local_port
|
755
|
+
|
756
|
+
finally:
|
757
|
+
# Clean up port forward process
|
758
|
+
if port_forward_process and port_forward_process.poll() is None:
|
759
|
+
try:
|
760
|
+
# Kill the entire process group to ensure cleanup
|
761
|
+
os.killpg(os.getpgid(port_forward_process.pid), signal.SIGTERM)
|
762
|
+
port_forward_process.wait(timeout=PORT_FORWARD_TERMINATION_TIMEOUT)
|
763
|
+
except (ProcessLookupError, subprocess.TimeoutExpired):
|
764
|
+
# Force kill if graceful termination fails
|
765
|
+
with suppress(ProcessLookupError):
|
766
|
+
os.killpg(os.getpgid(port_forward_process.pid), signal.SIGKILL)
|
767
|
+
except (OSError, ValueError) as e:
|
768
|
+
self.log.warning(f"Port forward cleanup warning: {e}")
|
769
|
+
|
770
|
+
def _get_free_port(self) -> int:
|
771
|
+
"""Get a random free port on localhost."""
|
772
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
773
|
+
s.bind(("", 0))
|
774
|
+
s.listen(1)
|
775
|
+
port = s.getsockname()[1]
|
776
|
+
return port
|
777
|
+
|
778
|
+
def _fetch_health_data(self, local_port: int) -> OperatorHealthData:
|
779
|
+
"""Fetch health data from operator."""
|
780
|
+
response = requests.get(
|
781
|
+
f"http://localhost:{local_port}{OPERATOR_HEALTH_ENDPOINT}",
|
782
|
+
timeout=HTTP_REQUEST_TIMEOUT,
|
783
|
+
)
|
784
|
+
|
785
|
+
return OperatorHealthData(
|
786
|
+
status_code=response.status_code,
|
787
|
+
response_text=response.text if response.status_code != 200 else None,
|
788
|
+
)
|
789
|
+
|
790
|
+
def _fetch_config_data(self, local_port: int) -> OperatorConfigData:
|
791
|
+
"""Fetch config data from operator."""
|
792
|
+
response = requests.get(
|
793
|
+
f"http://localhost:{local_port}{OPERATOR_CONFIG_ENDPOINT}",
|
794
|
+
timeout=HTTP_REQUEST_TIMEOUT,
|
795
|
+
)
|
796
|
+
|
797
|
+
config_data = None
|
798
|
+
config_error = None
|
799
|
+
|
800
|
+
if response.status_code == 200:
|
801
|
+
try:
|
802
|
+
config_data = response.json()
|
803
|
+
except json.JSONDecodeError as e:
|
804
|
+
config_error = str(e)
|
805
|
+
|
806
|
+
return OperatorConfigData(
|
807
|
+
status_code=response.status_code,
|
808
|
+
response_text=response.text,
|
809
|
+
config_data=config_data,
|
810
|
+
config_error=config_error,
|
811
|
+
)
|
812
|
+
|
813
|
+
def _evaluate_identity_match(
|
814
|
+
self,
|
815
|
+
expected_identity: str,
|
816
|
+
actual_identity: str,
|
817
|
+
cloud_provider: Optional[CloudProviders],
|
818
|
+
) -> bool:
|
819
|
+
"""Evaluate if the operator identity matches expected identity based on cloud provider."""
|
820
|
+
if not expected_identity or not actual_identity:
|
821
|
+
return False
|
822
|
+
|
823
|
+
# Convert to string for comparison, default to AWS
|
824
|
+
cloud_provider_str = str(cloud_provider) if cloud_provider else "AWS"
|
825
|
+
|
826
|
+
# Handle cloud provider specific identity comparison
|
827
|
+
if cloud_provider_str == "AWS":
|
828
|
+
return self._evaluate_aws_identity(expected_identity, actual_identity)
|
829
|
+
elif cloud_provider_str == "GCP":
|
830
|
+
return self._evaluate_gcp_identity(expected_identity, actual_identity)
|
831
|
+
elif cloud_provider_str == "AZURE":
|
832
|
+
return self._evaluate_azure_identity(expected_identity, actual_identity)
|
833
|
+
else:
|
834
|
+
# For unknown providers, fall back to exact string comparison
|
835
|
+
self.log.warning(
|
836
|
+
f"Unknown cloud provider '{cloud_provider}', using exact string comparison"
|
837
|
+
)
|
838
|
+
return expected_identity == actual_identity
|
839
|
+
|
840
|
+
def _evaluate_aws_identity(
|
841
|
+
self, expected_identity: str, actual_identity: str
|
842
|
+
) -> bool:
|
843
|
+
"""Evaluate AWS IAM identity comparison."""
|
844
|
+
try:
|
845
|
+
# If they're exactly equal, that's fine
|
846
|
+
if expected_identity == actual_identity:
|
847
|
+
return True
|
848
|
+
|
849
|
+
# Check if actual is an assumed role version of expected role
|
850
|
+
if self._is_aws_assumed_role(actual_identity):
|
851
|
+
# Extract the role name from both ARNs
|
852
|
+
expected_role = self._extract_aws_role_name(expected_identity)
|
853
|
+
actual_role = self._extract_aws_role_name_from_assumed_role(
|
854
|
+
actual_identity
|
855
|
+
)
|
856
|
+
|
857
|
+
if expected_role and actual_role and expected_role == actual_role:
|
858
|
+
# Also check account ID matches
|
859
|
+
expected_account = self._extract_aws_account_id(expected_identity)
|
860
|
+
actual_account = self._extract_aws_account_id(actual_identity)
|
861
|
+
|
862
|
+
if expected_account == actual_account:
|
863
|
+
self.log.info(
|
864
|
+
f"AWS identity match: Role '{expected_role}' (account: {expected_account})"
|
865
|
+
)
|
866
|
+
return True
|
867
|
+
|
868
|
+
return False
|
869
|
+
|
870
|
+
except (ValueError, IndexError, AttributeError) as e:
|
871
|
+
self.log.error(f"Error evaluating AWS identity: {e}")
|
872
|
+
return False
|
873
|
+
|
874
|
+
def _evaluate_gcp_identity(
|
875
|
+
self, expected_identity: str, actual_identity: str
|
876
|
+
) -> bool:
|
877
|
+
"""Evaluate GCP identity comparison."""
|
878
|
+
return expected_identity == actual_identity
|
879
|
+
|
880
|
+
def _evaluate_azure_identity(
|
881
|
+
self, expected_identity: str, actual_identity: str
|
882
|
+
) -> bool:
|
883
|
+
"""Evaluate Azure identity comparison."""
|
884
|
+
return expected_identity == actual_identity
|
885
|
+
|
886
|
+
def _is_aws_assumed_role(self, arn: str) -> bool:
|
887
|
+
"""Check if ARN is an assumed role ARN."""
|
888
|
+
return arn.startswith("arn:aws:sts:") and ":assumed-role/" in arn
|
889
|
+
|
890
|
+
def _extract_aws_role_name(self, role_arn: str) -> Optional[str]:
|
891
|
+
"""Extract role name from IAM role ARN."""
|
892
|
+
try:
|
893
|
+
if ":role/" in role_arn:
|
894
|
+
return role_arn.split(":role/")[-1]
|
895
|
+
return None
|
896
|
+
except (ValueError, IndexError):
|
897
|
+
return None
|
898
|
+
|
899
|
+
def _extract_aws_role_name_from_assumed_role(
|
900
|
+
self, assumed_role_arn: str
|
901
|
+
) -> Optional[str]:
|
902
|
+
"""Extract role name from assumed role ARN."""
|
903
|
+
try:
|
904
|
+
if ":assumed-role/" in assumed_role_arn:
|
905
|
+
parts = assumed_role_arn.split(":assumed-role/")[-1].split("/")
|
906
|
+
if len(parts) >= 1:
|
907
|
+
return parts[0] # Role name is first part after assumed-role/
|
908
|
+
return None
|
909
|
+
except (ValueError, IndexError):
|
910
|
+
return None
|
911
|
+
|
912
|
+
def _extract_aws_account_id(self, arn: str) -> Optional[str]:
|
913
|
+
"""Extract AWS account ID from any ARN."""
|
914
|
+
try:
|
915
|
+
# ARN format: arn:partition:service:region:account-id:resource
|
916
|
+
parts = arn.split(":")
|
917
|
+
if len(parts) >= 5:
|
918
|
+
return parts[4]
|
919
|
+
return None
|
920
|
+
except (ValueError, IndexError):
|
921
|
+
return None
|
922
|
+
|
923
|
+
|
924
|
+
# =============================================================================
|
925
|
+
# STORAGE VERIFIER
|
926
|
+
# =============================================================================
|
927
|
+
|
928
|
+
|
929
|
+
class StorageVerifier:
|
930
|
+
"""Handles verification of file storage components for Kubernetes deployments."""
|
931
|
+
|
932
|
+
def __init__(
|
933
|
+
self,
|
934
|
+
kubectl_ops: KubectlOperations,
|
935
|
+
k8s_config: KubernetesConfig,
|
936
|
+
logger: BlockLogger,
|
937
|
+
):
|
938
|
+
self.kubectl = kubectl_ops
|
939
|
+
self.config = k8s_config
|
940
|
+
self.log = logger
|
941
|
+
|
942
|
+
def verify_file_storage(
|
943
|
+
self, file_storage: FileStorage, cloud_deployment: CloudDeployment
|
944
|
+
) -> bool:
|
945
|
+
"""Verify file storage configuration (non-functional checks only)."""
|
946
|
+
self.log.info("Verifying file storage configuration...")
|
947
|
+
verification_results = []
|
948
|
+
|
949
|
+
if getattr(file_storage, "csi_ephemeral_volume_driver", None):
|
950
|
+
driver_name = file_storage.csi_ephemeral_volume_driver
|
951
|
+
if driver_name:
|
952
|
+
self.log.info(f"Checking CSI driver: {driver_name}")
|
953
|
+
result = self._verify_csi_driver(driver_name)
|
954
|
+
verification_results.append(("CSI driver", result))
|
955
|
+
|
956
|
+
if getattr(file_storage, "persistent_volume_claim", None):
|
957
|
+
pvc_name = file_storage.persistent_volume_claim
|
958
|
+
if pvc_name:
|
959
|
+
self.log.info(f"Checking PVC: {pvc_name}")
|
960
|
+
result = self._verify_pvc(pvc_name)
|
961
|
+
verification_results.append(("PVC", result))
|
962
|
+
|
963
|
+
if getattr(file_storage, "file_storage_id", None):
|
964
|
+
self.log.info("Checking NFS file storage exists via cloud provider APIs...")
|
965
|
+
try:
|
966
|
+
nfs_exists = verify_file_storage_exists(
|
967
|
+
file_storage, cloud_deployment, logger=self.log
|
968
|
+
)
|
969
|
+
verification_results.append(("NFS", nfs_exists))
|
970
|
+
except (ValueError, KeyError, TypeError, ImportError) as e:
|
971
|
+
self.log.error(
|
972
|
+
f"Cloud provider API error while verifying file storage: {e}"
|
973
|
+
)
|
974
|
+
raise RuntimeError(
|
975
|
+
f"Cloud provider API error while verifying file storage: {e}"
|
976
|
+
) from e
|
977
|
+
|
978
|
+
# Return overall success
|
979
|
+
if verification_results:
|
980
|
+
return all(result for _, result in verification_results)
|
981
|
+
else:
|
982
|
+
self.log.info("INFO: No file storage components found to verify")
|
983
|
+
return True
|
984
|
+
|
985
|
+
def _verify_csi_driver(self, driver_name: str) -> bool:
|
986
|
+
"""Check if CSI driver exists on cluster."""
|
987
|
+
try:
|
988
|
+
driver_info = self.kubectl.get_resource("csidriver", driver_name)
|
989
|
+
|
990
|
+
# Parse driver details for logging
|
991
|
+
driver_spec = driver_info.get("spec", {})
|
992
|
+
self.log.info(f"CSI driver '{driver_name}' is available")
|
993
|
+
self.log.info(
|
994
|
+
f"Attach required: {driver_spec.get('attachRequired', 'unknown')}"
|
995
|
+
)
|
996
|
+
self.log.info(
|
997
|
+
f"Pod info on mount: {driver_spec.get('podInfoOnMount', 'unknown')}"
|
998
|
+
)
|
999
|
+
return True
|
1000
|
+
|
1001
|
+
except ResourceNotFoundError:
|
1002
|
+
self.log.error(f"CSI driver '{driver_name}' not found")
|
1003
|
+
self.log.error("Available CSI drivers:")
|
1004
|
+
self._list_available_csi_drivers()
|
1005
|
+
return False
|
1006
|
+
|
1007
|
+
except Exception as e: # noqa: BLE001
|
1008
|
+
self.log.error(f"Failed to query CSI driver: {e}")
|
1009
|
+
raise RuntimeError(
|
1010
|
+
f"kubectl error while verifying CSI driver '{driver_name}': {e}"
|
1011
|
+
) from e
|
1012
|
+
|
1013
|
+
def _verify_pvc(self, pvc_name: str) -> bool:
|
1014
|
+
"""Check if PVC exists and is bound in operator namespace."""
|
1015
|
+
try:
|
1016
|
+
pvc_data = self.kubectl.get_resource(
|
1017
|
+
"pvc", pvc_name, namespace=self.config.operator_namespace
|
1018
|
+
)
|
1019
|
+
|
1020
|
+
status = pvc_data.get("status", {})
|
1021
|
+
phase = status.get("phase")
|
1022
|
+
capacity = status.get("capacity", {})
|
1023
|
+
storage_class = pvc_data.get("spec", {}).get("storageClassName")
|
1024
|
+
|
1025
|
+
if phase == "Bound":
|
1026
|
+
self.log.info(f"PVC '{pvc_name}' is bound")
|
1027
|
+
self.log.info(f"Capacity: {capacity.get('storage', 'unknown')}")
|
1028
|
+
self.log.info(f"Storage class: {storage_class or 'default'}")
|
1029
|
+
return True
|
1030
|
+
else:
|
1031
|
+
self.log.error(
|
1032
|
+
f"FAILED: PVC '{pvc_name}' is not bound (status: {phase})"
|
1033
|
+
)
|
1034
|
+
return False
|
1035
|
+
|
1036
|
+
except ResourceNotFoundError:
|
1037
|
+
self.log.error(
|
1038
|
+
f"FAILED: PVC '{pvc_name}' not found in namespace '{self.config.operator_namespace}'"
|
1039
|
+
)
|
1040
|
+
self.log.error("Available PVCs in namespace:")
|
1041
|
+
self._list_available_pvcs()
|
1042
|
+
return False
|
1043
|
+
|
1044
|
+
except Exception as e: # noqa: BLE001
|
1045
|
+
self.log.error(f"FAILED: Failed to check PVC '{pvc_name}': {e}")
|
1046
|
+
raise RuntimeError(
|
1047
|
+
f"kubectl error while verifying PVC '{pvc_name}': {e}"
|
1048
|
+
) from e
|
1049
|
+
|
1050
|
+
def _list_available_csi_drivers(self) -> None:
|
1051
|
+
"""List available CSI drivers for troubleshooting."""
|
1052
|
+
try:
|
1053
|
+
drivers = self.kubectl.list_resources("csidrivers")
|
1054
|
+
if drivers:
|
1055
|
+
for driver in drivers:
|
1056
|
+
name = driver.get("metadata", {}).get("name", "unknown")
|
1057
|
+
self.log.error(f" - {name}")
|
1058
|
+
else:
|
1059
|
+
self.log.error(" (no CSI drivers found in cluster)")
|
1060
|
+
except Exception: # noqa: BLE001
|
1061
|
+
self.log.error(" (failed to list CSI drivers)")
|
1062
|
+
|
1063
|
+
def _list_available_pvcs(self) -> None:
|
1064
|
+
"""List available PVCs for troubleshooting."""
|
1065
|
+
try:
|
1066
|
+
pvcs = self.kubectl.list_resources(
|
1067
|
+
"pvcs", namespace=self.config.operator_namespace
|
1068
|
+
)
|
1069
|
+
if pvcs:
|
1070
|
+
for pvc in pvcs:
|
1071
|
+
name = pvc.get("metadata", {}).get("name", "unknown")
|
1072
|
+
self.log.error(f" - {name}")
|
1073
|
+
else:
|
1074
|
+
self.log.error(
|
1075
|
+
f" (no PVCs found in namespace '{self.config.operator_namespace}')"
|
1076
|
+
)
|
1077
|
+
except Exception: # noqa: BLE001
|
1078
|
+
self.log.error(" (failed to list PVCs)")
|
1079
|
+
|
1080
|
+
|
1081
|
+
# =============================================================================
|
1082
|
+
# GATEWAY VERIFIER
|
1083
|
+
# =============================================================================
|
1084
|
+
|
1085
|
+
|
1086
|
+
class GatewayVerifier:
|
1087
|
+
"""Handles verification of gateway and ingress components for Kubernetes deployments."""
|
1088
|
+
|
1089
|
+
def __init__(
|
1090
|
+
self,
|
1091
|
+
kubectl_ops: KubectlOperations,
|
1092
|
+
k8s_config: KubernetesConfig,
|
1093
|
+
logger: BlockLogger,
|
1094
|
+
):
|
1095
|
+
self.kubectl = kubectl_ops
|
1096
|
+
self.config = k8s_config
|
1097
|
+
self.log = logger
|
1098
|
+
|
1099
|
+
def verify_gateway_support(self, operator_data: OperatorData) -> bool:
|
1100
|
+
"""Verify gateway support using pre-fetched config data."""
|
1101
|
+
if not operator_data.config.is_valid:
|
1102
|
+
self.log.warning(
|
1103
|
+
"Could not retrieve operator configuration - skipping gateway verification"
|
1104
|
+
)
|
1105
|
+
return True
|
1106
|
+
|
1107
|
+
# Extract gateway configuration from operator data
|
1108
|
+
gateway_config = GatewayConfig.from_operator_config(
|
1109
|
+
operator_data.config.config_data
|
1110
|
+
)
|
1111
|
+
|
1112
|
+
if not gateway_config.enabled:
|
1113
|
+
self.log.info(
|
1114
|
+
"Gateway support is not enabled - skipping gateway verification"
|
1115
|
+
)
|
1116
|
+
return True
|
1117
|
+
|
1118
|
+
if not gateway_config.requires_verification:
|
1119
|
+
self.log.error(
|
1120
|
+
"Gateway is enabled but no gateway name found in operator configuration"
|
1121
|
+
)
|
1122
|
+
return False
|
1123
|
+
|
1124
|
+
# Verify gateway exists in cluster
|
1125
|
+
assert (
|
1126
|
+
gateway_config.name is not None
|
1127
|
+
) # guaranteed by requires_verification check
|
1128
|
+
return self._verify_gateway_exists(gateway_config.name)
|
1129
|
+
|
1130
|
+
def verify_nginx_ingress(self) -> bool:
|
1131
|
+
"""Check for NGINX ingress controller (warning only)."""
|
1132
|
+
try:
|
1133
|
+
self.log.info("Checking for NGINX ingress controller...")
|
1134
|
+
|
1135
|
+
# Try different NGINX ingress controller configurations
|
1136
|
+
for config_dict in NGINX_INGRESS_CONFIGS:
|
1137
|
+
nginx_pod = self._find_nginx_pod(
|
1138
|
+
config_dict["namespace"], config_dict["label"]
|
1139
|
+
)
|
1140
|
+
if nginx_pod:
|
1141
|
+
if self.kubectl.is_pod_running(nginx_pod, config_dict["namespace"]):
|
1142
|
+
self.log.info(
|
1143
|
+
f"PASSED: Found running NGINX ingress controller: {nginx_pod} "
|
1144
|
+
f"(namespace: {config_dict['namespace']})"
|
1145
|
+
)
|
1146
|
+
return True
|
1147
|
+
else:
|
1148
|
+
pod_status = self.kubectl.get_pod_status(
|
1149
|
+
nginx_pod, config_dict["namespace"]
|
1150
|
+
)
|
1151
|
+
self.log.warning(
|
1152
|
+
f"WARNING: Found NGINX ingress controller '{nginx_pod}' "
|
1153
|
+
f"but it's not running (status: {pod_status})"
|
1154
|
+
)
|
1155
|
+
|
1156
|
+
# Try fallback search by name patterns
|
1157
|
+
if self._find_nginx_by_name_pattern():
|
1158
|
+
return True
|
1159
|
+
|
1160
|
+
# No NGINX ingress controller found
|
1161
|
+
self.log.warning("No NGINX ingress controller found")
|
1162
|
+
self.log.warning("This may impact ingress routing capabilities")
|
1163
|
+
self.log.warning("Available ingress controllers:")
|
1164
|
+
self._list_available_ingress_controllers()
|
1165
|
+
return False
|
1166
|
+
|
1167
|
+
except (KubectlError, ResourceNotFoundError) as e:
|
1168
|
+
self.log.warning(f"WARNING: Could not verify NGINX ingress controller: {e}")
|
1169
|
+
raise RuntimeError(
|
1170
|
+
f"kubectl error during NGINX ingress verification: {e}"
|
1171
|
+
) from e
|
1172
|
+
|
1173
|
+
def _verify_gateway_exists(self, gateway_name: str) -> bool:
|
1174
|
+
"""Verify that the specified gateway exists in the cluster."""
|
1175
|
+
try:
|
1176
|
+
# Try to find gateway in common Gateway API resource types
|
1177
|
+
for resource_type in GATEWAY_RESOURCE_TYPES:
|
1178
|
+
if self._check_gateway_resource(resource_type, gateway_name):
|
1179
|
+
return True
|
1180
|
+
|
1181
|
+
# If not found in operator namespace, try cluster-wide search
|
1182
|
+
self.log.info(
|
1183
|
+
f"Gateway '{gateway_name}' not found in operator namespace, "
|
1184
|
+
"searching cluster-wide..."
|
1185
|
+
)
|
1186
|
+
for resource_type in GATEWAY_RESOURCE_TYPES:
|
1187
|
+
if self._check_gateway_resource_cluster_wide(
|
1188
|
+
resource_type, gateway_name
|
1189
|
+
):
|
1190
|
+
return True
|
1191
|
+
|
1192
|
+
self.log.error(f"FAILED: Gateway '{gateway_name}' not found in cluster")
|
1193
|
+
self.log.error("Available gateways:")
|
1194
|
+
self._list_available_gateways()
|
1195
|
+
return False
|
1196
|
+
|
1197
|
+
except (KubectlError, ResourceNotFoundError) as e:
|
1198
|
+
self.log.error(f"FAILED: Failed to verify gateway '{gateway_name}': {e}")
|
1199
|
+
raise RuntimeError(
|
1200
|
+
f"kubectl error while verifying gateway '{gateway_name}': {e}"
|
1201
|
+
) from e
|
1202
|
+
|
1203
|
+
def _check_gateway_resource(self, resource_type: str, gateway_name: str) -> bool:
|
1204
|
+
"""Check for gateway resource in operator namespace."""
|
1205
|
+
try:
|
1206
|
+
gateway_data = self.kubectl.get_resource(
|
1207
|
+
resource_type, gateway_name, namespace=self.config.operator_namespace
|
1208
|
+
)
|
1209
|
+
|
1210
|
+
self.log.info(
|
1211
|
+
f"PASSED: Gateway '{gateway_name}' found in namespace '{self.config.operator_namespace}'"
|
1212
|
+
)
|
1213
|
+
|
1214
|
+
# Log gateway status if available
|
1215
|
+
status = gateway_data.get("status", {})
|
1216
|
+
conditions = status.get("conditions", [])
|
1217
|
+
for condition in conditions:
|
1218
|
+
if (
|
1219
|
+
condition.get("type") == "Ready"
|
1220
|
+
and condition.get("status") == "True"
|
1221
|
+
):
|
1222
|
+
self.log.info(" Status: Ready")
|
1223
|
+
break
|
1224
|
+
|
1225
|
+
return True
|
1226
|
+
|
1227
|
+
except ResourceNotFoundError:
|
1228
|
+
return False
|
1229
|
+
|
1230
|
+
def _check_gateway_resource_cluster_wide(
|
1231
|
+
self, resource_type: str, gateway_name: str
|
1232
|
+
) -> bool:
|
1233
|
+
"""Check for gateway resource cluster-wide."""
|
1234
|
+
try:
|
1235
|
+
gateways = self.kubectl.list_resources(resource_type, all_namespaces=True)
|
1236
|
+
|
1237
|
+
for gateway in gateways:
|
1238
|
+
if gateway.get("metadata", {}).get("name") == gateway_name:
|
1239
|
+
namespace = gateway.get("metadata", {}).get("namespace", "unknown")
|
1240
|
+
self.log.info(
|
1241
|
+
f"PASSED: Gateway '{gateway_name}' found in namespace '{namespace}'"
|
1242
|
+
)
|
1243
|
+
return True
|
1244
|
+
|
1245
|
+
return False
|
1246
|
+
|
1247
|
+
except Exception: # noqa: BLE001
|
1248
|
+
# Broad exception handling for fallback case
|
1249
|
+
return False
|
1250
|
+
|
1251
|
+
def _find_nginx_pod(self, namespace: str, label_selector: str) -> Optional[str]:
|
1252
|
+
"""Find NGINX ingress pod by label selector in specific namespace."""
|
1253
|
+
try:
|
1254
|
+
pods = self.kubectl.list_resources(
|
1255
|
+
"pods", namespace=namespace, label_selector=label_selector
|
1256
|
+
)
|
1257
|
+
|
1258
|
+
if pods:
|
1259
|
+
return pods[0]["metadata"]["name"]
|
1260
|
+
return None
|
1261
|
+
|
1262
|
+
except Exception: # noqa: BLE001
|
1263
|
+
# Broad exception handling for fallback pod discovery
|
1264
|
+
return None
|
1265
|
+
|
1266
|
+
def _find_nginx_by_name_pattern(self) -> bool:
|
1267
|
+
"""Find NGINX ingress controller by name pattern across all namespaces."""
|
1268
|
+
try:
|
1269
|
+
pods = self.kubectl.list_resources("pods", all_namespaces=True)
|
1270
|
+
|
1271
|
+
# Look for pods with names containing NGINX and ingress keywords
|
1272
|
+
for pod in pods:
|
1273
|
+
metadata = pod.get("metadata", {})
|
1274
|
+
name = metadata.get("name", "").lower()
|
1275
|
+
namespace = metadata.get("namespace", "")
|
1276
|
+
status_phase = pod.get("status", {}).get("phase", "")
|
1277
|
+
|
1278
|
+
if "nginx" in name and "ingress" in name:
|
1279
|
+
if status_phase == RUNNING_STATUS:
|
1280
|
+
self.log.info(
|
1281
|
+
f"PASSED: Found NGINX ingress controller by name pattern: "
|
1282
|
+
f"{metadata['name']} (namespace: {namespace})"
|
1283
|
+
)
|
1284
|
+
return True
|
1285
|
+
else:
|
1286
|
+
self.log.warning(
|
1287
|
+
f"WARNING: Found NGINX ingress controller '{metadata['name']}' "
|
1288
|
+
f"but it's not running (status: {status_phase})"
|
1289
|
+
)
|
1290
|
+
|
1291
|
+
return False
|
1292
|
+
|
1293
|
+
except Exception: # noqa: BLE001
|
1294
|
+
# Broad exception handling for fallback case
|
1295
|
+
return False

    def _list_available_gateways(self) -> None:
        """List available gateways for troubleshooting."""
        try:
            for resource_type in GATEWAY_RESOURCE_TYPES:
                gateways = self.kubectl.list_resources(
                    resource_type, all_namespaces=True
                )

                if gateways:
                    self.log.error(f"Available {resource_type}:")
                    for gw in gateways:
                        name = gw.get("metadata", {}).get("name", "unknown")
                        self.log.error(f" - {name}")
                    return

            self.log.error(" (no gateways found in cluster)")

        except Exception:  # noqa: BLE001
            # Broad exception handling for troubleshooting helper
            self.log.error(" (failed to list gateways)")

    def _list_available_ingress_controllers(self) -> None:
        """List available ingress controllers for troubleshooting."""
        try:
            pods = self.kubectl.list_resources("pods", all_namespaces=True)

            ingress_controllers = []
            for pod in pods:
                metadata = pod.get("metadata", {})
                name = metadata.get("name", "").lower()
                namespace = metadata.get("namespace", "")

                # Look for common ingress controller name patterns
                if any(keyword in name for keyword in INGRESS_CONTROLLER_KEYWORDS):
                    ingress_controllers.append(
                        f"{metadata['name']} (namespace: {namespace})"
                    )

            if ingress_controllers:
                for controller in ingress_controllers:
                    self.log.warning(f" - {controller}")
            else:
                self.log.warning(" (no ingress controllers found)")

        except Exception:  # noqa: BLE001
            # Broad exception handling for troubleshooting helper
            self.log.warning(" (failed to list ingress controllers)")


# =============================================================================
# MAIN VERIFIER CLASS
# =============================================================================


class KubernetesCloudDeploymentVerifier:
    """Verifies Kubernetes-based cloud deployments with comprehensive checks"""

    def __init__(self, logger: BlockLogger, api_client):
        self.log = logger
        self.api_client = api_client
        self.k8s_config: Optional[KubernetesConfig] = None
        self.results = VerificationResults()

    def verify(self, cloud_deployment: CloudDeployment) -> bool:
        """
        Main verification workflow for Kubernetes cloud deployments.

        Performs comprehensive checks including operator health, identity verification,
        file storage, networking, and gateway configuration.

        Args:
            cloud_deployment: The cloud deployment configuration
        """
        deployment_name = cloud_deployment.name or cloud_deployment.cloud_deployment_id
        self.log.info(f"Starting Kubernetes verification for: {deployment_name}")

        if cloud_deployment.file_storage is not None and isinstance(
            cloud_deployment.file_storage, dict
        ):
            cloud_deployment.file_storage = FileStorage(**cloud_deployment.file_storage)

        try:
            return self._run_verification_steps(cloud_deployment)

        except click.ClickException:
            # Re-raise ClickExceptions as they contain user-friendly messages
            raise
        except requests.RequestException as e:
            self.log.error(f"Network error during verification: {e}")
            return False
        except (subprocess.CalledProcessError, OSError) as e:
            self.log.error(f"System error during verification: {e}")
            return False
        except (KeyError, ValueError, json.JSONDecodeError) as e:
            self.log.error(f"Data parsing error during verification: {e}")
            return False
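For orientation, a minimal sketch of how this class is meant to be driven, assuming a surrounding CLI command that already has a `BlockLogger` and an API client in scope (the variable names below are placeholders, not taken from this diff):

```python
# Hypothetical call site; `logger`, `api_client`, and `cloud_deployment` are
# assumed to be provided by the surrounding cloud controller code.
verifier = KubernetesCloudDeploymentVerifier(logger, api_client)
if not verifier.verify(cloud_deployment):
    raise click.ClickException("Kubernetes cloud deployment verification failed.")
```

Note that `verify()` returns a bool rather than raising on failed checks, so the caller decides how to surface the result.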

    def _passed_or_failed_str_from_bool(self, is_passing: bool) -> str:
        """Return PASSED or FAILED string for verification results, matching VM verification format."""
        return PASSED_STATUS if is_passing else FAILED_STATUS

    @contextmanager
    def _verification_step(self, step_name: str):
        """Context manager for verification steps that indents detailed output."""
        self.log.info(f"{step_name}...")
        with self.log.indent():
            yield

    def _run_verification_steps(self, cloud_deployment: CloudDeployment) -> bool:
        """Execute the verification steps in sequence."""
        # Step 1: Configure kubectl
        with self._verification_step("Configuring kubectl access"):
            self._get_kubectl_config()

        # k8s_config is guaranteed to be set by _get_kubectl_config()
        assert self.k8s_config is not None

        # Initialize utility classes
        kubectl_ops = KubectlOperations(self.k8s_config.context, self.log)
        operator_verifier = OperatorVerifier(kubectl_ops, self.k8s_config, self.log)
        storage_verifier = StorageVerifier(kubectl_ops, self.k8s_config, self.log)
        gateway_verifier = GatewayVerifier(kubectl_ops, self.k8s_config, self.log)

        # Step 2: Find and verify operator pod
        with self._verification_step("Finding operator pod"):
            try:
                operator_pod = operator_verifier.find_operator_pod()
                self.results.operator_pod_installed = True
            except OperatorPodNotFoundError as e:
                self.log.error(
                    "Failed to find operator pod, please make sure the operator is running"
                )
                self.log.error(f"Error: {e}")
                return False

        # Step 3: Port forward and fetch operator data (health + config)
        with self._verification_step("Verifying operator status"):
            try:
                operator_data = operator_verifier.get_operator_data(operator_pod)
            except (OperatorConnectionError, PortForwardError) as e:
                self.log.error(
                    "Failed to connect to operator, please make sure the operator is running version >= 0.7.0 and has status reporting enabled"
                )
                self.log.error(f"Error: {e}")
                return False

            self.log.info("Verifying operator health...")
            self.results.operator_health = operator_verifier.verify_operator_health(
                operator_data
            )
            self.log.info(
                f"Operator Health: {self._passed_or_failed_str_from_bool(self.results.operator_health)}"
            )

            self.log.info("Verifying operator identity...")
            if cloud_deployment.kubernetes_config is None:
                self.log.error(
                    "Kubernetes configuration is missing from cloud deployment"
                )
                self.results.operator_identity = False
            else:
                self.results.operator_identity = operator_verifier.verify_operator_identity(
                    operator_data,
                    cloud_deployment.kubernetes_config,
                    cloud_deployment.provider,
                )
            self.log.info(
                f"Operator Identity: {self._passed_or_failed_str_from_bool(self.results.operator_identity)}"
            )

        # Step 4: Check file storage
        with self._verification_step("Checking file storage"):
            if cloud_deployment.file_storage is None:
                self.log.info(
                    "INFO: No file storage configured - skipping file storage verification"
                )
                self.results.file_storage = True
            else:
                self.results.file_storage = storage_verifier.verify_file_storage(
                    cloud_deployment.file_storage, cloud_deployment
                )
            self.log.info(
                f"File Storage: {self._passed_or_failed_str_from_bool(self.results.file_storage)}"
            )

        # Step 5: Verify gateway support
        with self._verification_step("Verifying gateway support"):
            self.results.gateway_support = gateway_verifier.verify_gateway_support(
                operator_data
            )
            self.log.info(
                f"Gateway Support: {self._passed_or_failed_str_from_bool(self.results.gateway_support)}"
            )

        # Step 6: Check NGINX ingress (warning only)
        with self._verification_step("Checking NGINX ingress controller"):
            self.results.nginx_ingress = gateway_verifier.verify_nginx_ingress()
            self.log.info(
                f"NGINX Ingress: {self._passed_or_failed_str_from_bool(self.results.nginx_ingress)}"
            )

        self._show_verification_summary()

        if self.results.overall_success:
            self.log.info(
                "Kubernetes cloud deployment verification completed successfully"
            )
        else:
            self.log.error("Kubernetes cloud deployment verification failed")

        return self.results.overall_success

    def _show_verification_summary(self):
        """Show verification results summary in the same format as VM verification."""
        verification_result_summary = ["Verification result:"]

        for component, result in self.results.to_dict().items():
            verification_result_summary.append(
                f"{component}: {self._passed_or_failed_str_from_bool(result)}"
            )

        self.log.info("\n".join(verification_result_summary))
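To make the summary format concrete, here is a self-contained sketch of the same formatting logic. The component names are assumed to mirror the result fields set in `_run_verification_steps`; the real labels come from `VerificationResults.to_dict()`, which is defined elsewhere in this file:

```python
# Standalone sketch of the summary formatting (field names are illustrative).
results = {
    "operator_pod_installed": True,
    "operator_health": True,
    "operator_identity": True,
    "file_storage": True,
    "gateway_support": False,
    "nginx_ingress": True,
}
summary = ["Verification result:"] + [
    f"{component}: {'PASSED' if passed else 'FAILED'}"
    for component, passed in results.items()
]
print("\n".join(summary))
```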

    def _get_kubectl_config(self):
        """Get kubectl context and operator namespace from user"""
        # Check if kubectl is available
        temp_kubectl = KubectlOperations("", self.log)
        if not temp_kubectl.check_kubectl_available():
            raise click.ClickException(
                "kubectl command not found. Please install kubectl and ensure it's in your PATH."
            )

        # Get available contexts
        contexts = temp_kubectl.get_available_contexts()
        if not contexts:
            raise click.ClickException(
                "No kubectl contexts found. Please configure kubectl to access your Kubernetes cluster."
            )

        # Prompt for context selection
        if len(contexts) > 1:
            self.log.info("Available kubectl contexts:")
            for i, ctx in enumerate(contexts):
                current_marker = (
                    " (current)" if ctx == temp_kubectl.get_current_context() else ""
                )
                self.log.info(f" {i+1}. {ctx}{current_marker}")

            choice = click.prompt(
                "Select context number",
                type=click.IntRange(1, len(contexts)),
                default=1,
            )
            kubectl_context = contexts[choice - 1]
        else:
            kubectl_context = contexts[0]
            self.log.info(f"Using kubectl context: {kubectl_context}")

        # Prompt for operator namespace
        operator_namespace = click.prompt(
            "Enter the Anyscale operator namespace",
            default=DEFAULT_OPERATOR_NAMESPACE,
            type=str,
            show_default=True,
        )

        self.k8s_config = KubernetesConfig(
            context=kubectl_context, operator_namespace=operator_namespace
        )

        self.log.info(
            f"Configured: context='{self.k8s_config.context}', "
            f"namespace='{self.k8s_config.operator_namespace}'"
        )
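For illustration, an interactive run of `_get_kubectl_config` against a kubeconfig with two contexts might look roughly like the following; the context names are made up, and the bracketed namespace default is the value of `DEFAULT_OPERATOR_NAMESPACE`, which is defined elsewhere in this file:

```
Available kubectl contexts:
 1. eks-prod-cluster (current)
 2. gke-dev-cluster
Select context number [1]: 1
Enter the Anyscale operator namespace [<DEFAULT_OPERATOR_NAMESPACE>]:
Configured: context='eks-prod-cluster', namespace='<DEFAULT_OPERATOR_NAMESPACE>'
```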