superset-showtime 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of superset-showtime might be problematic. Click here for more details.
- showtime/__init__.py +21 -0
- showtime/__main__.py +8 -0
- showtime/cli.py +1361 -0
- showtime/commands/__init__.py +1 -0
- showtime/commands/start.py +40 -0
- showtime/core/__init__.py +1 -0
- showtime/core/aws.py +758 -0
- showtime/core/circus.py +285 -0
- showtime/core/config.py +152 -0
- showtime/core/emojis.py +86 -0
- showtime/core/github.py +214 -0
- showtime/data/ecs-task-definition.json +59 -0
- superset_showtime-0.1.0.dist-info/METADATA +391 -0
- superset_showtime-0.1.0.dist-info/RECORD +16 -0
- superset_showtime-0.1.0.dist-info/WHEEL +4 -0
- superset_showtime-0.1.0.dist-info/entry_points.txt +3 -0
showtime/core/aws.py
ADDED
|
@@ -0,0 +1,758 @@
|
|
|
1
|
+
"""
|
|
2
|
+
🎪 AWS interface for circus tent environment management
|
|
3
|
+
|
|
4
|
+
Replicates the AWS logic from current GitHub Actions workflows.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import time
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
import boto3
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class AWSError(Exception):
    """Exception describing a failed AWS operation.

    The dataclass decorator generates the constructor, so instances are built
    as ``AWSError(message=..., operation=..., resource=...)``.
    """

    # Human-readable description of the failure.
    message: str
    # Name of the operation that failed (e.g. "delete_environment").
    operation: str
    # Identifier of the affected resource, when one is known.
    resource: Optional[str] = None

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # without this ``args`` stays empty and ``str(error)`` is "".
        super().__init__(self.message)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class EnvironmentResult:
    """Result of an AWS environment operation.

    Only ``success`` is required; every other field defaults to ``None``.
    """

    # Whether the operation succeeded.
    success: bool
    # IP address of the environment, when one was obtained.
    ip: Optional[str] = None
    # Name of the ECS service the result refers to.
    service_name: Optional[str] = None
    # Error description for failed operations.
    error: Optional[str] = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class AWSInterface:
|
|
37
|
+
"""AWS ECS/ECR client replicating current GHA logic"""
|
|
38
|
+
|
|
39
|
+
def __init__(self, region: str = None, cluster: str = None, repository: str = None):
    """Resolve configuration and create the boto3 clients.

    Each argument, when falsy, falls back to an environment variable and
    then to a built-in default:

    * ``region``     -> ``AWS_REGION``     -> ``"us-west-2"``
    * ``cluster``    -> ``ECS_CLUSTER``    -> ``"superset-ci"``
    * ``repository`` -> ``ECR_REPOSITORY`` -> ``"superset-ci"``
    """
    self.region = region if region else os.getenv("AWS_REGION", "us-west-2")
    self.cluster = cluster if cluster else os.getenv("ECS_CLUSTER", "superset-ci")
    self.repository = repository if repository else os.getenv("ECR_REPOSITORY", "superset-ci")

    # AWS clients, all bound to the resolved region
    client_options = {"region_name": self.region}
    self.ecs_client = boto3.client("ecs", **client_options)
    self.ecr_client = boto3.client("ecr", **client_options)
    self.ec2_client = boto3.client("ec2", **client_options)

    # Network configuration (from current GHA)
    self.subnets = [
        "subnet-0e15a5034b4121710",
        "subnet-0e8efef4a72224974",
    ]
    self.security_group = "sg-092ff3a6ae0574d91"
|
|
52
|
+
|
|
53
|
+
def create_environment(
    self,
    pr_number: int,
    sha: str,
    github_user: str = "unknown",
    feature_flags: Optional[List[Dict[str, str]]] = None,
) -> EnvironmentResult:
    """
    Create ephemeral environment with blue-green deployment support

    Steps performed:
    1. Check that the ECR image for this PR/SHA exists
    2. Register an ECS task definition with the image and feature flags
    3. Look up existing services for the PR (blue) and report them
    4. Create the new service named after the SHA (green)
    5. Deploy the task definition to the green service
    6. Wait for the service to become stable
    7. Health check the service over HTTP
    8. Look up the public IP and return it

    Args:
        pr_number: Pull request number the environment belongs to.
        sha: Commit SHA; only the first 7 characters are used for naming.
        github_user: Recorded as the requester and as a service tag.
        feature_flags: Extra entries appended to the container environment;
            ``None`` is treated as an empty list.

    Returns:
        EnvironmentResult with ``success=True``, ``ip`` and ``service_name``
        on success; otherwise ``success=False`` with ``error`` set. Any
        exception raised along the way is converted into a failed result
        rather than propagated.
    """
    from datetime import datetime, timezone

    from .circus import Show

    # Create Show object for consistent AWS naming
    show = Show(
        pr_number=pr_number,
        sha=sha[:7],  # Truncate to 7 chars like GitHub
        status="building",
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted string is the same.
        created_at=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M"),
        requested_by=github_user,
    )

    service_name = f"{show.aws_service_name}-service"  # pr-{pr_number}-{sha}-service
    image_tag = show.aws_image_tag  # pr-{pr_number}-{sha}-ci

    try:
        # Step 1: Check if ECR image exists (replicate GHA check-image step)
        if not self._check_ecr_image_exists(image_tag):
            return EnvironmentResult(
                success=False,
                error=f"Container image {image_tag} not found in ECR. Build the image first.",
            )

        # Step 2: Create/update ECS task definition with feature flags
        task_def_arn = self._create_task_definition_with_image_and_flags(
            image_tag, feature_flags or []
        )
        if not task_def_arn:
            return EnvironmentResult(success=False, error="Failed to create task definition")

        # Step 3: Blue-Green Logic - Check for existing services
        print(f"🔍 Checking for existing services for PR #{pr_number}")
        existing_services = self._find_pr_services(pr_number)

        if existing_services:
            print(
                f"📊 Found {len(existing_services)} existing services - starting blue-green deployment"
            )
            for svc in existing_services:
                print(f" 🔵 Blue: {svc['service_name']} ({svc['status']})")

        # Step 4: Create new green service
        print(f"🟢 Creating green service: {service_name}")
        if not self._create_ecs_service(service_name, pr_number, github_user):
            return EnvironmentResult(success=False, error="Green service creation failed")

        # Step 5: Deploy task definition to green service
        if not self._deploy_task_definition(service_name, task_def_arn):
            return EnvironmentResult(
                success=False, error="Green task definition deployment failed"
            )

        # Step 6: Wait for service stability (replicate GHA wait-for-service-stability)
        print(f"⏳ Waiting for service {service_name} to become stable...")
        if not self._wait_for_service_stability(service_name):
            return EnvironmentResult(success=False, error="Service failed to become stable")

        # Step 7: Health check the new service
        print(f"🏥 Health checking service {service_name}...")
        if not self._health_check_service(service_name):
            return EnvironmentResult(success=False, error="Service failed health checks")

        # Step 8: Get IP after health checks pass
        ip = self.get_environment_ip(service_name)
        if not ip:
            return EnvironmentResult(success=False, error="Failed to get environment IP")

        return EnvironmentResult(success=True, ip=ip, service_name=service_name)

    except Exception as e:
        return EnvironmentResult(success=False, error=str(e))
|
|
145
|
+
|
|
146
|
+
def delete_environment(self, service_name: str, pr_number: int) -> bool:
|
|
147
|
+
"""
|
|
148
|
+
Delete ephemeral environment - replicates cleanup GHA logic
|
|
149
|
+
|
|
150
|
+
Steps:
|
|
151
|
+
1. Check if ECS service exists and is active
|
|
152
|
+
2. Delete ECS service with --force
|
|
153
|
+
3. Delete ECR image tag
|
|
154
|
+
"""
|
|
155
|
+
try:
|
|
156
|
+
# Step 1: Check if service exists and is active
|
|
157
|
+
if not self._service_exists(service_name):
|
|
158
|
+
return True # Already deleted
|
|
159
|
+
|
|
160
|
+
# Step 2: Delete ECS service (force delete)
|
|
161
|
+
self.ecs_client.delete_service(cluster=self.cluster, service=service_name, force=True)
|
|
162
|
+
|
|
163
|
+
# Step 3: Delete ECR image tag
|
|
164
|
+
# Extract SHA from service name: pr-1234-abc123f → abc123f
|
|
165
|
+
sha = service_name.split("-")[-1]
|
|
166
|
+
image_tag = f"pr-{pr_number}-{sha}"
|
|
167
|
+
|
|
168
|
+
try:
|
|
169
|
+
self.ecr_client.batch_delete_image(
|
|
170
|
+
repositoryName=self.repository, imageIds=[{"imageTag": image_tag}]
|
|
171
|
+
)
|
|
172
|
+
except self.ecr_client.exceptions.ImageNotFoundException:
|
|
173
|
+
pass # Image already deleted
|
|
174
|
+
|
|
175
|
+
return True
|
|
176
|
+
|
|
177
|
+
except Exception as e:
|
|
178
|
+
raise AWSError(message=str(e), operation="delete_environment", resource=service_name)
|
|
179
|
+
|
|
180
|
+
def get_environment_ip(self, service_name: str) -> Optional[str]:
|
|
181
|
+
"""
|
|
182
|
+
Get public IP for environment - replicates GHA IP discovery logic
|
|
183
|
+
|
|
184
|
+
Steps:
|
|
185
|
+
1. List tasks for service
|
|
186
|
+
2. Describe task to get network interface
|
|
187
|
+
3. Get public IP from network interface
|
|
188
|
+
"""
|
|
189
|
+
try:
|
|
190
|
+
# Step 1: List tasks
|
|
191
|
+
tasks_response = self.ecs_client.list_tasks(
|
|
192
|
+
cluster=self.cluster, serviceName=service_name
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
if not tasks_response["taskArns"]:
|
|
196
|
+
return None
|
|
197
|
+
|
|
198
|
+
task_arn = tasks_response["taskArns"][0]
|
|
199
|
+
|
|
200
|
+
# Step 2: Describe task to get network interface
|
|
201
|
+
task_response = self.ecs_client.describe_tasks(cluster=self.cluster, tasks=[task_arn])
|
|
202
|
+
|
|
203
|
+
if not task_response["tasks"]:
|
|
204
|
+
return None
|
|
205
|
+
|
|
206
|
+
task = task_response["tasks"][0]
|
|
207
|
+
|
|
208
|
+
# Find network interface ID
|
|
209
|
+
eni_id = None
|
|
210
|
+
for attachment in task.get("attachments", []):
|
|
211
|
+
for detail in attachment.get("details", []):
|
|
212
|
+
if detail["name"] == "networkInterfaceId":
|
|
213
|
+
eni_id = detail["value"]
|
|
214
|
+
break
|
|
215
|
+
if eni_id:
|
|
216
|
+
break
|
|
217
|
+
|
|
218
|
+
if not eni_id:
|
|
219
|
+
return None
|
|
220
|
+
|
|
221
|
+
# Step 3: Get public IP from network interface
|
|
222
|
+
eni_response = self.ec2_client.describe_network_interfaces(NetworkInterfaceIds=[eni_id])
|
|
223
|
+
|
|
224
|
+
if not eni_response["NetworkInterfaces"]:
|
|
225
|
+
return None
|
|
226
|
+
|
|
227
|
+
eni = eni_response["NetworkInterfaces"][0]
|
|
228
|
+
return eni.get("Association", {}).get("PublicIp")
|
|
229
|
+
|
|
230
|
+
except Exception:
|
|
231
|
+
return None
|
|
232
|
+
|
|
233
|
+
def get_environment_status(self, service_name: str) -> str:
|
|
234
|
+
"""Get environment status from AWS"""
|
|
235
|
+
try:
|
|
236
|
+
response = self.ecs_client.describe_services(
|
|
237
|
+
cluster=self.cluster, services=[service_name]
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
if not response["services"]:
|
|
241
|
+
return "not_found"
|
|
242
|
+
|
|
243
|
+
service = response["services"][0]
|
|
244
|
+
status = service["status"]
|
|
245
|
+
|
|
246
|
+
if status == "ACTIVE":
|
|
247
|
+
# Check if tasks are running
|
|
248
|
+
running_count = service["runningCount"]
|
|
249
|
+
desired_count = service["desiredCount"]
|
|
250
|
+
|
|
251
|
+
if running_count == desired_count and running_count > 0:
|
|
252
|
+
return "running"
|
|
253
|
+
else:
|
|
254
|
+
return "building"
|
|
255
|
+
else:
|
|
256
|
+
return "failed"
|
|
257
|
+
|
|
258
|
+
except Exception:
|
|
259
|
+
return "unknown"
|
|
260
|
+
|
|
261
|
+
def _check_ecr_image_exists(self, image_tag: str) -> bool:
|
|
262
|
+
"""Check if ECR image exists (replicate GHA check-image step)"""
|
|
263
|
+
try:
|
|
264
|
+
# Get registry ID from ECR login
|
|
265
|
+
ecr_response = self.ecr_client.get_authorization_token()
|
|
266
|
+
registry_id = ecr_response["authorizationData"][0]["proxyEndpoint"]
|
|
267
|
+
registry_id = registry_id.split(".")[0].replace("https://", "")
|
|
268
|
+
|
|
269
|
+
# Replicate exact GHA describe-images command
|
|
270
|
+
self.ecr_client.describe_images(
|
|
271
|
+
registryId=registry_id,
|
|
272
|
+
repositoryName=self.repository,
|
|
273
|
+
imageIds=[{"imageTag": image_tag}],
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
print(f"✅ Found ECR image: {image_tag}")
|
|
277
|
+
return True
|
|
278
|
+
|
|
279
|
+
except self.ecr_client.exceptions.ImageNotFoundException:
|
|
280
|
+
print(f"❌ ECR image not found: {image_tag}")
|
|
281
|
+
return False
|
|
282
|
+
except Exception as e:
|
|
283
|
+
print(f"❌ ECR image check failed: {e}")
|
|
284
|
+
return False
|
|
285
|
+
|
|
286
|
+
def _create_task_definition_with_image_and_flags(
|
|
287
|
+
self, image_tag: str, feature_flags: List[Dict[str, str]]
|
|
288
|
+
) -> Optional[str]:
|
|
289
|
+
"""Create ECS task definition with image and feature flags (replicate GHA task-def + env vars)"""
|
|
290
|
+
try:
|
|
291
|
+
# Load base task definition template
|
|
292
|
+
task_def_path = Path(__file__).parent.parent / "data" / "ecs-task-definition.json"
|
|
293
|
+
with open(task_def_path) as f:
|
|
294
|
+
task_def = json.load(f)
|
|
295
|
+
|
|
296
|
+
# Get ECR registry for full image URL
|
|
297
|
+
ecr_response = self.ecr_client.get_authorization_token()
|
|
298
|
+
registry_url = ecr_response["authorizationData"][0]["proxyEndpoint"]
|
|
299
|
+
registry_url = registry_url.replace("https://", "")
|
|
300
|
+
full_image_url = f"{registry_url}/{self.repository}:{image_tag}"
|
|
301
|
+
|
|
302
|
+
# Update image in container definition (replicate GHA render-task-definition)
|
|
303
|
+
task_def["containerDefinitions"][0]["image"] = full_image_url
|
|
304
|
+
|
|
305
|
+
# Add feature flags to environment (replicate GHA jq environment update)
|
|
306
|
+
container_env = task_def["containerDefinitions"][0]["environment"]
|
|
307
|
+
for flag in feature_flags:
|
|
308
|
+
container_env.append(flag)
|
|
309
|
+
|
|
310
|
+
# Register task definition
|
|
311
|
+
response = self.ecs_client.register_task_definition(**task_def)
|
|
312
|
+
task_def_arn = response["taskDefinition"]["taskDefinitionArn"]
|
|
313
|
+
|
|
314
|
+
print(f"✅ Created task definition: {task_def_arn}")
|
|
315
|
+
return task_def_arn
|
|
316
|
+
|
|
317
|
+
except Exception as e:
|
|
318
|
+
print(f"❌ Task definition creation failed: {e}")
|
|
319
|
+
return None
|
|
320
|
+
|
|
321
|
+
def _deploy_task_definition(self, service_name: str, task_def_arn: str) -> bool:
|
|
322
|
+
"""Deploy task definition to service (replicate GHA deploy-task step)"""
|
|
323
|
+
try:
|
|
324
|
+
# Replicate exact GHA deploy-task-definition parameters
|
|
325
|
+
self.ecs_client.update_service(
|
|
326
|
+
cluster=self.cluster, service=service_name, taskDefinition=task_def_arn
|
|
327
|
+
)
|
|
328
|
+
|
|
329
|
+
print(f"✅ Updated service {service_name} with task definition")
|
|
330
|
+
return True
|
|
331
|
+
|
|
332
|
+
except Exception as e:
|
|
333
|
+
print(f"❌ Task definition deployment failed: {e}")
|
|
334
|
+
return False
|
|
335
|
+
|
|
336
|
+
def _service_exists(self, service_name: str) -> bool:
|
|
337
|
+
"""Check if ECS service exists and is active"""
|
|
338
|
+
try:
|
|
339
|
+
response = self.ecs_client.describe_services(
|
|
340
|
+
cluster=self.cluster, services=[service_name]
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
for service in response["services"]:
|
|
344
|
+
if service["status"] == "ACTIVE":
|
|
345
|
+
return True
|
|
346
|
+
|
|
347
|
+
return False
|
|
348
|
+
|
|
349
|
+
except Exception:
|
|
350
|
+
return False
|
|
351
|
+
|
|
352
|
+
def _create_ecs_service(self, service_name: str, pr_number: int, github_user: str) -> bool:
|
|
353
|
+
"""Create ECS service (replicate exact GHA create-service step)"""
|
|
354
|
+
try:
|
|
355
|
+
# Replicate exact GHA create-service command parameters
|
|
356
|
+
response = self.ecs_client.create_service(
|
|
357
|
+
cluster=self.cluster,
|
|
358
|
+
serviceName=service_name, # pr-{pr_number}-service
|
|
359
|
+
taskDefinition=self.cluster, # Uses cluster name as task def family
|
|
360
|
+
launchType="FARGATE",
|
|
361
|
+
desiredCount=1,
|
|
362
|
+
platformVersion="LATEST",
|
|
363
|
+
networkConfiguration={
|
|
364
|
+
"awsvpcConfiguration": {
|
|
365
|
+
"subnets": self.subnets, # Same subnets as GHA
|
|
366
|
+
"securityGroups": [self.security_group], # Same SG as GHA
|
|
367
|
+
"assignPublicIp": "ENABLED",
|
|
368
|
+
}
|
|
369
|
+
},
|
|
370
|
+
tags=[
|
|
371
|
+
{"key": "pr", "value": str(pr_number)},
|
|
372
|
+
{"key": "github_user", "value": github_user},
|
|
373
|
+
{"key": "showtime_created", "value": str(int(time.time()))},
|
|
374
|
+
{
|
|
375
|
+
"key": "showtime_expires",
|
|
376
|
+
"value": str(int(time.time()) + 48 * 3600),
|
|
377
|
+
}, # 48 hours
|
|
378
|
+
{"key": "showtime_managed", "value": "true"},
|
|
379
|
+
],
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
print(f"✅ Created ECS service: {service_name}")
|
|
383
|
+
return True
|
|
384
|
+
|
|
385
|
+
except Exception as e:
|
|
386
|
+
print(f"❌ ECS service creation failed: {e}")
|
|
387
|
+
return False
|
|
388
|
+
|
|
389
|
+
def _wait_for_deployment_and_get_ip(
|
|
390
|
+
self, service_name: str, timeout_minutes: int = 10
|
|
391
|
+
) -> Optional[str]:
|
|
392
|
+
"""Wait for ECS deployment to complete and get IP"""
|
|
393
|
+
try:
|
|
394
|
+
# Wait for service stability (replicate GHA wait-for-service-stability)
|
|
395
|
+
waiter = self.ecs_client.get_waiter("services_stable")
|
|
396
|
+
waiter.wait(
|
|
397
|
+
cluster=self.cluster,
|
|
398
|
+
services=[service_name],
|
|
399
|
+
WaiterConfig={"maxAttempts": timeout_minutes * 2}, # 30s intervals
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
# Get IP after deployment is stable
|
|
403
|
+
return self.get_environment_ip(service_name)
|
|
404
|
+
|
|
405
|
+
except Exception:
|
|
406
|
+
return None
|
|
407
|
+
|
|
408
|
+
def list_circus_environments(self) -> List[Dict[str, Any]]:
|
|
409
|
+
"""List all environments with circus tags"""
|
|
410
|
+
try:
|
|
411
|
+
# List all services in cluster
|
|
412
|
+
services_response = self.ecs_client.list_services(cluster=self.cluster)
|
|
413
|
+
|
|
414
|
+
circus_services = []
|
|
415
|
+
for service_arn in services_response["serviceArns"]:
|
|
416
|
+
service_name = service_arn.split("/")[-1]
|
|
417
|
+
|
|
418
|
+
# Check if it's a circus service (pr-{number}-{sha} pattern)
|
|
419
|
+
if service_name.startswith("pr-") and len(service_name.split("-")) >= 3:
|
|
420
|
+
# Get service details and tags
|
|
421
|
+
service_response = self.ecs_client.describe_services(
|
|
422
|
+
cluster=self.cluster, services=[service_name]
|
|
423
|
+
)
|
|
424
|
+
|
|
425
|
+
if service_response["services"]:
|
|
426
|
+
service = service_response["services"][0]
|
|
427
|
+
circus_services.append(
|
|
428
|
+
{
|
|
429
|
+
"service_name": service_name,
|
|
430
|
+
"status": service["status"],
|
|
431
|
+
"running_count": service["runningCount"],
|
|
432
|
+
"desired_count": service["desiredCount"],
|
|
433
|
+
"created_at": service["createdAt"],
|
|
434
|
+
"ip": self.get_environment_ip(service_name),
|
|
435
|
+
}
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
return circus_services
|
|
439
|
+
|
|
440
|
+
except Exception:
|
|
441
|
+
return []
|
|
442
|
+
|
|
443
|
+
def cleanup_orphaned_environments(self, max_age_hours: int = 48) -> List[str]:
|
|
444
|
+
"""Clean up environments older than max_age_hours"""
|
|
445
|
+
import time
|
|
446
|
+
|
|
447
|
+
try:
|
|
448
|
+
orphaned = []
|
|
449
|
+
circus_services = self.list_circus_environments()
|
|
450
|
+
|
|
451
|
+
current_time = time.time()
|
|
452
|
+
max_age_seconds = max_age_hours * 3600
|
|
453
|
+
|
|
454
|
+
for service in circus_services:
|
|
455
|
+
# Calculate age
|
|
456
|
+
created_timestamp = service["created_at"].timestamp()
|
|
457
|
+
age_seconds = current_time - created_timestamp
|
|
458
|
+
|
|
459
|
+
if age_seconds > max_age_seconds:
|
|
460
|
+
service_name = service["service_name"]
|
|
461
|
+
|
|
462
|
+
# Extract PR number for cleanup
|
|
463
|
+
pr_number = int(service_name.split("-")[1])
|
|
464
|
+
|
|
465
|
+
# Delete the service
|
|
466
|
+
if self.delete_environment(service_name, pr_number):
|
|
467
|
+
orphaned.append(service_name)
|
|
468
|
+
|
|
469
|
+
return orphaned
|
|
470
|
+
|
|
471
|
+
except Exception as e:
|
|
472
|
+
raise AWSError(message=str(e), operation="cleanup_orphaned_environments")
|
|
473
|
+
|
|
474
|
+
def update_feature_flags(self, service_name: str, feature_flags: Dict[str, bool]) -> bool:
|
|
475
|
+
"""Update feature flags in running environment"""
|
|
476
|
+
try:
|
|
477
|
+
# Get current task definition
|
|
478
|
+
service_response = self.ecs_client.describe_services(
|
|
479
|
+
cluster=self.cluster, services=[service_name]
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
if not service_response["services"]:
|
|
483
|
+
return False
|
|
484
|
+
|
|
485
|
+
task_def_arn = service_response["services"][0]["taskDefinition"]
|
|
486
|
+
|
|
487
|
+
# Get task definition details
|
|
488
|
+
task_def_response = self.ecs_client.describe_task_definition(
|
|
489
|
+
taskDefinition=task_def_arn
|
|
490
|
+
)
|
|
491
|
+
|
|
492
|
+
task_def = task_def_response["taskDefinition"]
|
|
493
|
+
|
|
494
|
+
# Update environment variables
|
|
495
|
+
container_def = task_def["containerDefinitions"][0]
|
|
496
|
+
env_vars = container_def.get("environment", [])
|
|
497
|
+
|
|
498
|
+
# Update feature flags
|
|
499
|
+
for flag_name, enabled in feature_flags.items():
|
|
500
|
+
# Remove existing flag
|
|
501
|
+
env_vars = [e for e in env_vars if e["name"] != flag_name]
|
|
502
|
+
# Add updated flag
|
|
503
|
+
env_vars.append({"name": flag_name, "value": "True" if enabled else "False"})
|
|
504
|
+
|
|
505
|
+
container_def["environment"] = env_vars
|
|
506
|
+
|
|
507
|
+
# Register new task definition
|
|
508
|
+
new_task_def = self.ecs_client.register_task_definition(
|
|
509
|
+
family=task_def["family"],
|
|
510
|
+
containerDefinitions=task_def["containerDefinitions"],
|
|
511
|
+
requiresCompatibilities=task_def["requiresCompatibilities"],
|
|
512
|
+
networkMode=task_def["networkMode"],
|
|
513
|
+
cpu=task_def["cpu"],
|
|
514
|
+
memory=task_def["memory"],
|
|
515
|
+
executionRoleArn=task_def["executionRoleArn"],
|
|
516
|
+
taskRoleArn=task_def.get("taskRoleArn"),
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
# Update service to use new task definition
|
|
520
|
+
self.ecs_client.update_service(
|
|
521
|
+
cluster=self.cluster,
|
|
522
|
+
service=service_name,
|
|
523
|
+
taskDefinition=new_task_def["taskDefinition"]["taskDefinitionArn"],
|
|
524
|
+
)
|
|
525
|
+
|
|
526
|
+
return True
|
|
527
|
+
|
|
528
|
+
except Exception as e:
|
|
529
|
+
print(f"Feature flag update failed: {e}")
|
|
530
|
+
return False
|
|
531
|
+
|
|
532
|
+
def _delete_ecs_service(self, service_name: str) -> bool:
|
|
533
|
+
"""Delete ECS service (replicate GHA delete-service step)"""
|
|
534
|
+
try:
|
|
535
|
+
# Replicate exact GHA delete-service command with --force
|
|
536
|
+
self.ecs_client.delete_service(cluster=self.cluster, service=service_name, force=True)
|
|
537
|
+
|
|
538
|
+
print(f"✅ Deleted ECS service: {service_name}")
|
|
539
|
+
return True
|
|
540
|
+
|
|
541
|
+
except Exception as e:
|
|
542
|
+
print(f"❌ ECS service deletion failed: {e}")
|
|
543
|
+
return False
|
|
544
|
+
|
|
545
|
+
def _delete_ecr_image(self, image_tag: str) -> bool:
|
|
546
|
+
"""Delete ECR image tag (replicate GHA batch-delete-image step)"""
|
|
547
|
+
try:
|
|
548
|
+
# Get registry ID for ECR operations
|
|
549
|
+
ecr_response = self.ecr_client.get_authorization_token()
|
|
550
|
+
registry_id = ecr_response["authorizationData"][0]["proxyEndpoint"]
|
|
551
|
+
registry_id = registry_id.split(".")[0].replace("https://", "")
|
|
552
|
+
|
|
553
|
+
# Replicate exact GHA batch-delete-image command
|
|
554
|
+
self.ecr_client.batch_delete_image(
|
|
555
|
+
registryId=registry_id,
|
|
556
|
+
repositoryName=self.repository,
|
|
557
|
+
imageIds=[{"imageTag": image_tag}],
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
print(f"✅ Deleted ECR image: {image_tag}")
|
|
561
|
+
return True
|
|
562
|
+
|
|
563
|
+
except self.ecr_client.exceptions.ImageNotFoundException:
|
|
564
|
+
print(f"⚠️ ECR image not found: {image_tag} (already deleted)")
|
|
565
|
+
return True # Consider this success since it's already gone
|
|
566
|
+
except Exception as e:
|
|
567
|
+
print(f"❌ ECR image deletion failed: {e}")
|
|
568
|
+
return False
|
|
569
|
+
|
|
570
|
+
def find_expired_services(self, older_than: str) -> List[Dict[str, Any]]:
|
|
571
|
+
"""Find ECS services managed by showtime that are expired"""
|
|
572
|
+
import re
|
|
573
|
+
import time
|
|
574
|
+
|
|
575
|
+
try:
|
|
576
|
+
# Parse older_than (e.g., "48h", "7d")
|
|
577
|
+
time_match = re.match(r"(\d+)([hd])", older_than)
|
|
578
|
+
if not time_match:
|
|
579
|
+
return []
|
|
580
|
+
|
|
581
|
+
hours = int(time_match.group(1))
|
|
582
|
+
if time_match.group(2) == "d":
|
|
583
|
+
hours *= 24
|
|
584
|
+
|
|
585
|
+
cutoff_timestamp = time.time() - (hours * 3600)
|
|
586
|
+
expired_services = []
|
|
587
|
+
|
|
588
|
+
# List all services in cluster
|
|
589
|
+
response = self.ecs_client.list_services(cluster=self.cluster)
|
|
590
|
+
|
|
591
|
+
for service_arn in response.get("serviceArns", []):
|
|
592
|
+
service_name = service_arn.split("/")[-1]
|
|
593
|
+
|
|
594
|
+
# Only check services that match showtime pattern: pr-{number}-service
|
|
595
|
+
if not service_name.startswith("pr-") or not service_name.endswith("-service"):
|
|
596
|
+
continue
|
|
597
|
+
|
|
598
|
+
try:
|
|
599
|
+
# Get service tags to check expiration
|
|
600
|
+
tags_response = self.ecs_client.list_tags_for_resource(resourceArn=service_arn)
|
|
601
|
+
tags = {tag["key"]: tag["value"] for tag in tags_response.get("tags", [])}
|
|
602
|
+
|
|
603
|
+
# Only process services managed by showtime
|
|
604
|
+
if tags.get("showtime_managed") != "true":
|
|
605
|
+
continue
|
|
606
|
+
|
|
607
|
+
# Check if expired
|
|
608
|
+
expires_timestamp = tags.get("showtime_expires")
|
|
609
|
+
created_timestamp = tags.get("showtime_created")
|
|
610
|
+
|
|
611
|
+
if expires_timestamp and float(expires_timestamp) < time.time():
|
|
612
|
+
# Extract PR number from service name: pr-1234-service -> 1234
|
|
613
|
+
pr_match = re.match(r"pr-(\d+)-service", service_name)
|
|
614
|
+
pr_number = int(pr_match.group(1)) if pr_match else None
|
|
615
|
+
|
|
616
|
+
age_hours = (
|
|
617
|
+
(time.time() - float(created_timestamp)) / 3600
|
|
618
|
+
if created_timestamp
|
|
619
|
+
else 0
|
|
620
|
+
)
|
|
621
|
+
|
|
622
|
+
expired_services.append(
|
|
623
|
+
{
|
|
624
|
+
"service_name": service_name,
|
|
625
|
+
"service_arn": service_arn,
|
|
626
|
+
"pr_number": pr_number,
|
|
627
|
+
"age_hours": age_hours,
|
|
628
|
+
"expires_timestamp": expires_timestamp,
|
|
629
|
+
"tags": tags,
|
|
630
|
+
}
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
except Exception as e:
|
|
634
|
+
print(f"⚠️ Could not check service {service_name}: {e}")
|
|
635
|
+
continue
|
|
636
|
+
|
|
637
|
+
return expired_services
|
|
638
|
+
|
|
639
|
+
except Exception as e:
|
|
640
|
+
print(f"❌ Failed to find expired services: {e}")
|
|
641
|
+
return []
|
|
642
|
+
|
|
643
|
+
def _find_pr_services(self, pr_number: int) -> List[Dict[str, Any]]:
|
|
644
|
+
"""Find all ECS services for a specific PR"""
|
|
645
|
+
try:
|
|
646
|
+
pr_services = []
|
|
647
|
+
|
|
648
|
+
# List all services in cluster
|
|
649
|
+
response = self.ecs_client.list_services(cluster=self.cluster)
|
|
650
|
+
|
|
651
|
+
for service_arn in response.get("serviceArns", []):
|
|
652
|
+
service_name = service_arn.split("/")[-1]
|
|
653
|
+
|
|
654
|
+
# Check if service matches PR pattern: pr-{number}-{sha}-service
|
|
655
|
+
if service_name.startswith(f"pr-{pr_number}-") and service_name.endswith(
|
|
656
|
+
"-service"
|
|
657
|
+
):
|
|
658
|
+
try:
|
|
659
|
+
# Get service details
|
|
660
|
+
service_response = self.ecs_client.describe_services(
|
|
661
|
+
cluster=self.cluster, services=[service_name]
|
|
662
|
+
)
|
|
663
|
+
|
|
664
|
+
if service_response["services"]:
|
|
665
|
+
service = service_response["services"][0]
|
|
666
|
+
|
|
667
|
+
# Extract SHA from service name: pr-1234-abc123f-service -> abc123f
|
|
668
|
+
sha_match = service_name.replace(f"pr-{pr_number}-", "").replace(
|
|
669
|
+
"-service", ""
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
pr_services.append(
|
|
673
|
+
{
|
|
674
|
+
"service_name": service_name,
|
|
675
|
+
"service_arn": service_arn,
|
|
676
|
+
"sha": sha_match,
|
|
677
|
+
"status": service["status"],
|
|
678
|
+
"running_count": service["runningCount"],
|
|
679
|
+
"desired_count": service["desiredCount"],
|
|
680
|
+
"created_at": service["createdAt"],
|
|
681
|
+
}
|
|
682
|
+
)
|
|
683
|
+
|
|
684
|
+
except Exception as e:
|
|
685
|
+
print(f"⚠️ Could not check service {service_name}: {e}")
|
|
686
|
+
continue
|
|
687
|
+
|
|
688
|
+
return pr_services
|
|
689
|
+
|
|
690
|
+
except Exception as e:
|
|
691
|
+
print(f"❌ Failed to find PR services: {e}")
|
|
692
|
+
return []
|
|
693
|
+
|
|
694
|
+
def _wait_for_service_stability(self, service_name: str, timeout_minutes: int = 10) -> bool:
|
|
695
|
+
"""Wait for ECS service to become stable (replicate GHA wait-for-service-stability)"""
|
|
696
|
+
try:
|
|
697
|
+
# Use ECS waiter - same as GHA wait-for-service-stability
|
|
698
|
+
waiter = self.ecs_client.get_waiter("services_stable")
|
|
699
|
+
waiter.wait(
|
|
700
|
+
cluster=self.cluster,
|
|
701
|
+
services=[service_name],
|
|
702
|
+
WaiterConfig={"maxAttempts": timeout_minutes * 2}, # 30s intervals
|
|
703
|
+
)
|
|
704
|
+
|
|
705
|
+
print(f"✅ Service {service_name} is stable")
|
|
706
|
+
return True
|
|
707
|
+
|
|
708
|
+
except Exception as e:
|
|
709
|
+
print(f"❌ Service stability check failed: {e}")
|
|
710
|
+
return False
|
|
711
|
+
|
|
712
|
+
def _health_check_service(self, service_name: str, max_attempts: int = 6) -> bool:
    """Probe the service over HTTP until it answers with status 200.

    Each attempt requests ``/health`` on port 8080 and, if that does not
    return 200 (or fails with an httpx request error), the root page. The
    IP is looked up once before the first attempt, and attempts are
    separated by a 30 second sleep.

    Returns:
        True as soon as either URL returns 200; False when the IP cannot be
        determined, all attempts fail, or an unexpected error occurs.
    """
    import time

    import httpx

    try:
        # Get service IP
        address = self.get_environment_ip(service_name)
        if not address:
            print("❌ Could not get service IP for health check")
            return False

        base_url = f"http://{address}:8080"
        health_url = base_url + "/health"  # Superset health endpoint
        fallback_url = base_url + "/"  # Fallback to main page

        attempt_no = 0
        while attempt_no < max_attempts:
            attempt_no += 1
            try:
                with httpx.Client(timeout=10.0) as client:
                    # Try health endpoint first; connection-level failures
                    # fall through to the main page.
                    try:
                        if client.get(health_url).status_code == 200:
                            print(f"✅ Health check passed on attempt {attempt_no}")
                            return True
                    except httpx.RequestError:
                        pass

                    # Fallback to main page
                    if client.get(fallback_url).status_code == 200:
                        print(f"✅ Health check passed (main page) on attempt {attempt_no}")
                        return True

            except Exception as e:
                print(f"⚠️ Health check attempt {attempt_no} failed: {e}")

            # No sleep after the final attempt.
            if attempt_no < max_attempts:
                print("⏳ Waiting 30s before next health check attempt...")
                time.sleep(30)

        print(f"❌ Health check failed after {max_attempts} attempts")
        return False

    except Exception as e:
        print(f"❌ Health check error: {e}")
        return False
|