superset-showtime 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of superset-showtime might be problematic. Click here for more details.

showtime/core/aws.py ADDED
@@ -0,0 +1,758 @@
1
+ """
2
+ 🎪 AWS interface for circus tent environment management
3
+
4
+ Replicates the AWS logic from current GitHub Actions workflows.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import time
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ import boto3
15
+
16
+
17
@dataclass
class AWSError(Exception):
    """Error raised when an AWS operation performed by this module fails.

    The fields are populated by the dataclass-generated ``__init__``; the
    message is additionally forwarded to ``Exception`` so that ``str(exc)``,
    ``exc.args`` and logged tracebacks show it.
    """

    # Human-readable description of what went wrong.
    message: str
    # Name of the operation that failed (e.g. "delete_environment").
    operation: str
    # Identifier of the affected resource, when one is known.
    resource: Optional[str] = None

    def __post_init__(self) -> None:
        # The generated __init__ never calls Exception.__init__, which would
        # otherwise leave ``args`` empty and ``str(exc)`` blank.
        super().__init__(self.message)
24
+
25
+
26
@dataclass
class EnvironmentResult:
    """Outcome of an AWS environment operation.

    Only ``success`` is required; the remaining fields default to ``None``
    and are filled in by the caller as they become known.
    """

    # True when the operation completed, False otherwise.
    success: bool
    # Public IP of the environment, if one was resolved.
    ip: Optional[str] = None
    # Name of the ECS service backing the environment.
    service_name: Optional[str] = None
    # Description of the failure when ``success`` is False.
    error: Optional[str] = None
34
+
35
+
36
class AWSInterface:
    """AWS ECS/ECR client replicating the current GitHub Actions (GHA) logic.

    Services are named ``pr-{pr_number}-{sha}-service`` and images are tagged
    ``pr-{pr_number}-{sha}-ci``.
    """

    # Network configuration copied from the current GHA workflows.
    DEFAULT_SUBNETS = ("subnet-0e15a5034b4121710", "subnet-0e8efef4a72224974")
    DEFAULT_SECURITY_GROUP = "sg-092ff3a6ae0574d91"

    # Suffix appended to the Show's AWS service name.
    SERVICE_SUFFIX = "-service"

    # Polling interval used with the ECS "services_stable" waiter.
    WAITER_DELAY_SECONDS = 30

    def __init__(
        self,
        region: Optional[str] = None,
        cluster: Optional[str] = None,
        repository: Optional[str] = None,
        subnets: Optional[List[str]] = None,
        security_group: Optional[str] = None,
    ):
        """Create the boto3 clients and record cluster/network settings.

        Args:
            region: AWS region; falls back to ``AWS_REGION`` then "us-west-2".
            cluster: ECS cluster; falls back to ``ECS_CLUSTER`` then "superset-ci".
            repository: ECR repository; falls back to ``ECR_REPOSITORY`` then
                "superset-ci".
            subnets: Subnet IDs for the service; defaults to the GHA subnets.
            security_group: Security group ID; defaults to the GHA group.
        """
        self.region = region or os.getenv("AWS_REGION", "us-west-2")
        self.cluster = cluster or os.getenv("ECS_CLUSTER", "superset-ci")
        self.repository = repository or os.getenv("ECR_REPOSITORY", "superset-ci")

        # AWS clients
        self.ecs_client = boto3.client("ecs", region_name=self.region)
        self.ecr_client = boto3.client("ecr", region_name=self.region)
        self.ec2_client = boto3.client("ec2", region_name=self.region)

        # Network configuration (from current GHA unless overridden)
        self.subnets = list(subnets) if subnets else list(self.DEFAULT_SUBNETS)
        self.security_group = security_group or self.DEFAULT_SECURITY_GROUP

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _list_service_arns(self) -> List[str]:
        """Return the ARNs of all services in the cluster.

        ``list_services`` returns a single page per call, so the paginator is
        used to make sure services beyond the first page are not missed.
        """
        paginator = self.ecs_client.get_paginator("list_services")
        arns: List[str] = []
        for page in paginator.paginate(cluster=self.cluster):
            arns.extend(page.get("serviceArns", []))
        return arns

    def _ecr_proxy_host(self) -> str:
        """Return the ECR proxy endpoint without its ``https://`` scheme."""
        token = self.ecr_client.get_authorization_token()
        endpoint = token["authorizationData"][0]["proxyEndpoint"]
        return endpoint.replace("https://", "")

    def _ecr_registry_id(self) -> str:
        """Return the registry ID, i.e. the first label of the proxy host."""
        return self._ecr_proxy_host().split(".")[0]

    def _await_stable(self, service_name: str, timeout_minutes: int) -> None:
        """Block until the service is stable; raises if the waiter gives up.

        botocore only honours the capitalised ``Delay`` / ``MaxAttempts`` keys;
        any other key is ignored and the waiter's defaults are used instead.
        """
        attempts = max(1, (timeout_minutes * 60) // self.WAITER_DELAY_SECONDS)
        waiter = self.ecs_client.get_waiter("services_stable")
        waiter.wait(
            cluster=self.cluster,
            services=[service_name],
            WaiterConfig={"Delay": self.WAITER_DELAY_SECONDS, "MaxAttempts": attempts},
        )

    @classmethod
    def _sha_from_service_name(cls, service_name: str, pr_number: int) -> str:
        """Extract the SHA from ``pr-{pr_number}-{sha}[-service]``."""
        core = service_name
        if core.endswith(cls.SERVICE_SUFFIX):
            core = core[: -len(cls.SERVICE_SUFFIX)]
        prefix = f"pr-{pr_number}-"
        if core.startswith(prefix):
            return core[len(prefix):]
        # Unexpected shape: fall back to the last dash-separated component.
        return core.split("-")[-1]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def create_environment(
        self,
        pr_number: int,
        sha: str,
        github_user: str = "unknown",
        feature_flags: Optional[List[Dict[str, str]]] = None,
    ) -> "EnvironmentResult":
        """Create an ephemeral environment with blue-green deployment support.

        Steps:
            1. Check that the ECR image exists
            2. Register the task definition (image + feature flags)
            3. Look for existing services of the PR (blue)
            4. Create the new service for this SHA (green)
            5. Deploy the task definition to the green service
            6. Wait for deployment stability
            7. Health check the service over HTTP
            8. Resolve the public IP and return it for traffic switching

        Args:
            pr_number: Pull request number.
            sha: Commit SHA; only the first 7 characters are used.
            github_user: User requesting the environment (stored as a tag).
            feature_flags: ``{"name": ..., "value": ...}`` entries appended to
                the container environment.

        Returns:
            An ``EnvironmentResult``; failures are reported through ``error``
            rather than raised.
        """
        from datetime import datetime, timezone

        from .circus import Show

        # Create Show object for consistent AWS naming
        show = Show(
            pr_number=pr_number,
            sha=sha[:7],  # Truncate to 7 chars like GitHub
            status="building",
            created_at=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M"),
            requested_by=github_user,
        )

        service_name = f"{show.aws_service_name}-service"  # pr-{pr_number}-{sha}-service
        image_tag = show.aws_image_tag  # pr-{pr_number}-{sha}-ci

        try:
            # Step 1: Check if ECR image exists (replicate GHA check-image step)
            if not self._check_ecr_image_exists(image_tag):
                return EnvironmentResult(
                    success=False,
                    error=f"Container image {image_tag} not found in ECR. Build the image first.",
                )

            # Step 2: Create/update ECS task definition with feature flags
            task_def_arn = self._create_task_definition_with_image_and_flags(
                image_tag, feature_flags or []
            )
            if not task_def_arn:
                return EnvironmentResult(success=False, error="Failed to create task definition")

            # Step 3: Blue-Green Logic - Check for existing services
            print(f"🔍 Checking for existing services for PR #{pr_number}")
            existing_services = self._find_pr_services(pr_number)

            if existing_services:
                print(
                    f"📊 Found {len(existing_services)} existing services - starting blue-green deployment"
                )
                for svc in existing_services:
                    print(f"  🔵 Blue: {svc['service_name']} ({svc['status']})")

            # Step 4: Create new green service
            print(f"🟢 Creating green service: {service_name}")
            if not self._create_ecs_service(service_name, pr_number, github_user):
                return EnvironmentResult(success=False, error="Green service creation failed")

            # Step 5: Deploy task definition to green service
            if not self._deploy_task_definition(service_name, task_def_arn):
                return EnvironmentResult(
                    success=False, error="Green task definition deployment failed"
                )

            # Step 6: Wait for service stability (replicate GHA wait-for-service-stability)
            print(f"⏳ Waiting for service {service_name} to become stable...")
            if not self._wait_for_service_stability(service_name):
                return EnvironmentResult(success=False, error="Service failed to become stable")

            # Step 7: Health check the new service
            print(f"🏥 Health checking service {service_name}...")
            if not self._health_check_service(service_name):
                return EnvironmentResult(success=False, error="Service failed health checks")

            # Step 8: Get IP after health checks pass
            ip = self.get_environment_ip(service_name)
            if not ip:
                return EnvironmentResult(success=False, error="Failed to get environment IP")

            return EnvironmentResult(success=True, ip=ip, service_name=service_name)

        except Exception as e:
            return EnvironmentResult(success=False, error=str(e))

    def delete_environment(self, service_name: str, pr_number: int) -> bool:
        """Delete an ephemeral environment - replicates the cleanup GHA logic.

        Steps:
            1. Check if the ECS service exists and is active
            2. Delete the ECS service with ``force=True``
            3. Delete the matching ECR image tag (``pr-{pr}-{sha}-ci``)

        Returns:
            True when the environment is gone (including "already deleted").

        Raises:
            AWSError: if any AWS call fails.
        """
        try:
            # Step 1: Check if service exists and is active
            if not self._service_exists(service_name):
                return True  # Already deleted

            # Step 2: Delete ECS service (force delete)
            self.ecs_client.delete_service(cluster=self.cluster, service=service_name, force=True)

            # Step 3: Delete ECR image tag.
            # pr-1234-abc123f-service -> abc123f -> pr-1234-abc123f-ci
            sha = self._sha_from_service_name(service_name, pr_number)
            image_tag = f"pr-{pr_number}-{sha}-ci"

            try:
                response = self.ecr_client.batch_delete_image(
                    repositoryName=self.repository, imageIds=[{"imageTag": image_tag}]
                )
            except self.ecr_client.exceptions.ImageNotFoundException:
                response = None  # Image already deleted

            # batch_delete_image reports per-image problems in "failures"
            # instead of raising; a missing image is fine, anything else is
            # surfaced as a warning (the service itself is already deleted).
            for failure in (response or {}).get("failures", []):
                if failure.get("failureCode") != "ImageNotFound":
                    print(f"⚠️ Could not delete ECR image {image_tag}: {failure}")

            return True

        except Exception as e:
            raise AWSError(
                message=str(e), operation="delete_environment", resource=service_name
            ) from e

    def get_environment_ip(self, service_name: str) -> Optional[str]:
        """Get the public IP of an environment - replicates GHA IP discovery.

        Steps:
            1. List tasks for the service (the first task is used)
            2. Describe the task to find its network interface
            3. Read the public IP from the network interface

        Returns:
            The public IP, or None if any step yields nothing or fails.
        """
        try:
            # Step 1: List tasks
            task_arns = self.ecs_client.list_tasks(
                cluster=self.cluster, serviceName=service_name
            )["taskArns"]
            if not task_arns:
                return None

            # Step 2: Describe task to get network interface
            tasks = self.ecs_client.describe_tasks(cluster=self.cluster, tasks=[task_arns[0]])[
                "tasks"
            ]
            if not tasks:
                return None

            eni_id = next(
                (
                    detail["value"]
                    for attachment in tasks[0].get("attachments", [])
                    for detail in attachment.get("details", [])
                    if detail["name"] == "networkInterfaceId"
                ),
                None,
            )
            if not eni_id:
                return None

            # Step 3: Get public IP from network interface
            interfaces = self.ec2_client.describe_network_interfaces(
                NetworkInterfaceIds=[eni_id]
            )["NetworkInterfaces"]
            if not interfaces:
                return None

            return interfaces[0].get("Association", {}).get("PublicIp")

        except Exception:
            # Best-effort lookup: callers treat None as "no IP available".
            return None

    def get_environment_status(self, service_name: str) -> str:
        """Return "running", "building", "failed", "not_found" or "unknown"."""
        try:
            response = self.ecs_client.describe_services(
                cluster=self.cluster, services=[service_name]
            )

            if not response["services"]:
                return "not_found"

            service = response["services"][0]
            if service["status"] != "ACTIVE":
                return "failed"

            # ACTIVE: running only when every desired task is up.
            running_count = service["runningCount"]
            if running_count == service["desiredCount"] and running_count > 0:
                return "running"
            return "building"

        except Exception:
            return "unknown"

    def _check_ecr_image_exists(self, image_tag: str) -> bool:
        """Check if the ECR image exists (replicate GHA check-image step)."""
        try:
            # Replicate exact GHA describe-images command
            self.ecr_client.describe_images(
                registryId=self._ecr_registry_id(),
                repositoryName=self.repository,
                imageIds=[{"imageTag": image_tag}],
            )

            print(f"✅ Found ECR image: {image_tag}")
            return True

        except self.ecr_client.exceptions.ImageNotFoundException:
            print(f"❌ ECR image not found: {image_tag}")
            return False
        except Exception as e:
            print(f"❌ ECR image check failed: {e}")
            return False

    def _create_task_definition_with_image_and_flags(
        self, image_tag: str, feature_flags: List[Dict[str, str]]
    ) -> Optional[str]:
        """Register a task definition with the image and feature flags.

        Loads ``data/ecs-task-definition.json`` (relative to the package),
        points the first container at the image and appends the flags to its
        environment (replicates GHA render-task-definition + env vars).

        Returns:
            The new task definition ARN, or None on failure.
        """
        try:
            # Load base task definition template
            task_def_path = Path(__file__).parent.parent / "data" / "ecs-task-definition.json"
            with open(task_def_path, encoding="utf-8") as f:
                task_def = json.load(f)

            container = task_def["containerDefinitions"][0]

            # Update image in container definition
            container["image"] = f"{self._ecr_proxy_host()}/{self.repository}:{image_tag}"

            # Add feature flags; tolerate a template without "environment".
            container.setdefault("environment", []).extend(feature_flags)

            # Register task definition
            response = self.ecs_client.register_task_definition(**task_def)
            task_def_arn = response["taskDefinition"]["taskDefinitionArn"]

            print(f"✅ Created task definition: {task_def_arn}")
            return task_def_arn

        except Exception as e:
            print(f"❌ Task definition creation failed: {e}")
            return None

    def _deploy_task_definition(self, service_name: str, task_def_arn: str) -> bool:
        """Point the service at the task definition (replicate GHA deploy-task step)."""
        try:
            self.ecs_client.update_service(
                cluster=self.cluster, service=service_name, taskDefinition=task_def_arn
            )

            print(f"✅ Updated service {service_name} with task definition")
            return True

        except Exception as e:
            print(f"❌ Task definition deployment failed: {e}")
            return False

    def _service_exists(self, service_name: str) -> bool:
        """Return True if the ECS service exists and is ACTIVE."""
        try:
            response = self.ecs_client.describe_services(
                cluster=self.cluster, services=[service_name]
            )
            return any(service["status"] == "ACTIVE" for service in response["services"])

        except Exception:
            return False

    def _create_ecs_service(self, service_name: str, pr_number: int, github_user: str) -> bool:
        """Create the ECS service (replicate GHA create-service step).

        The service is tagged with its creation time and a 48-hour expiry so
        that ``find_expired_services`` can identify it later.
        """
        try:
            # Sample the clock once so both tags derive from the same instant.
            created_at = int(time.time())

            self.ecs_client.create_service(
                cluster=self.cluster,
                serviceName=service_name,  # pr-{pr_number}-{sha}-service
                taskDefinition=self.cluster,  # Uses cluster name as task def family
                launchType="FARGATE",
                desiredCount=1,
                platformVersion="LATEST",
                networkConfiguration={
                    "awsvpcConfiguration": {
                        "subnets": self.subnets,
                        "securityGroups": [self.security_group],
                        "assignPublicIp": "ENABLED",
                    }
                },
                tags=[
                    {"key": "pr", "value": str(pr_number)},
                    {"key": "github_user", "value": github_user},
                    {"key": "showtime_created", "value": str(created_at)},
                    {"key": "showtime_expires", "value": str(created_at + 48 * 3600)},  # 48 hours
                    {"key": "showtime_managed", "value": "true"},
                ],
            )

            print(f"✅ Created ECS service: {service_name}")
            return True

        except Exception as e:
            print(f"❌ ECS service creation failed: {e}")
            return False

    def _wait_for_deployment_and_get_ip(
        self, service_name: str, timeout_minutes: int = 10
    ) -> Optional[str]:
        """Wait for the ECS deployment to stabilise, then return its public IP.

        Returns None if the wait times out or the IP cannot be resolved.
        """
        try:
            self._await_stable(service_name, timeout_minutes)
            return self.get_environment_ip(service_name)

        except Exception:
            return None

    def list_circus_environments(self) -> List[Dict[str, Any]]:
        """List all services whose name follows the ``pr-{number}-{sha}`` pattern.

        Returns an empty list if any AWS call fails.
        """
        try:
            circus_services = []
            for service_arn in self._list_service_arns():
                service_name = service_arn.split("/")[-1]

                # Check if it's a circus service (pr-{number}-{sha} pattern)
                if not service_name.startswith("pr-") or len(service_name.split("-")) < 3:
                    continue

                service_response = self.ecs_client.describe_services(
                    cluster=self.cluster, services=[service_name]
                )
                if not service_response["services"]:
                    continue

                service = service_response["services"][0]
                circus_services.append(
                    {
                        "service_name": service_name,
                        "status": service["status"],
                        "running_count": service["runningCount"],
                        "desired_count": service["desiredCount"],
                        "created_at": service["createdAt"],
                        "ip": self.get_environment_ip(service_name),
                    }
                )

            return circus_services

        except Exception:
            return []

    def cleanup_orphaned_environments(self, max_age_hours: int = 48) -> List[str]:
        """Delete environments older than ``max_age_hours``.

        Returns:
            Names of the services that were deleted.

        Raises:
            AWSError: if listing or deleting fails.
        """
        try:
            orphaned = []
            cutoff = time.time() - max_age_hours * 3600

            for service in self.list_circus_environments():
                if service["created_at"].timestamp() >= cutoff:
                    continue  # Still young enough to keep

                service_name = service["service_name"]

                # Extract PR number; skip names like "pr-foo-bar" instead of
                # aborting the whole cleanup run on a ValueError.
                pr_part = service_name.split("-")[1]
                if not pr_part.isdigit():
                    print(f"⚠️ Could not check service {service_name}: no PR number in name")
                    continue

                if self.delete_environment(service_name, int(pr_part)):
                    orphaned.append(service_name)

            return orphaned

        except Exception as e:
            raise AWSError(message=str(e), operation="cleanup_orphaned_environments") from e

    def update_feature_flags(self, service_name: str, feature_flags: Dict[str, bool]) -> bool:
        """Update feature flags in a running environment.

        Registers a new revision of the service's task definition whose first
        container has each flag set to "True"/"False", then points the service
        at it.

        Returns:
            True on success, False if the service is missing or a call fails.
        """
        try:
            # Get current task definition
            service_response = self.ecs_client.describe_services(
                cluster=self.cluster, services=[service_name]
            )
            if not service_response["services"]:
                return False

            task_def_arn = service_response["services"][0]["taskDefinition"]
            task_def = self.ecs_client.describe_task_definition(taskDefinition=task_def_arn)[
                "taskDefinition"
            ]

            # Replace the flags: keep unrelated variables, then append the
            # new values in the order given.
            container_def = task_def["containerDefinitions"][0]
            kept = [
                entry
                for entry in container_def.get("environment", [])
                if entry["name"] not in feature_flags
            ]
            updated = [
                {"name": flag_name, "value": "True" if enabled else "False"}
                for flag_name, enabled in feature_flags.items()
            ]
            container_def["environment"] = kept + updated

            # boto3 rejects None for string parameters, so optional settings
            # are forwarded only when the current definition has them.
            register_kwargs = {
                "family": task_def["family"],
                "containerDefinitions": task_def["containerDefinitions"],
            }
            for key in (
                "requiresCompatibilities",
                "networkMode",
                "cpu",
                "memory",
                "executionRoleArn",
                "taskRoleArn",
            ):
                if task_def.get(key) is not None:
                    register_kwargs[key] = task_def[key]

            new_task_def = self.ecs_client.register_task_definition(**register_kwargs)

            # Update service to use new task definition
            self.ecs_client.update_service(
                cluster=self.cluster,
                service=service_name,
                taskDefinition=new_task_def["taskDefinition"]["taskDefinitionArn"],
            )

            return True

        except Exception as e:
            print(f"Feature flag update failed: {e}")
            return False

    def _delete_ecs_service(self, service_name: str) -> bool:
        """Force-delete the ECS service (replicate GHA delete-service step)."""
        try:
            self.ecs_client.delete_service(cluster=self.cluster, service=service_name, force=True)

            print(f"✅ Deleted ECS service: {service_name}")
            return True

        except Exception as e:
            print(f"❌ ECS service deletion failed: {e}")
            return False

    def _delete_ecr_image(self, image_tag: str) -> bool:
        """Delete an ECR image tag (replicate GHA batch-delete-image step).

        Returns:
            True if the tag was deleted or did not exist, False otherwise.
        """
        try:
            response = self.ecr_client.batch_delete_image(
                registryId=self._ecr_registry_id(),
                repositoryName=self.repository,
                imageIds=[{"imageTag": image_tag}],
            )

            # Per-image problems come back in "failures" rather than raising.
            failures = (response or {}).get("failures", [])
            hard_failures = [f for f in failures if f.get("failureCode") != "ImageNotFound"]
            if hard_failures:
                print(f"❌ ECR image deletion failed: {hard_failures}")
                return False
            if failures:
                print(f"⚠️ ECR image not found: {image_tag} (already deleted)")
                return True  # Consider this success since it's already gone

            print(f"✅ Deleted ECR image: {image_tag}")
            return True

        except self.ecr_client.exceptions.ImageNotFoundException:
            print(f"⚠️ ECR image not found: {image_tag} (already deleted)")
            return True  # Consider this success since it's already gone
        except Exception as e:
            print(f"❌ ECR image deletion failed: {e}")
            return False

    def find_expired_services(self, older_than: str) -> List[Dict[str, Any]]:
        """Find showtime-managed ECS services that are expired.

        A service qualifies when its ``showtime_expires`` tag is in the past
        or its ``showtime_created`` tag is older than ``older_than``.

        Args:
            older_than: Age threshold such as "48h" or "7d".

        Returns:
            One dict per expired service; an empty list when ``older_than``
            cannot be parsed or the listing fails.
        """
        import re

        try:
            # Parse older_than (e.g., "48h", "7d")
            time_match = re.match(r"(\d+)([hd])", older_than)
            if not time_match:
                return []

            hours = int(time_match.group(1))
            if time_match.group(2) == "d":
                hours *= 24

            now = time.time()
            cutoff_timestamp = now - (hours * 3600)
            expired_services = []

            for service_arn in self._list_service_arns():
                service_name = service_arn.split("/")[-1]

                # Only check services that match showtime pattern: pr-...-service
                if not service_name.startswith("pr-") or not service_name.endswith(
                    self.SERVICE_SUFFIX
                ):
                    continue

                try:
                    # Get service tags to check expiration
                    tags_response = self.ecs_client.list_tags_for_resource(resourceArn=service_arn)
                    tags = {tag["key"]: tag["value"] for tag in tags_response.get("tags", [])}

                    # Only process services managed by showtime
                    if tags.get("showtime_managed") != "true":
                        continue

                    expires_timestamp = tags.get("showtime_expires")
                    created_timestamp = tags.get("showtime_created")

                    past_expiry = bool(expires_timestamp) and float(expires_timestamp) < now
                    too_old = bool(created_timestamp) and float(created_timestamp) < cutoff_timestamp
                    if not (past_expiry or too_old):
                        continue

                    # pr-1234-abc123f-service -> 1234
                    pr_match = re.match(r"pr-(\d+)-", service_name)
                    pr_number = int(pr_match.group(1)) if pr_match else None

                    age_hours = (now - float(created_timestamp)) / 3600 if created_timestamp else 0

                    expired_services.append(
                        {
                            "service_name": service_name,
                            "service_arn": service_arn,
                            "pr_number": pr_number,
                            "age_hours": age_hours,
                            "expires_timestamp": expires_timestamp,
                            "tags": tags,
                        }
                    )

                except Exception as e:
                    print(f"⚠️ Could not check service {service_name}: {e}")
                    continue

            return expired_services

        except Exception as e:
            print(f"❌ Failed to find expired services: {e}")
            return []

    def _find_pr_services(self, pr_number: int) -> List[Dict[str, Any]]:
        """Find all ECS services named ``pr-{pr_number}-{sha}-service``.

        Returns an empty list if the listing fails; individual services that
        cannot be described are skipped with a warning.
        """
        try:
            pr_services = []
            prefix = f"pr-{pr_number}-"

            for service_arn in self._list_service_arns():
                service_name = service_arn.split("/")[-1]

                if not (
                    service_name.startswith(prefix)
                    and service_name.endswith(self.SERVICE_SUFFIX)
                ):
                    continue

                try:
                    service_response = self.ecs_client.describe_services(
                        cluster=self.cluster, services=[service_name]
                    )
                    if not service_response["services"]:
                        continue

                    service = service_response["services"][0]
                    pr_services.append(
                        {
                            "service_name": service_name,
                            "service_arn": service_arn,
                            # pr-1234-abc123f-service -> abc123f
                            "sha": self._sha_from_service_name(service_name, pr_number),
                            "status": service["status"],
                            "running_count": service["runningCount"],
                            "desired_count": service["desiredCount"],
                            "created_at": service["createdAt"],
                        }
                    )

                except Exception as e:
                    print(f"⚠️ Could not check service {service_name}: {e}")
                    continue

            return pr_services

        except Exception as e:
            print(f"❌ Failed to find PR services: {e}")
            return []

    def _wait_for_service_stability(self, service_name: str, timeout_minutes: int = 10) -> bool:
        """Wait for the ECS service to become stable.

        Replicates GHA wait-for-service-stability using the ECS waiter,
        polling every 30 seconds for up to ``timeout_minutes``.
        """
        try:
            self._await_stable(service_name, timeout_minutes)

            print(f"✅ Service {service_name} is stable")
            return True

        except Exception as e:
            print(f"❌ Service stability check failed: {e}")
            return False

    def _health_check_service(self, service_name: str, max_attempts: int = 6) -> bool:
        """Health check the service by requesting it over HTTP on port 8080.

        Each attempt tries ``/health`` first and then ``/``; a 200 from either
        passes. Attempts are spaced 30 seconds apart.
        """
        import httpx

        try:
            # Get service IP
            ip = self.get_environment_ip(service_name)
            if not ip:
                print("❌ Could not get service IP for health check")
                return False

            health_url = f"http://{ip}:8080/health"  # Superset health endpoint
            fallback_url = f"http://{ip}:8080/"  # Fallback to main page

            for attempt in range(max_attempts):
                try:
                    with httpx.Client(timeout=10.0) as client:
                        # Try health endpoint first
                        try:
                            response = client.get(health_url)
                            if response.status_code == 200:
                                print(f"✅ Health check passed on attempt {attempt + 1}")
                                return True
                        except httpx.RequestError:
                            pass  # Fall through to the main page

                        # Fallback to main page
                        response = client.get(fallback_url)
                        if response.status_code == 200:
                            print(f"✅ Health check passed (main page) on attempt {attempt + 1}")
                            return True

                except Exception as e:
                    print(f"⚠️ Health check attempt {attempt + 1} failed: {e}")

                if attempt < max_attempts - 1:
                    print("⏳ Waiting 30s before next health check attempt...")
                    time.sleep(30)

            print(f"❌ Health check failed after {max_attempts} attempts")
            return False

        except Exception as e:
            print(f"❌ Health check error: {e}")
            return False