nuvu-scan 1.3.8__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
1
1
  """
2
2
  Amazon Redshift collector.
3
3
 
4
- Collects Redshift clusters and serverless namespaces.
4
+ Collects Redshift clusters, serverless namespaces, datashares, and external schemas.
5
5
  """
6
6
 
7
+ from datetime import datetime, timedelta, timezone
7
8
  from typing import Any
8
9
 
9
10
  import boto3
@@ -18,21 +19,96 @@ class RedshiftCollector:
18
19
  def __init__(self, session: boto3.Session, regions: list[str] | None = None):
19
20
  self.session = session
20
21
  self.regions = regions or []
22
+ self._account_id: str | None = None
21
23
 
22
24
  def collect(self) -> list[Asset]:
23
- """Collect Redshift clusters and serverless namespaces."""
25
+ """Collect all Redshift resources."""
26
+ import sys
27
+
24
28
  assets = []
25
29
 
26
- # Collect provisioned clusters
27
- assets.extend(self._collect_clusters())
30
+ # Collect reserved nodes first to compare with clusters
31
+ print(" → Checking reserved nodes...", file=sys.stderr)
32
+ self._reserved_nodes = self._get_reserved_nodes()
28
33
 
29
- # Collect serverless namespaces
30
- assets.extend(self._collect_serverless())
34
+ # Collect provisioned clusters
35
+ print(" → Collecting Redshift clusters...", file=sys.stderr)
36
+ cluster_assets = self._collect_clusters()
37
+ assets.extend(cluster_assets)
38
+ print(f" → Found {len(cluster_assets)} clusters", file=sys.stderr)
39
+
40
+ # Collect serverless namespaces and workgroups
41
+ print(" → Collecting Redshift Serverless...", file=sys.stderr)
42
+ serverless_assets = self._collect_serverless()
43
+ assets.extend(serverless_assets)
44
+ print(f" → Found {len(serverless_assets)} serverless resources", file=sys.stderr)
45
+
46
+ # Collect datashares (cross-account data sharing)
47
+ print(" → Collecting Redshift datashares...", file=sys.stderr)
48
+ datashare_assets = self._collect_datashares()
49
+ assets.extend(datashare_assets)
50
+ print(f" → Found {len(datashare_assets)} datashares", file=sys.stderr)
51
+
52
+ # Collect snapshots (cost saving opportunity)
53
+ print(" → Collecting Redshift snapshots (may take a moment)...", file=sys.stderr)
54
+ snapshot_assets = self._collect_snapshots()
55
+ assets.extend(snapshot_assets)
56
+ print(f" → Found {len(snapshot_assets)} snapshots", file=sys.stderr)
57
+
58
+ # Collect reserved node info as assets (for visibility)
59
+ reserved_assets = self._collect_reserved_nodes_as_assets()
60
+ assets.extend(reserved_assets)
61
+ print(f" → Found {len(reserved_assets)} reserved nodes", file=sys.stderr)
31
62
 
32
63
  return assets
33
64
 
65
+ def _get_reserved_nodes(self) -> dict[str, list[dict]]:
66
+ """Get reserved nodes per region for comparison with on-demand clusters."""
67
+ reserved_by_region = {}
68
+
69
+ regions_to_check = self.regions if self.regions else ["us-east-1"]
70
+
71
+ for region in regions_to_check:
72
+ try:
73
+ redshift_client = self.session.client("redshift", region_name=region)
74
+ response = redshift_client.describe_reserved_nodes()
75
+
76
+ active_reservations = []
77
+ for node in response.get("ReservedNodes", []):
78
+ if node.get("State") == "active":
79
+ active_reservations.append(
80
+ {
81
+ "node_type": node.get("NodeType"),
82
+ "node_count": node.get("NodeCount", 0),
83
+ "duration": node.get("Duration", 0),
84
+ "start_time": node.get("StartTime"),
85
+ "offering_type": node.get("OfferingType"),
86
+ "reserved_node_id": node.get("ReservedNodeId"),
87
+ }
88
+ )
89
+
90
+ reserved_by_region[region] = active_reservations
91
+
92
+ except ClientError as e:
93
+ if "AccessDenied" not in str(e):
94
+ print(f"Error getting reserved nodes in {region}: {e}")
95
+ reserved_by_region[region] = []
96
+
97
+ return reserved_by_region
98
+
99
+ def _get_account_id(self) -> str:
100
+ """Get AWS account ID."""
101
+ if self._account_id:
102
+ return self._account_id
103
+ try:
104
+ sts = self.session.client("sts")
105
+ self._account_id = sts.get_caller_identity()["Account"]
106
+ return self._account_id
107
+ except ClientError:
108
+ return ""
109
+
34
110
  def _collect_clusters(self) -> list[Asset]:
35
- """Collect provisioned Redshift clusters."""
111
+ """Collect provisioned Redshift clusters with enhanced metrics."""
36
112
  assets = []
37
113
 
38
114
  regions_to_check = self.regions if self.regions else ["us-east-1"]
@@ -47,20 +123,86 @@ class RedshiftCollector:
47
123
  for cluster in response.get("Clusters", []):
48
124
  cluster_id = cluster["ClusterIdentifier"]
49
125
 
50
- # Get cluster status and usage
126
+ # Get cluster status and configuration
51
127
  status = cluster.get("ClusterStatus", "unknown")
52
128
  node_count = cluster.get("NumberOfNodes", 0)
53
129
  node_type = cluster.get("NodeType", "")
130
+ db_name = cluster.get("DBName", "")
131
+
132
+ # Get encryption status
133
+ encrypted = cluster.get("Encrypted", False)
134
+
135
+ # Get VPC security info
136
+ publicly_accessible = cluster.get("PubliclyAccessible", False)
137
+ vpc_id = cluster.get("VpcId", "")
138
+
139
+ # Get tags
140
+ tags = {tag["Key"]: tag["Value"] for tag in cluster.get("Tags", [])}
141
+ ownership = self._infer_ownership(tags, cluster_id)
142
+
143
+ # Get last activity from CloudWatch metrics
144
+ last_activity = self._get_last_activity_cloudwatch(cluster_id, region)
145
+ days_since_last_use = self._calculate_days_since_last_use(last_activity)
54
146
 
147
+ # Check if cluster is covered by reserved nodes
148
+ reservation_status = self._check_reservation_coverage(
149
+ region, node_type, node_count
150
+ )
151
+
152
+ # Get WLM configuration
153
+ wlm_config = self._get_wlm_configuration(redshift_client, cluster_id)
154
+
155
+ # Calculate cluster age for reservation recommendation
156
+ create_time = cluster.get("ClusterCreateTime")
157
+ cluster_age_days = None
158
+ if create_time:
159
+ cluster_age_days = (
160
+ datetime.now(timezone.utc) - create_time.replace(tzinfo=timezone.utc)
161
+ ).days
162
+
163
+ # Build risk flags
55
164
  risk_flags = []
56
- if status == "available" and node_count > 0:
57
- # Check if cluster is idle (no recent queries)
58
- # This would require querying system tables, simplified here
59
- pass
165
+ if publicly_accessible:
166
+ risk_flags.append("publicly_accessible")
167
+ if not encrypted:
168
+ risk_flags.append("unencrypted")
169
+ if days_since_last_use is not None and days_since_last_use > 30:
170
+ risk_flags.append("low_activity")
171
+ if days_since_last_use is not None and days_since_last_use > 90:
172
+ risk_flags.append("potentially_unused")
173
+
174
+ # Reservation-related risks (cost optimization)
175
+ if (
176
+ not reservation_status["covered"]
177
+ and cluster_age_days
178
+ and cluster_age_days > 90
179
+ ):
180
+ risk_flags.append("no_reservation_long_running")
181
+
182
+ # WLM risks
183
+ if wlm_config.get("is_default_only"):
184
+ risk_flags.append("default_wlm_only")
185
+ if wlm_config.get("has_unlimited_queue"):
186
+ risk_flags.append("unlimited_wlm_queue")
187
+
188
+ # Get maintenance window info
189
+ maintenance_window = cluster.get("PreferredMaintenanceWindow", "")
60
190
 
61
191
  # Estimate cost based on node type and count
62
192
  monthly_cost = self._estimate_cluster_cost(node_type, node_count)
63
193
 
194
+ # Calculate potential savings from reservation
195
+ potential_reservation_savings = 0.0
196
+ if (
197
+ not reservation_status["covered"]
198
+ and cluster_age_days
199
+ and cluster_age_days > 30
200
+ ):
201
+ # Reserved nodes typically save 30-75% depending on term
202
+ potential_reservation_savings = (
203
+ monthly_cost * 0.40
204
+ ) # Conservative 40% estimate
205
+
64
206
  assets.append(
65
207
  Asset(
66
208
  provider="aws",
@@ -70,7 +212,7 @@ class RedshiftCollector:
70
212
  region=region,
71
213
  arn=cluster.get(
72
214
  "ClusterNamespaceArn",
73
- f"arn:aws:redshift:{region}::cluster:{cluster_id}",
215
+ f"arn:aws:redshift:{region}:{self._get_account_id()}:cluster:{cluster_id}",
74
216
  ),
75
217
  name=cluster_id,
76
218
  created_at=(
@@ -78,16 +220,38 @@ class RedshiftCollector:
78
220
  if cluster.get("ClusterCreateTime")
79
221
  else None
80
222
  ),
223
+ tags=tags,
81
224
  risk_flags=risk_flags,
82
- last_activity_at=self._get_last_activity(cluster_id, region),
225
+ ownership_confidence=ownership["confidence"],
226
+ suggested_owner=ownership["owner"],
227
+ last_activity_at=last_activity,
83
228
  usage_metrics={
84
229
  "status": status,
85
230
  "node_count": node_count,
86
231
  "node_type": node_type,
87
- "last_used": self._get_last_activity(cluster_id, region),
88
- "days_since_last_use": self._calculate_days_since_last_use(
89
- self._get_last_activity(cluster_id, region)
232
+ "database_name": db_name,
233
+ "encrypted": encrypted,
234
+ "publicly_accessible": publicly_accessible,
235
+ "vpc_id": vpc_id,
236
+ "maintenance_window": maintenance_window,
237
+ "cluster_version": cluster.get("ClusterVersion", ""),
238
+ "cluster_age_days": cluster_age_days,
239
+ "last_used": last_activity,
240
+ "days_since_last_use": days_since_last_use,
241
+ # Reservation info
242
+ "has_reservation": reservation_status["covered"],
243
+ "reserved_nodes_count": reservation_status.get("reserved_count", 0),
244
+ "on_demand_nodes_count": reservation_status.get(
245
+ "on_demand_count", node_count
246
+ ),
247
+ "potential_reservation_savings_usd": potential_reservation_savings,
248
+ # WLM configuration
249
+ "wlm_queue_count": wlm_config.get("queue_count", 0),
250
+ "wlm_is_default_only": wlm_config.get("is_default_only", True),
251
+ "wlm_has_unlimited_queue": wlm_config.get(
252
+ "has_unlimited_queue", False
90
253
  ),
254
+ "wlm_auto_wlm": wlm_config.get("auto_wlm", False),
91
255
  },
92
256
  cost_estimate_usd=monthly_cost,
93
257
  )
@@ -99,7 +263,7 @@ class RedshiftCollector:
99
263
  return assets
100
264
 
101
265
  def _collect_serverless(self) -> list[Asset]:
102
- """Collect Redshift Serverless namespaces."""
266
+ """Collect Redshift Serverless namespaces and workgroups."""
103
267
  assets = []
104
268
 
105
269
  regions_to_check = self.regions if self.regions else ["us-east-1"]
@@ -113,16 +277,24 @@ class RedshiftCollector:
113
277
 
114
278
  for namespace in response.get("namespaces", []):
115
279
  namespace_name = namespace.get("namespaceName", "")
280
+ namespace_id = namespace.get("namespaceId", "")
116
281
 
117
282
  # Get workgroups for namespace
118
283
  workgroups_response = redshift_client.list_workgroups()
119
- workgroup_count = len(
120
- [
121
- wg
122
- for wg in workgroups_response.get("workgroups", [])
123
- if wg.get("namespaceName") == namespace_name
124
- ]
125
- )
284
+ associated_workgroups = [
285
+ wg
286
+ for wg in workgroups_response.get("workgroups", [])
287
+ if wg.get("namespaceName") == namespace_name
288
+ ]
289
+ workgroup_count = len(associated_workgroups)
290
+
291
+ # Check for encryption
292
+ kms_key = namespace.get("kmsKeyId")
293
+ encrypted = bool(kms_key)
294
+
295
+ risk_flags = []
296
+ if not encrypted:
297
+ risk_flags.append("unencrypted")
126
298
 
127
299
  assets.append(
128
300
  Asset(
@@ -133,7 +305,7 @@ class RedshiftCollector:
133
305
  region=region,
134
306
  arn=namespace.get(
135
307
  "namespaceArn",
136
- f"arn:aws:redshift-serverless:{region}::namespace/{namespace_name}",
308
+ f"arn:aws:redshift-serverless:{region}:{self._get_account_id()}:namespace/{namespace_id}",
137
309
  ),
138
310
  name=namespace_name,
139
311
  created_at=(
@@ -141,52 +313,263 @@ class RedshiftCollector:
141
313
  if namespace.get("creationDate")
142
314
  else None
143
315
  ),
144
- last_activity_at=None, # Serverless doesn't have direct last activity
316
+ risk_flags=risk_flags,
317
+ last_activity_at=None,
145
318
  usage_metrics={
319
+ "namespace_id": namespace_id,
146
320
  "workgroup_count": workgroup_count,
147
321
  "status": namespace.get("status", "unknown"),
322
+ "db_name": namespace.get("dbName", ""),
323
+ "admin_username": namespace.get("adminUsername", ""),
324
+ "encrypted": encrypted,
148
325
  "last_used": None,
149
326
  "days_since_last_use": None,
150
327
  },
151
328
  )
152
329
  )
153
330
 
331
+ # Collect individual workgroups
332
+ for wg in associated_workgroups:
333
+ wg_name = wg.get("workgroupName", "")
334
+ base_capacity = wg.get("baseCapacity", 0)
335
+
336
+ # Estimate cost: Serverless charges $0.36/RPU-hour
337
+ # Assume 10% utilization for base estimate
338
+ estimated_monthly_cost = base_capacity * 0.36 * 24 * 30 * 0.1
339
+
340
+ # Check public accessibility
341
+ publicly_accessible = wg.get("publiclyAccessible", False)
342
+
343
+ wg_risk_flags = []
344
+ if publicly_accessible:
345
+ wg_risk_flags.append("publicly_accessible")
346
+
347
+ assets.append(
348
+ Asset(
349
+ provider="aws",
350
+ asset_type="redshift_serverless_workgroup",
351
+ normalized_category=NormalizedCategory.DATA_WAREHOUSE,
352
+ service="Redshift Serverless",
353
+ region=region,
354
+ arn=wg.get(
355
+ "workgroupArn",
356
+ f"arn:aws:redshift-serverless:{region}:{self._get_account_id()}:workgroup/{wg_name}",
357
+ ),
358
+ name=wg_name,
359
+ created_at=(
360
+ wg.get("creationDate", "").isoformat()
361
+ if wg.get("creationDate")
362
+ else None
363
+ ),
364
+ risk_flags=wg_risk_flags,
365
+ cost_estimate_usd=estimated_monthly_cost,
366
+ usage_metrics={
367
+ "namespace_name": namespace_name,
368
+ "base_capacity": base_capacity,
369
+ "status": wg.get("status", "unknown"),
370
+ "publicly_accessible": publicly_accessible,
371
+ "enhanced_vpc_routing": wg.get("enhancedVpcRouting", False),
372
+ },
373
+ )
374
+ )
375
+
154
376
  except ClientError as e:
155
377
  print(f"Error collecting Redshift Serverless in {region}: {e}")
156
378
 
157
379
  return assets
158
380
 
381
+ def _collect_datashares(self) -> list[Asset]:
382
+ """Collect Redshift Datashares (cross-account data sharing)."""
383
+ assets = []
384
+
385
+ regions_to_check = self.regions if self.regions else ["us-east-1"]
386
+
387
+ for region in regions_to_check:
388
+ try:
389
+ redshift_client = self.session.client("redshift", region_name=region)
390
+
391
+ # Get all datashares
392
+ try:
393
+ response = redshift_client.describe_data_shares()
394
+
395
+ for datashare in response.get("DataShares", []):
396
+ share_arn = datashare.get("DataShareArn", "")
397
+ share_name = share_arn.split("/")[-1] if "/" in share_arn else share_arn
398
+ producer_arn = datashare.get("ProducerArn", "")
399
+
400
+ # Get associations (consumers)
401
+ associations = datashare.get("DataShareAssociations", [])
402
+ consumer_accounts = []
403
+ cross_account = False
404
+ cross_region = False
405
+
406
+ for assoc in associations:
407
+ consumer_id = assoc.get("ConsumerIdentifier", "")
408
+ consumer_region = assoc.get("ConsumerRegion", "")
409
+ status = assoc.get("Status", "")
410
+
411
+ if consumer_id and consumer_id != self._get_account_id():
412
+ cross_account = True
413
+ if consumer_region and consumer_region != region:
414
+ cross_region = True
415
+
416
+ consumer_accounts.append(
417
+ {
418
+ "account_id": consumer_id,
419
+ "region": consumer_region,
420
+ "status": status,
421
+ }
422
+ )
423
+
424
+ # Build risk flags
425
+ risk_flags = []
426
+ if cross_account:
427
+ risk_flags.append("cross_account_sharing")
428
+ if cross_region:
429
+ risk_flags.append("cross_region_sharing")
430
+ if datashare.get("AllowPubliclyAccessibleConsumers", False):
431
+ risk_flags.append("allows_public_consumers")
432
+
433
+ # Determine share type
434
+ share_type = (
435
+ "OUTBOUND"
436
+ if producer_arn.split(":")[4] == self._get_account_id()
437
+ else "INBOUND"
438
+ )
439
+
440
+ assets.append(
441
+ Asset(
442
+ provider="aws",
443
+ asset_type="redshift_datashare",
444
+ normalized_category=NormalizedCategory.DATA_SHARING,
445
+ service="Redshift",
446
+ region=region,
447
+ arn=share_arn,
448
+ name=share_name,
449
+ risk_flags=risk_flags,
450
+ usage_metrics={
451
+ "share_type": share_type,
452
+ "producer_arn": producer_arn,
453
+ "consumer_count": len(consumer_accounts),
454
+ "consumers": consumer_accounts[:5], # Limit to first 5 for size
455
+ "cross_account": cross_account,
456
+ "cross_region": cross_region,
457
+ "allows_public_consumers": datashare.get(
458
+ "AllowPubliclyAccessibleConsumers", False
459
+ ),
460
+ },
461
+ )
462
+ )
463
+
464
+ except ClientError as e:
465
+ if "AccessDenied" not in str(e):
466
+ print(f"Error collecting datashares in {region}: {e}")
467
+
468
+ except ClientError as e:
469
+ print(f"Error collecting Redshift datashares in {region}: {e}")
470
+
471
+ return assets
472
+
159
473
  def _estimate_cluster_cost(self, node_type: str, node_count: int) -> float:
160
474
  """Estimate monthly cost for Redshift cluster."""
161
- # Redshift pricing (approximate, as of 2024)
162
- # dc2.large: ~$0.25/hour = ~$180/month
163
- # ra3.xlplus: ~$3.26/hour = ~$2,347/month
164
- # etc.
165
-
475
+ # Redshift pricing (approximate, as of 2024-2025)
166
476
  pricing = {
167
477
  "dc2.large": 180.0,
168
478
  "dc2.8xlarge": 1440.0,
169
479
  "ra3.xlplus": 2347.0,
170
480
  "ra3.4xlarge": 4694.0,
171
481
  "ra3.16xlarge": 18776.0,
482
+ "ds2.xlarge": 850.0,
483
+ "ds2.8xlarge": 6800.0,
172
484
  }
173
485
 
174
486
  base_cost = pricing.get(node_type, 500.0) # Default estimate
175
487
  return base_cost * node_count
176
488
 
489
+ def _infer_ownership(self, tags: dict[str, str], name: str) -> dict[str, str]:
490
+ """Infer ownership from tags."""
491
+ owner = None
492
+ confidence = "unknown"
493
+
494
+ for key in ["owner", "Owner", "team", "Team", "created-by", "CreatedBy"]:
495
+ if key in tags:
496
+ owner = tags[key]
497
+ confidence = "high" if key.lower() == "owner" else "medium"
498
+ break
499
+
500
+ return {"owner": owner, "confidence": confidence}
501
+
177
502
  def get_usage_metrics(self, asset: Asset) -> dict[str, Any]:
178
503
  """Get usage metrics for Redshift asset."""
179
504
  return asset.usage_metrics or {}
180
505
 
181
- def _get_last_activity(self, cluster_id: str, region: str) -> str | None:
182
- """Get last activity timestamp for a Redshift cluster."""
183
- from datetime import datetime, timedelta
506
+ def _get_last_activity_cloudwatch(self, cluster_id: str, region: str) -> str | None:
507
+ """Get last activity timestamp using CloudWatch metrics (more reliable than CloudTrail)."""
508
+ try:
509
+ cloudwatch = self.session.client("cloudwatch", region_name=region)
510
+
511
+ end_time = datetime.now(timezone.utc)
512
+ start_time = end_time - timedelta(days=14) # Look back 14 days
513
+
514
+ # Check DatabaseConnections metric - indicates actual usage
515
+ response = cloudwatch.get_metric_statistics(
516
+ Namespace="AWS/Redshift",
517
+ MetricName="DatabaseConnections",
518
+ Dimensions=[
519
+ {"Name": "ClusterIdentifier", "Value": cluster_id},
520
+ ],
521
+ StartTime=start_time,
522
+ EndTime=end_time,
523
+ Period=3600, # 1 hour granularity
524
+ Statistics=["Maximum"],
525
+ )
526
+
527
+ datapoints = response.get("Datapoints", [])
528
+ if datapoints:
529
+ # Find the most recent datapoint with connections > 0
530
+ active_points = [dp for dp in datapoints if dp.get("Maximum", 0) > 0]
531
+ if active_points:
532
+ latest = max(active_points, key=lambda x: x["Timestamp"])
533
+ return latest["Timestamp"].isoformat()
534
+ else:
535
+ # No connections in the last 14 days
536
+ return None
537
+
538
+ # Fallback to CPUUtilization as activity indicator
539
+ response = cloudwatch.get_metric_statistics(
540
+ Namespace="AWS/Redshift",
541
+ MetricName="CPUUtilization",
542
+ Dimensions=[
543
+ {"Name": "ClusterIdentifier", "Value": cluster_id},
544
+ ],
545
+ StartTime=start_time,
546
+ EndTime=end_time,
547
+ Period=3600,
548
+ Statistics=["Average"],
549
+ )
550
+
551
+ datapoints = response.get("Datapoints", [])
552
+ if datapoints:
553
+ # Find most recent with CPU > 5% (indicates active queries)
554
+ active_points = [dp for dp in datapoints if dp.get("Average", 0) > 5]
555
+ if active_points:
556
+ latest = max(active_points, key=lambda x: x["Timestamp"])
557
+ return latest["Timestamp"].isoformat()
558
+
559
+ except ClientError as e:
560
+ if "AccessDenied" not in str(e):
561
+ print(f"Error getting CloudWatch metrics for {cluster_id}: {e}")
562
+ except Exception:
563
+ pass
564
+
565
+ return None
184
566
 
567
+ def _get_last_activity(self, cluster_id: str, region: str) -> str | None:
568
+ """Get last activity timestamp using CloudTrail (fallback method)."""
185
569
  try:
186
- # Try CloudTrail to find last API call to this cluster
187
570
  cloudtrail_client = self.session.client("cloudtrail", region_name="us-east-1")
188
571
 
189
- end_time = datetime.utcnow()
572
+ end_time = datetime.now(timezone.utc)
190
573
  start_time = end_time - timedelta(days=90)
191
574
 
192
575
  try:
@@ -215,14 +598,13 @@ class RedshiftCollector:
215
598
 
216
599
  def _calculate_days_since_last_use(self, last_activity: str | None) -> int | None:
217
600
  """Calculate days since last use."""
218
- from datetime import datetime
219
-
220
601
  if not last_activity:
221
602
  return None
222
603
 
223
604
  try:
224
605
  last_used = datetime.fromisoformat(last_activity.replace("Z", "+00:00"))
225
- days = (datetime.utcnow() - last_used.replace(tzinfo=None)).days
606
+ now = datetime.now(timezone.utc)
607
+ days = (now - last_used).days
226
608
  return days
227
609
  except Exception:
228
610
  return None
@@ -230,3 +612,299 @@ class RedshiftCollector:
230
612
  def get_cost_estimate(self, asset: Asset) -> float:
231
613
  """Get cost estimate for Redshift asset."""
232
614
  return asset.cost_estimate_usd or 0.0
615
+
616
+ def _check_reservation_coverage(self, region: str, node_type: str, node_count: int) -> dict:
617
+ """Check if cluster nodes are covered by reserved nodes."""
618
+ reserved_nodes = self._reserved_nodes.get(region, [])
619
+
620
+ # Find matching reservations by node type
621
+ matching = [r for r in reserved_nodes if r.get("node_type") == node_type]
622
+ total_reserved = sum(r.get("node_count", 0) for r in matching)
623
+
624
+ if total_reserved >= node_count:
625
+ return {
626
+ "covered": True,
627
+ "reserved_count": node_count,
628
+ "on_demand_count": 0,
629
+ }
630
+ else:
631
+ return {
632
+ "covered": total_reserved > 0,
633
+ "reserved_count": total_reserved,
634
+ "on_demand_count": node_count - total_reserved,
635
+ }
636
+
637
+ def _get_wlm_configuration(self, redshift_client, cluster_id: str) -> dict:
638
+ """Get WLM (Workload Management) configuration for a cluster."""
639
+ try:
640
+ # Get cluster parameter group
641
+ clusters_resp = redshift_client.describe_clusters(ClusterIdentifier=cluster_id)
642
+ clusters = clusters_resp.get("Clusters", [])
643
+
644
+ if not clusters:
645
+ return {
646
+ "queue_count": 0,
647
+ "is_default_only": True,
648
+ "has_unlimited_queue": False,
649
+ "auto_wlm": False,
650
+ }
651
+
652
+ cluster = clusters[0]
653
+ param_groups = cluster.get("ClusterParameterGroups", [])
654
+
655
+ if not param_groups:
656
+ return {
657
+ "queue_count": 0,
658
+ "is_default_only": True,
659
+ "has_unlimited_queue": False,
660
+ "auto_wlm": False,
661
+ }
662
+
663
+ param_group_name = param_groups[0].get("ParameterGroupName", "")
664
+
665
+ # Get WLM configuration from parameter group
666
+ params_resp = redshift_client.describe_cluster_parameters(
667
+ ParameterGroupName=param_group_name
668
+ )
669
+
670
+ wlm_config = {
671
+ "queue_count": 0,
672
+ "is_default_only": True,
673
+ "has_unlimited_queue": False,
674
+ "auto_wlm": False,
675
+ }
676
+
677
+ for param in params_resp.get("Parameters", []):
678
+ param_name = param.get("ParameterName", "")
679
+ param_value = param.get("ParameterValue", "")
680
+
681
+ if param_name == "wlm_json_configuration" and param_value:
682
+ try:
683
+ import json
684
+
685
+ wlm_json = json.loads(param_value)
686
+
687
+ if isinstance(wlm_json, list):
688
+ wlm_config["queue_count"] = len(wlm_json)
689
+ wlm_config["is_default_only"] = len(wlm_json) <= 1
690
+
691
+ for queue in wlm_json:
692
+ if (
693
+ queue.get("query_concurrency", 0) == 0
694
+ or queue.get("memory_percent_to_use", 0) == 100
695
+ ):
696
+ wlm_config["has_unlimited_queue"] = True
697
+ if queue.get("auto_wlm"):
698
+ wlm_config["auto_wlm"] = True
699
+ except (json.JSONDecodeError, TypeError):
700
+ pass
701
+
702
+ return wlm_config
703
+
704
+ except ClientError as e:
705
+ if "AccessDenied" not in str(e) and "ClusterNotFound" not in str(e):
706
+ print(f"Error getting WLM config for {cluster_id}: {e}")
707
+ return {
708
+ "queue_count": 0,
709
+ "is_default_only": True,
710
+ "has_unlimited_queue": False,
711
+ "auto_wlm": False,
712
+ }
713
+
714
+ def _collect_snapshots(self) -> list[Asset]:
715
+ """Collect Redshift snapshots with cost and retention analysis."""
716
+ assets = []
717
+
718
+ regions_to_check = self.regions if self.regions else ["us-east-1"]
719
+
720
+ for region in regions_to_check:
721
+ try:
722
+ redshift_client = self.session.client("redshift", region_name=region)
723
+
724
+ # Get all snapshots (both manual and automated)
725
+ for snapshot_type in ["manual", "automated"]:
726
+ try:
727
+ paginator = redshift_client.get_paginator("describe_cluster_snapshots")
728
+
729
+ for page in paginator.paginate(SnapshotType=snapshot_type):
730
+ for snapshot in page.get("Snapshots", []):
731
+ snapshot_id = snapshot.get("SnapshotIdentifier", "")
732
+ cluster_id = snapshot.get("ClusterIdentifier", "")
733
+
734
+ # Get snapshot details
735
+ create_time = snapshot.get("SnapshotCreateTime")
736
+ snapshot_size_gb = (
737
+ snapshot.get("TotalBackupSizeInMegaBytes", 0) / 1024
738
+ )
739
+ status = snapshot.get("Status", "unknown")
740
+
741
+ # Calculate age
742
+ snapshot_age_days = None
743
+ if create_time:
744
+ snapshot_age_days = (
745
+ datetime.now(timezone.utc)
746
+ - create_time.replace(tzinfo=timezone.utc)
747
+ ).days
748
+
749
+ # Estimate storage cost (~$0.024/GB-month for Redshift snapshots)
750
+ # Note: Automated snapshots are FREE up to cluster storage size
751
+ # Only manual snapshots and storage beyond cluster size are billed
752
+ if snapshot_type == "automated":
753
+ # Automated snapshots are mostly free - only count for awareness
754
+ monthly_storage_cost = 0.0 # Free tier
755
+ else:
756
+ monthly_storage_cost = snapshot_size_gb * 0.024
757
+
758
+ # Build risk flags
759
+ risk_flags = []
760
+
761
+ # Flag old manual snapshots (potential cost waste)
762
+ if snapshot_type == "manual":
763
+ if snapshot_age_days and snapshot_age_days > 90:
764
+ risk_flags.append("old_snapshot")
765
+ if snapshot_age_days and snapshot_age_days > 365:
766
+ risk_flags.append("very_old_snapshot")
767
+
768
+ # Large snapshots
769
+ if snapshot_size_gb > 1000: # > 1TB
770
+ risk_flags.append("large_snapshot")
771
+
772
+ # Check if source cluster still exists
773
+ is_orphan = (
774
+ snapshot.get("ClusterCreateTime") is None
775
+ and snapshot_type == "manual"
776
+ )
777
+ if is_orphan:
778
+ risk_flags.append("orphan_snapshot")
779
+
780
+ # Get tags
781
+ tags = {
782
+ tag["Key"]: tag["Value"] for tag in snapshot.get("Tags", [])
783
+ }
784
+
785
+ assets.append(
786
+ Asset(
787
+ provider="aws",
788
+ asset_type="redshift_snapshot",
789
+ normalized_category=NormalizedCategory.DATA_WAREHOUSE,
790
+ service="Redshift",
791
+ region=region,
792
+ arn=snapshot.get(
793
+ "SnapshotArn",
794
+ f"arn:aws:redshift:{region}:{self._get_account_id()}:snapshot:{cluster_id}/{snapshot_id}",
795
+ ),
796
+ name=snapshot_id,
797
+ created_at=create_time.isoformat() if create_time else None,
798
+ tags=tags,
799
+ risk_flags=risk_flags,
800
+ size_bytes=int(snapshot_size_gb * 1024 * 1024 * 1024),
801
+ cost_estimate_usd=monthly_storage_cost,
802
+ usage_metrics={
803
+ "snapshot_type": snapshot_type,
804
+ "cluster_identifier": cluster_id,
805
+ "status": status,
806
+ "size_gb": round(snapshot_size_gb, 2),
807
+ "age_days": snapshot_age_days,
808
+ "encrypted": snapshot.get("Encrypted", False),
809
+ "is_orphan": is_orphan,
810
+ "retention_period": snapshot.get(
811
+ "ManualSnapshotRetentionPeriod", -1
812
+ ),
813
+ },
814
+ )
815
+ )
816
+
817
+ except ClientError as e:
818
+ if "AccessDenied" not in str(e):
819
+ print(f"Error collecting {snapshot_type} snapshots in {region}: {e}")
820
+
821
+ except ClientError as e:
822
+ print(f"Error collecting Redshift snapshots in {region}: {e}")
823
+
824
+ return assets
825
+
826
+ def _collect_reserved_nodes_as_assets(self) -> list[Asset]:
827
+ """Create assets for reserved nodes for visibility and tracking."""
828
+ assets = []
829
+
830
+ regions_to_check = self.regions if self.regions else ["us-east-1"]
831
+
832
+ for region in regions_to_check:
833
+ try:
834
+ redshift_client = self.session.client("redshift", region_name=region)
835
+ response = redshift_client.describe_reserved_nodes()
836
+
837
+ for node in response.get("ReservedNodes", []):
838
+ node_id = node.get("ReservedNodeId", "")
839
+ node_type = node.get("NodeType", "")
840
+ node_count = node.get("NodeCount", 0)
841
+ state = node.get("State", "unknown")
842
+ offering_type = node.get("OfferingType", "")
843
+ duration = node.get("Duration", 0) # seconds
844
+ start_time = node.get("StartTime")
845
+
846
+ # Calculate remaining time
847
+ remaining_days = None
848
+ is_expired = False
849
+ is_expiring_soon = False
850
+
851
+ if start_time and duration:
852
+ end_time = start_time + timedelta(seconds=duration)
853
+ remaining_days = (end_time - datetime.now(timezone.utc)).days
854
+
855
+ if remaining_days < 0:
856
+ is_expired = True
857
+ remaining_days = 0
858
+ elif remaining_days < 30:
859
+ is_expiring_soon = True
860
+
861
+ # Calculate annual cost
862
+ fixed_price = node.get("FixedPrice", 0)
863
+ recurring_charges = node.get("RecurringCharges", [])
864
+ monthly_recurring = sum(
865
+ c.get("RecurringChargeAmount", 0) for c in recurring_charges
866
+ )
867
+ annual_cost = fixed_price + (monthly_recurring * 12)
868
+
869
+ # Build risk flags
870
+ risk_flags = []
871
+ if is_expired:
872
+ risk_flags.append("reservation_expired")
873
+ if is_expiring_soon:
874
+ risk_flags.append("reservation_expiring_soon")
875
+ if state != "active":
876
+ risk_flags.append(f"reservation_{state}")
877
+
878
+ assets.append(
879
+ Asset(
880
+ provider="aws",
881
+ asset_type="redshift_reserved_node",
882
+ normalized_category=NormalizedCategory.BILLING,
883
+ service="Redshift",
884
+ region=region,
885
+ arn=f"arn:aws:redshift:{region}:{self._get_account_id()}:reserved-node:{node_id}",
886
+ name=f"{node_type} x{node_count} ({offering_type})",
887
+ created_at=start_time.isoformat() if start_time else None,
888
+ risk_flags=risk_flags,
889
+ cost_estimate_usd=annual_cost / 12, # Monthly equivalent
890
+ usage_metrics={
891
+ "reserved_node_id": node_id,
892
+ "node_type": node_type,
893
+ "node_count": node_count,
894
+ "state": state,
895
+ "offering_type": offering_type,
896
+ "duration_years": duration / (365 * 24 * 3600) if duration else 0,
897
+ "remaining_days": remaining_days,
898
+ "is_expiring_soon": is_expiring_soon,
899
+ "fixed_price": fixed_price,
900
+ "monthly_recurring": monthly_recurring,
901
+ "annual_cost": annual_cost,
902
+ },
903
+ )
904
+ )
905
+
906
+ except ClientError as e:
907
+ if "AccessDenied" not in str(e):
908
+ print(f"Error collecting reserved nodes in {region}: {e}")
909
+
910
+ return assets