argus-cloud-optimizer 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. adapters/__init__.py +0 -0
  2. adapters/aws/__init__.py +0 -0
  3. adapters/aws/adapter.py +85 -0
  4. adapters/aws/auth.py +57 -0
  5. adapters/aws/cloudtrail.py +83 -0
  6. adapters/aws/cloudwatch.py +732 -0
  7. adapters/aws/config.py +9 -0
  8. adapters/aws/cost_explorer.py +116 -0
  9. adapters/aws/resource_explorer.py +186 -0
  10. adapters/aws/retry.py +55 -0
  11. adapters/azure/__init__.py +0 -0
  12. adapters/azure/activity_log.py +159 -0
  13. adapters/azure/adapter.py +117 -0
  14. adapters/azure/cost_management.py +125 -0
  15. adapters/azure/monitor.py +311 -0
  16. adapters/azure/resource_graph.py +113 -0
  17. adapters/azure/retry.py +57 -0
  18. adapters/base.py +105 -0
  19. adapters/gcp/__init__.py +0 -0
  20. adapters/gcp/adapter.py +86 -0
  21. adapters/gcp/asset_inventory.py +116 -0
  22. adapters/gcp/billing.py +118 -0
  23. adapters/gcp/cloud_logging.py +93 -0
  24. adapters/gcp/cloud_monitoring.py +276 -0
  25. adapters/gcp/retry.py +46 -0
  26. ai/__init__.py +0 -0
  27. ai/anthropic.py +174 -0
  28. ai/azure_openai.py +241 -0
  29. ai/base.py +78 -0
  30. ai/bedrock.py +169 -0
  31. ai/vertexai.py +234 -0
  32. argus_cloud_optimizer-0.2.0.dist-info/METADATA +433 -0
  33. argus_cloud_optimizer-0.2.0.dist-info/RECORD +62 -0
  34. argus_cloud_optimizer-0.2.0.dist-info/WHEEL +5 -0
  35. argus_cloud_optimizer-0.2.0.dist-info/entry_points.txt +2 -0
  36. argus_cloud_optimizer-0.2.0.dist-info/licenses/LICENSE +21 -0
  37. argus_cloud_optimizer-0.2.0.dist-info/top_level.txt +4 -0
  38. core/__init__.py +0 -0
  39. core/__version__.py +1 -0
  40. core/agent/__init__.py +0 -0
  41. core/agent/loop.py +390 -0
  42. core/agent/prompts.py +317 -0
  43. core/config.py +235 -0
  44. core/log.py +69 -0
  45. core/models/__init__.py +0 -0
  46. core/models/finding.py +76 -0
  47. core/py.typed +0 -0
  48. core/reports/__init__.py +0 -0
  49. core/reports/comparison.py +49 -0
  50. core/reports/delivery.py +323 -0
  51. core/reports/export.py +111 -0
  52. core/reports/generator.py +168 -0
  53. core/reports/html.py +286 -0
  54. core/reports/multi_cloud.py +162 -0
  55. core/secrets.py +145 -0
  56. core/token_tracker.py +97 -0
  57. core/validation.py +214 -0
  58. entrypoints/__init__.py +0 -0
  59. entrypoints/aws_lambda.py +299 -0
  60. entrypoints/azure_function.py +257 -0
  61. entrypoints/cli.py +156 -0
  62. entrypoints/gcp_cloudrun.py +209 -0
@@ -0,0 +1,732 @@
1
+ from __future__ import annotations
2
+
3
+ import os as _os
4
+ from datetime import datetime, timedelta, timezone
5
+ from typing import Any
6
+
7
+ import boto3
8
+ import structlog
9
+ from botocore.exceptions import ClientError
10
+
11
+ from adapters.aws.config import BOTO_TIMEOUT_CONFIG
12
+ from adapters.aws.retry import retry_on_transient
13
+ from adapters.base import MetricSummary
14
+
15
+ logger = structlog.get_logger(__name__)
16
+
17
# Hand-curated CloudWatch metric definitions, keyed by CloudFormation-style
# resource type. Each entry is a 4-tuple:
#
#   (MetricName, Namespace, Stat, CloudWatch Dimension Key)
#
# Stat is "Average" for utilisation metrics, "Sum" for throughput/count metrics.
# get_metrics() turns every tuple into one GetMetricData query that carries
# exactly ONE dimension: {Name: <Dimension Key>, Value: _dimension_value(arn)}.
# Resource types missing from this table fall back to _discover_metrics().
#
# NOTE(review): some services may publish these metrics only together with
# additional dimensions; a single-dimension query would then return no data.
# Confirm per service against the CloudWatch metric reference.
_METRICS: dict[str, list[tuple[str, str, str, str]]] = {
    "AWS::EC2::Instance": [
        ("CPUUtilization", "AWS/EC2", "Average", "InstanceId"),
        ("NetworkOut", "AWS/EC2", "Sum", "InstanceId"),
        ("NetworkIn", "AWS/EC2", "Sum", "InstanceId"),
    ],
    "AWS::RDS::DBInstance": [
        ("CPUUtilization", "AWS/RDS", "Average", "DBInstanceIdentifier"),
        ("DatabaseConnections", "AWS/RDS", "Average", "DBInstanceIdentifier"),
        ("NetworkReceiveThroughput", "AWS/RDS", "Sum", "DBInstanceIdentifier"),
    ],
    "AWS::EC2::NatGateway": [
        ("BytesOutToDestination", "AWS/NatGateway", "Sum", "NatGatewayId"),
        ("BytesInFromDestination", "AWS/NatGateway", "Sum", "NatGatewayId"),
        ("PacketsOutToDestination", "AWS/NatGateway", "Sum", "NatGatewayId"),
    ],
    "AWS::ElasticLoadBalancingV2::LoadBalancer": [
        ("RequestCount", "AWS/ApplicationELB", "Sum", "LoadBalancer"),
        ("ActiveConnectionCount", "AWS/ApplicationELB", "Sum", "LoadBalancer"),
        ("TargetResponseTime", "AWS/ApplicationELB", "Average", "LoadBalancer"),
    ],
    "AWS::ElasticLoadBalancing::LoadBalancer": [
        ("RequestCount", "AWS/ELB", "Sum", "LoadBalancerName"),
        ("HealthyHostCount", "AWS/ELB", "Average", "LoadBalancerName"),
        ("UnHealthyHostCount", "AWS/ELB", "Average", "LoadBalancerName"),
    ],
    "AWS::Lambda::Function": [
        ("Invocations", "AWS/Lambda", "Sum", "FunctionName"),
        ("Duration", "AWS/Lambda", "Average", "FunctionName"),
        ("Errors", "AWS/Lambda", "Sum", "FunctionName"),
    ],
    "AWS::EC2::Volume": [
        ("VolumeReadOps", "AWS/EBS", "Sum", "VolumeId"),
        ("VolumeWriteOps", "AWS/EBS", "Sum", "VolumeId"),
        ("VolumeReadBytes", "AWS/EBS", "Sum", "VolumeId"),
    ],
    "AWS::DynamoDB::Table": [
        ("ConsumedReadCapacityUnits", "AWS/DynamoDB", "Sum", "TableName"),
        ("ConsumedWriteCapacityUnits", "AWS/DynamoDB", "Sum", "TableName"),
        ("SuccessfulRequestLatency", "AWS/DynamoDB", "Average", "TableName"),
    ],
    "AWS::SQS::Queue": [
        ("NumberOfMessagesSent", "AWS/SQS", "Sum", "QueueName"),
        ("NumberOfMessagesReceived", "AWS/SQS", "Sum", "QueueName"),
        ("ApproximateNumberOfMessagesVisible", "AWS/SQS", "Average", "QueueName"),
    ],
    "AWS::ElastiCache::CacheCluster": [
        ("CPUUtilization", "AWS/ElastiCache", "Average", "CacheClusterId"),
        ("CurrConnections", "AWS/ElastiCache", "Average", "CacheClusterId"),
        ("CacheHits", "AWS/ElastiCache", "Sum", "CacheClusterId"),
    ],
    "AWS::Redshift::Cluster": [
        ("CPUUtilization", "AWS/Redshift", "Average", "ClusterIdentifier"),
        ("DatabaseConnections", "AWS/Redshift", "Average", "ClusterIdentifier"),
        ("ReadIOPS", "AWS/Redshift", "Average", "ClusterIdentifier"),
    ],
    "AWS::OpenSearchService::Domain": [
        ("CPUUtilization", "AWS/ES", "Average", "DomainName"),
        ("SearchableDocuments", "AWS/ES", "Average", "DomainName"),
        ("IndexingRate", "AWS/ES", "Average", "DomainName"),
    ],
    "AWS::ECS::Service": [
        ("CPUUtilization", "AWS/ECS", "Average", "ServiceName"),
        ("MemoryUtilization", "AWS/ECS", "Average", "ServiceName"),
    ],
    "AWS::EKS::Cluster": [
        # Requires Container Insights enabled on the cluster.
        ("cluster_node_count", "ContainerInsights", "Average", "ClusterName"),
        ("node_cpu_utilization", "ContainerInsights", "Average", "ClusterName"),
        ("node_memory_utilization", "ContainerInsights", "Average", "ClusterName"),
    ],
    "AWS::Kinesis::Stream": [
        ("GetRecords.Records", "AWS/Kinesis", "Sum", "StreamName"),
        ("IncomingRecords", "AWS/Kinesis", "Sum", "StreamName"),
        ("PutRecord.Success", "AWS/Kinesis", "Sum", "StreamName"),
    ],
    "AWS::SNS::Topic": [
        ("NumberOfNotificationsDelivered", "AWS/SNS", "Sum", "TopicName"),
        ("NumberOfMessagesPublished", "AWS/SNS", "Sum", "TopicName"),
        ("NumberOfNotificationsFailed", "AWS/SNS", "Sum", "TopicName"),
    ],
    "AWS::ApiGateway::RestApi": [
        ("Count", "AWS/ApiGateway", "Sum", "ApiName"),
        ("4XXError", "AWS/ApiGateway", "Sum", "ApiName"),
        ("5XXError", "AWS/ApiGateway", "Sum", "ApiName"),
    ],
    "AWS::ApiGateway::Stage": [
        ("Count", "AWS/ApiGateway", "Sum", "Stage"),
        ("4XXError", "AWS/ApiGateway", "Sum", "Stage"),
        ("Latency", "AWS/ApiGateway", "Average", "Stage"),
    ],
    "AWS::CloudFront::Distribution": [
        ("Requests", "AWS/CloudFront", "Sum", "DistributionId"),
        ("BytesDownloaded", "AWS/CloudFront", "Sum", "DistributionId"),
        ("4xxErrorRate", "AWS/CloudFront", "Average", "DistributionId"),
    ],
    "AWS::StepFunctions::StateMachine": [
        ("ExecutionsStarted", "AWS/States", "Sum", "StateMachineArn"),
        ("ExecutionsSucceeded", "AWS/States", "Sum", "StateMachineArn"),
        ("ExecutionsFailed", "AWS/States", "Sum", "StateMachineArn"),
    ],
    "AWS::Glue::Job": [
        ("glue.driver.aggregate.bytesRead", "Glue", "Sum", "JobName"),
        ("glue.driver.aggregate.elapsedTime", "Glue", "Average", "JobName"),
    ],
    "AWS::MSK::Cluster": [
        ("BytesInPerSec", "AWS/Kafka", "Sum", "Cluster Name"),
        ("BytesOutPerSec", "AWS/Kafka", "Sum", "Cluster Name"),
        ("KafkaDataLogsDiskUsed", "AWS/Kafka", "Average", "Cluster Name"),
    ],
    "AWS::SageMaker::Endpoint": [
        ("Invocations", "AWS/SageMaker", "Sum", "EndpointName"),
        ("ModelLatency", "AWS/SageMaker", "Average", "EndpointName"),
        ("CPUUtilization", "AWS/SageMaker", "Average", "EndpointName"),
    ],
    # ── Aurora / RDS Cluster ──────────────────────────────────────────────────
    "AWS::RDS::DBCluster": [
        ("CPUUtilization", "AWS/RDS", "Average", "DBClusterIdentifier"),
        ("DatabaseConnections", "AWS/RDS", "Average", "DBClusterIdentifier"),
        ("AuroraReplicaLag", "AWS/RDS", "Average", "DBClusterIdentifier"),
    ],
    # ── ElastiCache Replication Group ─────────────────────────────────────────
    "AWS::ElastiCache::ReplicationGroup": [
        ("CurrConnections", "AWS/ElastiCache", "Average", "ReplicationGroupId"),
        ("CacheHitRate", "AWS/ElastiCache", "Average", "ReplicationGroupId"),
        ("ReplicationLag", "AWS/ElastiCache", "Average", "ReplicationGroupId"),
    ],
    # ── EMR Cluster ───────────────────────────────────────────────────────────
    "AWS::EMR::Cluster": [
        (
            "YARNMemoryAvailablePercentage",
            "AWS/ElasticMapReduce",
            "Average",
            "JobFlowId",
        ),
        ("ContainerPendingRatio", "AWS/ElasticMapReduce", "Average", "JobFlowId"),
        ("AppsRunning", "AWS/ElasticMapReduce", "Average", "JobFlowId"),
    ],
    # ── DMS Replication Instance ──────────────────────────────────────────────
    "AWS::DMS::ReplicationInstance": [
        ("CPUUtilization", "AWS/DMS", "Average", "ReplicationInstanceIdentifier"),
        ("FreeableMemory", "AWS/DMS", "Average", "ReplicationInstanceIdentifier"),
        ("CDCLatencySource", "AWS/DMS", "Average", "ReplicationInstanceIdentifier"),
    ],
    # ── Neptune Cluster ───────────────────────────────────────────────────────
    "AWS::Neptune::DBCluster": [
        ("CPUUtilization", "AWS/Neptune", "Average", "DBClusterIdentifier"),
        ("DatabaseConnections", "AWS/Neptune", "Average", "DBClusterIdentifier"),
        ("BufferCacheHitRatio", "AWS/Neptune", "Average", "DBClusterIdentifier"),
    ],
    # ── DocumentDB Cluster ────────────────────────────────────────────────────
    "AWS::DocDB::DBCluster": [
        ("CPUUtilization", "AWS/DocDB", "Average", "DBClusterIdentifier"),
        ("DatabaseConnections", "AWS/DocDB", "Average", "DBClusterIdentifier"),
        ("BufferCacheHitRatio", "AWS/DocDB", "Average", "DBClusterIdentifier"),
    ],
    # ── WorkSpaces ────────────────────────────────────────────────────────────
    "AWS::WorkSpaces::Workspace": [
        ("Available", "AWS/WorkSpaces", "Average", "WorkspaceId"),
        ("InSessionLatency", "AWS/WorkSpaces", "Average", "WorkspaceId"),
        ("SessionLaunchTime", "AWS/WorkSpaces", "Average", "WorkspaceId"),
    ],
    # ── Kinesis Firehose ──────────────────────────────────────────────────────
    "AWS::KinesisFirehose::DeliveryStream": [
        ("IncomingBytes", "AWS/Firehose", "Sum", "DeliveryStreamName"),
        ("IncomingRecords", "AWS/Firehose", "Sum", "DeliveryStreamName"),
        ("DeliveryToS3.Success", "AWS/Firehose", "Sum", "DeliveryStreamName"),
    ],
    # ── AppSync GraphQL API ───────────────────────────────────────────────────
    "AWS::AppSync::GraphQLApi": [
        ("4XXError", "AWS/AppSync", "Sum", "GraphQLAPIId"),
        ("5XXError", "AWS/AppSync", "Sum", "GraphQLAPIId"),
        ("Latency", "AWS/AppSync", "Average", "GraphQLAPIId"),
    ],
    # ── EventBridge Rule ──────────────────────────────────────────────────────
    "AWS::Events::Rule": [
        ("TriggeredRules", "AWS/Events", "Sum", "RuleName"),
        ("Invocations", "AWS/Events", "Sum", "RuleName"),
        ("FailedInvocations", "AWS/Events", "Sum", "RuleName"),
    ],
    # ── Elastic Beanstalk Environment ─────────────────────────────────────────
    "AWS::ElasticBeanstalk::Environment": [
        ("EnvironmentHealth", "AWS/ElasticBeanstalk", "Average", "EnvironmentName"),
        ("ApplicationRequestsTotal", "AWS/ElasticBeanstalk", "Sum", "EnvironmentName"),
        ("CPUUtilization", "AWS/ElasticBeanstalk", "Average", "EnvironmentName"),
    ],
    # ── CodeBuild Project ─────────────────────────────────────────────────────
    "AWS::CodeBuild::Project": [
        ("Builds", "AWS/CodeBuild", "Sum", "ProjectName"),
        ("SucceededBuilds", "AWS/CodeBuild", "Sum", "ProjectName"),
        ("Duration", "AWS/CodeBuild", "Average", "ProjectName"),
    ],
    # ── Transfer Family Server ────────────────────────────────────────────────
    "AWS::Transfer::Server": [
        ("FilesIn", "AWS/Transfer", "Sum", "ServerId"),
        ("FilesOut", "AWS/Transfer", "Sum", "ServerId"),
        ("BytesIn", "AWS/Transfer", "Sum", "ServerId"),
    ],
    # ── WAFv2 WebACL ──────────────────────────────────────────────────────────
    "AWS::WAFv2::WebACL": [
        ("AllowedRequests", "AWS/WAFV2", "Sum", "WebACL"),
        ("BlockedRequests", "AWS/WAFV2", "Sum", "WebACL"),
        ("CountedRequests", "AWS/WAFV2", "Sum", "WebACL"),
    ],
    # ── S3 Bucket (requires per-bucket request metrics enabled) ───────────────
    "AWS::S3::Bucket": [
        ("NumberOfObjects", "AWS/S3", "Average", "BucketName"),
        ("BucketSizeBytes", "AWS/S3", "Average", "BucketName"),
        ("AllRequests", "AWS/S3", "Sum", "BucketName"),
    ],
    # ── Cognito User Pool ─────────────────────────────────────────────────────
    "AWS::Cognito::UserPool": [
        ("SignInSuccesses", "AWS/Cognito", "Sum", "UserPool"),
        ("TokenRefreshSuccesses", "AWS/Cognito", "Sum", "UserPool"),
        ("SignUpSuccesses", "AWS/Cognito", "Sum", "UserPool"),
    ],
    # ── IoT Core ──────────────────────────────────────────────────────────────
    # NOTE(review): the dimension key here is "Protocol", while the value sent
    # is derived from the resource ARN — confirm this pairing is intended.
    "AWS::IoT::Thing": [
        ("PublishIn.Success", "AWS/IoT", "Sum", "Protocol"),
        ("PublishOut.Success", "AWS/IoT", "Sum", "Protocol"),
        ("Connect.Success", "AWS/IoT", "Sum", "Protocol"),
    ],
    # ── MediaLive Channel ─────────────────────────────────────────────────────
    "AWS::MediaLive::Channel": [
        ("ActiveOutputs", "AWS/MediaLive", "Average", "ChannelId"),
        ("DroppedFrames", "AWS/MediaLive", "Sum", "ChannelId"),
        ("NetworkIn", "AWS/MediaLive", "Sum", "ChannelId"),
    ],
    # ── Batch Job Queue ───────────────────────────────────────────────────────
    "AWS::Batch::JobQueue": [
        ("PendingJobCount", "AWS/Batch", "Average", "JobQueueName"),
        ("RunnableJobCount", "AWS/Batch", "Average", "JobQueueName"),
        ("RunningJobCount", "AWS/Batch", "Average", "JobQueueName"),
    ],
    # ── Route 53 Hosted Zone ──────────────────────────────────────────────────
    "AWS::Route53::HostedZone": [
        ("DNSQueries", "AWS/Route53", "Sum", "HostedZoneId"),
    ],
}
258
+
259
# Query period in seconds: one day, so CloudWatch returns one data point per day.
_PERIOD_SECONDS = 24 * 60 * 60

# Default number of days of history requested per metric query.
# 90 days covers quarterly usage patterns and matches the CloudTrail lookback
# window, so both signals share the same horizon. At daily granularity
# CloudWatch retains data for 455 days, so 90 days is safe.
# Set the METRICS_LOOKBACK_DAYS env var to override (e.g. 14 for faster/cheaper
# dev runs); it is read once, when this module is imported.
DEFAULT_METRICS_DAYS: int = int(_os.getenv("METRICS_LOOKBACK_DAYS", "90"))
266
+
267
+
268
def get_metrics(
    session: boto3.Session,
    resource_id: str,
    resource_type: str,
    days: int = DEFAULT_METRICS_DAYS,
) -> MetricSummary:
    """
    Summarise CloudWatch usage metrics for one resource over the last ``days`` days.

    Metric definitions come from the hand-coded ``_METRICS`` table; resource
    types without an entry fall back to ``_discover_metrics`` (ListMetrics
    auto-discovery) so the AI still gets signal data. All definitions are sent
    in a single batched GetMetricData call at daily granularity, and each
    series is reduced to one averaged/summed value by ``_parse_results``.

    After the metrics are parsed, ``_enrich_instance_details`` adds current
    size details (instance_type, memory_mb, vcpus, ...) to the metrics dict
    for resource types where right-sizing is actionable, so the AI can
    recommend a specific smaller tier rather than a generic "consider
    downsizing".

    Args:
        session: boto3 session used to create the CloudWatch client.
        resource_id: resource ARN; the region and the dimension value are
            both derived from it.
        resource_type: CloudFormation-style type, e.g. "AWS::EC2::Instance".
        days: lookback window in days.

    Returns:
        A MetricSummary. When no metric definitions are available, or the
        GetMetricData call raises ClientError (logged as a warning), the
        summary has empty ``metrics`` and ``has_data=False``.
    """

    def _no_data() -> MetricSummary:
        # Shared "nothing to report" result for unknown types and API failures.
        return MetricSummary(
            resource_id=resource_id,
            resource_type=resource_type,
            period_days=days,
            metrics={},
            has_data=False,
        )

    metric_defs = _METRICS.get(resource_type) or _discover_metrics(
        session, resource_id, resource_type
    )
    if not metric_defs:
        return _no_data()

    region = _region_from_arn(resource_id)
    dim_value = _dimension_value(resource_id, resource_type)
    cloudwatch = session.client(
        "cloudwatch", region_name=region, config=BOTO_TIMEOUT_CONFIG
    )

    window_end = datetime.now(tz=timezone.utc)
    window_start = window_end - timedelta(days=days)

    # One query per definition; the Id encodes the definition's position.
    queries: list[Any] = []
    for index, (metric_name, namespace, stat, dim_key) in enumerate(metric_defs):
        metric = {
            "Namespace": namespace,
            "MetricName": metric_name,
            "Dimensions": [{"Name": dim_key, "Value": dim_value}],
        }
        queries.append(
            {
                "Id": f"m{index}",
                "MetricStat": {
                    "Metric": metric,
                    "Period": _PERIOD_SECONDS,
                    "Stat": stat,
                },
                "ReturnData": True,
            }
        )

    try:
        response = retry_on_transient(
            cloudwatch.get_metric_data,
            MetricDataQueries=queries,
            StartTime=window_start,
            EndTime=window_end,
        )
    except ClientError as exc:
        logger.warning(
            "cloudwatch_get_metric_data_failed",
            extra={"resource_id": resource_id, "error": str(exc)},
        )
        return _no_data()

    summary = _parse_results(
        results=response.get("MetricDataResults", []),
        metric_defs=metric_defs,
        resource_id=resource_id,
        resource_type=resource_type,
        days=days,
    )

    # Best-effort: the enrichment mutates summary.metrics in place and logs
    # (rather than raises) on failure, so the summary is returned either way.
    _enrich_instance_details(session, resource_id, resource_type, summary.metrics)
    return summary
356
+
357
+
358
+ def _enrich_instance_details(
359
+ session: boto3.Session,
360
+ resource_id: str,
361
+ resource_type: str,
362
+ metrics: dict[str, Any],
363
+ ) -> None:
364
+ """
365
+ Inject current instance size metadata into the metrics dict (in-place).
366
+
367
+ This enriches the AI's context so it can recommend a *specific* right-sizing
368
+ target (e.g. "downsize from db.r5.4xlarge → db.r5.xlarge") rather than a
369
+ vague "consider downsizing". Failures are silently ignored — metrics are
370
+ still valid without this data.
371
+
372
+ Adds keys such as:
373
+ instance_type — e.g. "t3.medium", "db.r5.4xlarge", "cache.m6g.large"
374
+ memory_mb — Lambda allocated memory in MB
375
+ vcpus — EC2 vCPU count (from InstanceType metadata)
376
+ node_type — Redshift node type
377
+ instance_count — Redshift / OpenSearch cluster node count
378
+ """
379
+ region = _region_from_arn(resource_id)
380
+ resp: Any # declared here so mypy doesn't infer a narrow type from first assignment
381
+ try:
382
+ match resource_type:
383
+ case "AWS::EC2::Instance":
384
+ ec2 = session.client(
385
+ "ec2", region_name=region, config=BOTO_TIMEOUT_CONFIG
386
+ )
387
+ instance_id = resource_id.split("/")[-1].split(":")[-1]
388
+ resp = ec2.describe_instances(InstanceIds=[instance_id])
389
+ reservations = resp.get("Reservations", [])
390
+ if reservations:
391
+ inst = reservations[0]["Instances"][0]
392
+ metrics["instance_type"] = inst.get("InstanceType")
393
+ # vCPU count helps the AI understand the scale of the machine
394
+ cpu_opts = inst.get("CpuOptions", {})
395
+ if cpu_opts:
396
+ metrics["vcpus"] = cpu_opts.get("CoreCount", 1) * cpu_opts.get(
397
+ "ThreadsPerCore", 1
398
+ )
399
+
400
+ case "AWS::RDS::DBInstance":
401
+ rds = session.client(
402
+ "rds", region_name=region, config=BOTO_TIMEOUT_CONFIG
403
+ )
404
+ db_id = resource_id.split(":")[-1]
405
+ resp = rds.describe_db_instances(DBInstanceIdentifier=db_id)
406
+ instances = resp.get("DBInstances", [])
407
+ if instances:
408
+ db = instances[0]
409
+ metrics["instance_type"] = db.get("DBInstanceClass")
410
+ metrics["engine"] = (
411
+ f"{db.get('Engine')} {db.get('EngineVersion', '')}".strip()
412
+ )
413
+ metrics["storage_gb"] = db.get("AllocatedStorage")
414
+ metrics["multi_az"] = db.get("MultiAZ", False)
415
+
416
+ case "AWS::RDS::DBCluster":
417
+ rds = session.client(
418
+ "rds", region_name=region, config=BOTO_TIMEOUT_CONFIG
419
+ )
420
+ cluster_id = resource_id.split(":")[-1]
421
+ resp = rds.describe_db_clusters(DBClusterIdentifier=cluster_id)
422
+ clusters = resp.get("DBClusters", [])
423
+ if clusters:
424
+ cluster = clusters[0]
425
+ engine = cluster.get("Engine", "")
426
+ version = cluster.get("EngineVersion", "")
427
+ metrics["engine"] = f"{engine} {version}".strip()
428
+ metrics["instance_count"] = len(cluster.get("DBClusterMembers", []))
429
+ # Fetch instance class from the writer instance
430
+ members = cluster.get("DBClusterMembers", [])
431
+ writer = next(
432
+ (m for m in members if m.get("IsClusterWriter")), None
433
+ )
434
+ if writer:
435
+ inst_resp = rds.describe_db_instances(
436
+ DBInstanceIdentifier=writer["DBInstanceIdentifier"]
437
+ )
438
+ inst_list = inst_resp.get("DBInstances", [])
439
+ if inst_list:
440
+ metrics["instance_type"] = inst_list[0].get(
441
+ "DBInstanceClass"
442
+ )
443
+
444
+ case "AWS::ElastiCache::CacheCluster":
445
+ ec = session.client(
446
+ "elasticache", region_name=region, config=BOTO_TIMEOUT_CONFIG
447
+ )
448
+ cluster_id = resource_id.split(":")[-1]
449
+ resp = ec.describe_cache_clusters(CacheClusterId=cluster_id)
450
+ clusters = resp.get("CacheClusters", [])
451
+ if clusters:
452
+ c = clusters[0]
453
+ metrics["instance_type"] = c.get("CacheNodeType")
454
+ metrics["num_cache_nodes"] = c.get("NumCacheNodes")
455
+ metrics["engine"] = (
456
+ f"{c.get('Engine')} {c.get('EngineVersion', '')}".strip()
457
+ )
458
+
459
+ case "AWS::ElastiCache::ReplicationGroup":
460
+ ec = session.client(
461
+ "elasticache", region_name=region, config=BOTO_TIMEOUT_CONFIG
462
+ )
463
+ rg_id = resource_id.split(":")[-1]
464
+ resp = ec.describe_replication_groups(ReplicationGroupId=rg_id)
465
+ groups = resp.get("ReplicationGroups", [])
466
+ if groups:
467
+ rg = groups[0]
468
+ metrics["instance_type"] = rg.get("CacheNodeType")
469
+ metrics["node_count"] = sum(
470
+ len(ng.get("NodeGroupMembers", []))
471
+ for ng in rg.get("NodeGroups", [])
472
+ )
473
+
474
+ case "AWS::Redshift::Cluster":
475
+ rs = session.client(
476
+ "redshift", region_name=region, config=BOTO_TIMEOUT_CONFIG
477
+ )
478
+ cluster_id = resource_id.split(":")[-1]
479
+ resp = rs.describe_clusters(ClusterIdentifier=cluster_id)
480
+ clusters = resp.get("Clusters", [])
481
+ if clusters:
482
+ c = clusters[0]
483
+ metrics["instance_type"] = c.get("NodeType")
484
+ metrics["instance_count"] = c.get("NumberOfNodes")
485
+
486
+ case "AWS::OpenSearchService::Domain":
487
+ oss = session.client(
488
+ "opensearch", region_name=region, config=BOTO_TIMEOUT_CONFIG
489
+ )
490
+ domain_name = resource_id.split("/")[-1]
491
+ resp = oss.describe_domain(DomainName=domain_name)
492
+ config = resp.get("DomainStatus", {}).get("ClusterConfig", {})
493
+ if config:
494
+ metrics["instance_type"] = config.get("InstanceType")
495
+ metrics["instance_count"] = config.get("InstanceCount")
496
+ metrics["dedicated_master"] = config.get(
497
+ "DedicatedMasterEnabled", False
498
+ )
499
+
500
+ case "AWS::Lambda::Function":
501
+ lam = session.client(
502
+ "lambda", region_name=region, config=BOTO_TIMEOUT_CONFIG
503
+ )
504
+ func_name = resource_id.split(":")[-1]
505
+ resp = lam.get_function_configuration(FunctionName=func_name)
506
+ metrics["memory_mb"] = resp.get("MemorySize")
507
+ metrics["ephemeral_storage_mb"] = resp.get("EphemeralStorage", {}).get(
508
+ "Size"
509
+ )
510
+ metrics["runtime"] = resp.get("Runtime")
511
+
512
+ case "AWS::DMS::ReplicationInstance":
513
+ dms = session.client(
514
+ "dms", region_name=region, config=BOTO_TIMEOUT_CONFIG
515
+ )
516
+ # DMS uses the ARN as the filter
517
+ resp = dms.describe_replication_instances(
518
+ Filters=[
519
+ {"Name": "replication-instance-arn", "Values": [resource_id]}
520
+ ]
521
+ )
522
+ instances = resp.get("ReplicationInstances", [])
523
+ if instances:
524
+ metrics["instance_type"] = instances[0].get(
525
+ "ReplicationInstanceClass"
526
+ )
527
+ metrics["storage_gb"] = instances[0].get("AllocatedStorage")
528
+
529
+ except ClientError as exc:
530
+ # Best-effort — missing size data doesn't invalidate the metrics
531
+ logger.debug(
532
+ "instance_details_fetch_failed",
533
+ extra={"resource_id": resource_id, "error": str(exc)},
534
+ )
535
+ except Exception as exc: # noqa: BLE001
536
+ logger.debug(
537
+ "instance_details_unexpected_error",
538
+ extra={"resource_id": resource_id, "error": str(exc)},
539
+ )
540
+
541
+
542
def _parse_results(
    results: list[Any],
    metric_defs: list[tuple[str, str, str, str]],
    resource_id: str,
    resource_type: str,
    days: int,
) -> MetricSummary:
    """
    Collapse GetMetricData results into one value per metric.

    Results are paired with their definition by query Id when every result
    carries an Id of the form ``m<index>`` (the convention get_metrics uses
    when building its queries), so the outcome does not depend on the order
    in which CloudWatch returns the series. If the results are not tagged
    that way, they are paired with ``metric_defs`` by position, stopping at
    the shorter of the two sequences.

    Args:
        results: the ``MetricDataResults`` list from a GetMetricData response.
        metric_defs: (MetricName, Namespace, Stat, DimensionKey) tuples, in
            the order the queries were issued.
        resource_id: passed through to the summary.
        resource_type: passed through to the summary.
        days: lookback window, stored as ``period_days``.

    Returns:
        A MetricSummary whose ``metrics`` maps each metric name to:
          * ``None`` when the series has no values,
          * the mean of the values, rounded to 4 places, for stat "Average",
          * the sum of the values, rounded to 2 places, for any other stat.
        ``has_data`` is True when at least one series had values. A definition
        with no matching result is left out of ``metrics`` entirely.
    """
    expected_ids = {f"m{i}" for i in range(len(metric_defs))}
    tagged = bool(results) and all(r.get("Id") in expected_ids for r in results)

    pairs: list[tuple[Any, tuple[str, str, str, str]]]
    if tagged:
        by_id = {r["Id"]: r for r in results}
        pairs = [
            (by_id[f"m{i}"], definition)
            for i, definition in enumerate(metric_defs)
            if f"m{i}" in by_id
        ]
    else:
        # Untagged results: fall back to positional pairing.
        pairs = list(zip(results, metric_defs, strict=False))

    metrics: dict[str, Any] = {}
    has_data = False

    for result, (metric_name, _, stat, _) in pairs:
        values: list[float] = result.get("Values", [])
        if not values:
            metrics[metric_name] = None
            continue
        has_data = True
        if stat == "Average":
            # Mean of the per-period (daily) averages.
            metrics[metric_name] = round(sum(values) / len(values), 4)
        else:
            metrics[metric_name] = round(sum(values), 2)

    return MetricSummary(
        resource_id=resource_id,
        resource_type=resource_type,
        period_days=days,
        metrics=metrics,
        has_data=has_data,
    )
570
+
571
+
572
_FALLBACK_METRIC_LIMIT = 5  # max metrics to auto-discover per unknown resource


def _discover_metrics(
    session: boto3.Session,
    resource_id: str,
    resource_type: str,
) -> list[tuple[str, str, str, str]]:
    """
    Auto-discover metric definitions for a resource type not in _METRICS.

    Pages through CloudWatch ListMetrics for the resource's region (no
    server-side filter is applied) and keeps metrics that have a dimension
    whose value equals either the full ARN or the value returned by
    ``_dimension_value``. Discovery stops as soon as
    ``_FALLBACK_METRIC_LIMIT`` distinct definitions have been collected.

    The same metric can be listed several times (once per dimension
    combination it is published under); identical definitions are recorded
    only once so duplicates do not use up the limit.

    Args:
        session: boto3 session used to create the CloudWatch client.
        resource_id: resource ARN.
        resource_type: CloudFormation-style resource type.

    Returns:
        Up to ``_FALLBACK_METRIC_LIMIT`` unique
        (MetricName, Namespace, Stat, DimensionKey) tuples, in discovery
        order. If ListMetrics raises ClientError, a warning is logged and
        whatever was collected so far (possibly nothing) is returned.
    """
    region = _region_from_arn(resource_id)
    client = session.client(
        "cloudwatch", region_name=region, config=BOTO_TIMEOUT_CONFIG
    )

    # Try to find metrics that reference this resource by ARN or last-segment name.
    dim_value = _dimension_value(resource_id, resource_type)
    match_values = (resource_id, dim_value)

    # Metric names containing one of these are treated as counters (Sum).
    sum_keywords = ("count", "bytes", "records", "invocations", "requests")

    discovered: list[tuple[str, str, str, str]] = []
    seen: set[tuple[str, str, str, str]] = set()

    try:
        paginator = client.get_paginator("list_metrics")
        for page in paginator.paginate():
            for m in page.get("Metrics", []):
                for dim in m.get("Dimensions", []):
                    if dim.get("Value") not in match_values:
                        continue
                    metric_name: str = m["MetricName"]
                    namespace: str = m["Namespace"]
                    dim_key: str = dim["Name"]
                    # Use Sum for count/bytes-sounding names, Average otherwise.
                    lowered = metric_name.lower()
                    stat = (
                        "Sum"
                        if any(kw in lowered for kw in sum_keywords)
                        else "Average"
                    )
                    definition = (metric_name, namespace, stat, dim_key)
                    if definition in seen:
                        continue
                    seen.add(definition)
                    discovered.append(definition)
                    if len(discovered) >= _FALLBACK_METRIC_LIMIT:
                        return discovered
    except ClientError as exc:
        logger.warning(
            "cloudwatch_list_metrics_failed",
            extra={"resource_id": resource_id, "error": str(exc)},
        )

    return discovered
628
+
629
+
630
def _region_from_arn(arn: str) -> str:
    """
    Return the region field of an ARN, defaulting to "us-east-1".

    The region is the fourth colon-separated field
    (arn:partition:service:REGION:account:resource). The default is used when
    that field is empty or when the string has fewer than four fields.
    """
    fields = arn.split(":")
    if len(fields) <= 3:
        return "us-east-1"
    return fields[3] or "us-east-1"
634
+
635
+
636
+ def _dimension_value(arn: str, resource_type: str) -> str:
637
+ """
638
+ Extract the CloudWatch dimension value from an ARN.
639
+ Most resources use the last segment; some (ALB, RDS, Lambda) need special handling.
640
+ """
641
+ parts = arn.split(":")
642
+ resource_part = ":".join(parts[5:])
643
+
644
+ match resource_type:
645
+ case "AWS::ElasticLoadBalancingV2::LoadBalancer":
646
+ # arn:...:loadbalancer/app/name/id -> app/name/id
647
+ if "loadbalancer/" in resource_part:
648
+ return resource_part.split("loadbalancer/", 1)[1]
649
+ case "AWS::RDS::DBInstance" | "AWS::Lambda::Function" | "AWS::SNS::Topic":
650
+ # arn:...:db:name or function:name or :topic-name
651
+ return resource_part.split(":")[-1]
652
+ case "AWS::SQS::Queue":
653
+ # arn:aws:sqs:region:account:queue-name
654
+ return resource_part # queue name is the whole resource_part
655
+ case "AWS::CloudFront::Distribution":
656
+ # arn:aws:cloudfront::account:distribution/EDFDVBD6EXAMPLE
657
+ return resource_part.split("/")[-1]
658
+ case "AWS::StepFunctions::StateMachine":
659
+ # dimension is the full ARN for Step Functions
660
+ return arn
661
+ case "AWS::MSK::Cluster":
662
+ # arn:aws:kafka:region:account:cluster/name/uuid -> name
663
+ if "cluster/" in resource_part:
664
+ return resource_part.split("cluster/")[1].split("/")[0]
665
+ case "AWS::SageMaker::Endpoint":
666
+ # arn:aws:sagemaker:region:account:endpoint/name
667
+ return resource_part.split("/")[-1]
668
+ case "AWS::Glue::Job":
669
+ # arn:aws:glue:region:account:job/name
670
+ return resource_part.split("/")[-1]
671
+ case (
672
+ "AWS::RDS::DBCluster"
673
+ | "AWS::Neptune::DBCluster"
674
+ | "AWS::DocDB::DBCluster"
675
+ ):
676
+ # arn:...:cluster:name
677
+ return resource_part.split(":")[-1]
678
+ case "AWS::ElastiCache::ReplicationGroup":
679
+ # arn:aws:elasticache:region:account:replicationgroup:name
680
+ return resource_part.split(":")[-1]
681
+ case "AWS::EMR::Cluster":
682
+ # arn:aws:elasticmapreduce:region:account:cluster/j-XXXXXXXX
683
+ return resource_part.split("/")[-1]
684
+ case "AWS::DMS::ReplicationInstance":
685
+ # arn:aws:dms:region:account:rep:name
686
+ return resource_part.split(":")[-1]
687
+ case "AWS::WorkSpaces::Workspace":
688
+ # arn:aws:workspaces:region:account:workspace/ws-xxxxxxxx
689
+ return resource_part.split("/")[-1]
690
+ case "AWS::KinesisFirehose::DeliveryStream":
691
+ # arn:aws:firehose:region:account:deliverystream/name
692
+ return resource_part.split("/")[-1]
693
+ case "AWS::AppSync::GraphQLApi":
694
+ # arn:aws:appsync:region:account:apis/apiId
695
+ return resource_part.split("/")[-1]
696
+ case "AWS::Events::Rule":
697
+ # arn:aws:events:region:account:rule/name
698
+ return resource_part.split("/")[-1]
699
+ case "AWS::ElasticBeanstalk::Environment":
700
+ # arn:aws:elasticbeanstalk:region:account:environment/app/env
701
+ return resource_part.split("/")[-1]
702
+ case "AWS::CodeBuild::Project":
703
+ # arn:aws:codebuild:region:account:project/name
704
+ return resource_part.split("/")[-1]
705
+ case "AWS::Transfer::Server":
706
+ # arn:aws:transfer:region:account:server/s-xxxxxxxx
707
+ return resource_part.split("/")[-1]
708
+ case "AWS::WAFv2::WebACL":
709
+ # arn:aws:wafv2:region:account:regional/webacl/name/id -> name
710
+ parts_slash = resource_part.split("/")
711
+ return parts_slash[-2] if len(parts_slash) >= 2 else parts_slash[-1]
712
+ case "AWS::S3::Bucket":
713
+ # arn:aws:s3:::bucket-name
714
+ return resource_part
715
+ case "AWS::Cognito::UserPool":
716
+ # arn:aws:cognito-idp:region:account:userpool/us-east-1_XXXXXXX
717
+ return resource_part.split("/")[-1]
718
+ case "AWS::MediaLive::Channel":
719
+ # arn:aws:medialive:region:account:channel:id
720
+ return resource_part.split(":")[-1]
721
+ case "AWS::Batch::JobQueue":
722
+ # arn:aws:batch:region:account:job-queue/name
723
+ return resource_part.split("/")[-1]
724
+ case "AWS::Route53::HostedZone":
725
+ # arn:aws:route53:::hostedzone/ZXXXXXXX
726
+ return resource_part.split("/")[-1]
727
+
728
+ if "/" in resource_part:
729
+ return resource_part.split("/")[-1]
730
+ if ":" in resource_part:
731
+ return resource_part.split(":")[-1]
732
+ return resource_part