argus-cloud-optimizer 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adapters/__init__.py +0 -0
- adapters/aws/__init__.py +0 -0
- adapters/aws/adapter.py +85 -0
- adapters/aws/auth.py +57 -0
- adapters/aws/cloudtrail.py +83 -0
- adapters/aws/cloudwatch.py +732 -0
- adapters/aws/config.py +9 -0
- adapters/aws/cost_explorer.py +116 -0
- adapters/aws/resource_explorer.py +186 -0
- adapters/aws/retry.py +55 -0
- adapters/azure/__init__.py +0 -0
- adapters/azure/activity_log.py +159 -0
- adapters/azure/adapter.py +117 -0
- adapters/azure/cost_management.py +125 -0
- adapters/azure/monitor.py +311 -0
- adapters/azure/resource_graph.py +113 -0
- adapters/azure/retry.py +57 -0
- adapters/base.py +105 -0
- adapters/gcp/__init__.py +0 -0
- adapters/gcp/adapter.py +86 -0
- adapters/gcp/asset_inventory.py +116 -0
- adapters/gcp/billing.py +118 -0
- adapters/gcp/cloud_logging.py +93 -0
- adapters/gcp/cloud_monitoring.py +276 -0
- adapters/gcp/retry.py +46 -0
- ai/__init__.py +0 -0
- ai/anthropic.py +174 -0
- ai/azure_openai.py +241 -0
- ai/base.py +78 -0
- ai/bedrock.py +169 -0
- ai/vertexai.py +234 -0
- argus_cloud_optimizer-0.2.0.dist-info/METADATA +433 -0
- argus_cloud_optimizer-0.2.0.dist-info/RECORD +62 -0
- argus_cloud_optimizer-0.2.0.dist-info/WHEEL +5 -0
- argus_cloud_optimizer-0.2.0.dist-info/entry_points.txt +2 -0
- argus_cloud_optimizer-0.2.0.dist-info/licenses/LICENSE +21 -0
- argus_cloud_optimizer-0.2.0.dist-info/top_level.txt +4 -0
- core/__init__.py +0 -0
- core/__version__.py +1 -0
- core/agent/__init__.py +0 -0
- core/agent/loop.py +390 -0
- core/agent/prompts.py +317 -0
- core/config.py +235 -0
- core/log.py +69 -0
- core/models/__init__.py +0 -0
- core/models/finding.py +76 -0
- core/py.typed +0 -0
- core/reports/__init__.py +0 -0
- core/reports/comparison.py +49 -0
- core/reports/delivery.py +323 -0
- core/reports/export.py +111 -0
- core/reports/generator.py +168 -0
- core/reports/html.py +286 -0
- core/reports/multi_cloud.py +162 -0
- core/secrets.py +145 -0
- core/token_tracker.py +97 -0
- core/validation.py +214 -0
- entrypoints/__init__.py +0 -0
- entrypoints/aws_lambda.py +299 -0
- entrypoints/azure_function.py +257 -0
- entrypoints/cli.py +156 -0
- entrypoints/gcp_cloudrun.py +209 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timedelta, timezone
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
from azure.core.exceptions import HttpResponseError
|
|
8
|
+
from azure.identity import DefaultAzureCredential
|
|
9
|
+
from azure.mgmt.costmanagement import CostManagementClient
|
|
10
|
+
from azure.mgmt.costmanagement.models import (
|
|
11
|
+
QueryComparisonExpression,
|
|
12
|
+
QueryDataset,
|
|
13
|
+
QueryDefinition,
|
|
14
|
+
QueryFilter,
|
|
15
|
+
QueryGrouping,
|
|
16
|
+
QueryTimePeriod,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from adapters.azure.retry import retry_on_transient
|
|
20
|
+
|
|
21
|
+
logger = structlog.get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
_BATCH_SIZE = 50 # Cost Management API supports up to ~100 resource IDs per filter
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_cost(
    subscription_id: str,
    resource_ids: list[str],
    days: int = 30,
    credential: Any = None,
) -> dict[str, float]:
    """
    Look up the cost of each given resource over the trailing ``days`` days.

    The resource IDs are sent to the Cost Management usage query in slices of
    ``_BATCH_SIZE`` so a single filter never grows too large.

    Args:
        subscription_id: Subscription used to build the query scope.
        resource_ids: Resource IDs to price; an empty list short-circuits.
        days: Length of the look-back window in days.
        credential: Azure credential; ``DefaultAzureCredential()`` is created
            when this is falsy.

    Returns:
        A dict with one entry per requested resource ID. Every entry starts at
        ``0.0`` and is increased by whatever the batch queries report, so IDs
        without cost data (or whose batch failed) stay at zero.

    Error handling:
        * HTTP 403/404 — logged as a warning and the remaining batches are
          skipped.
        * Any other ``HttpResponseError`` — logged as an error; processing
          continues with the next batch.
    """
    if not resource_ids:
        return {}

    azure_credential = credential or DefaultAzureCredential()
    client = CostManagementClient(
        azure_credential, connection_timeout=10, read_timeout=60
    )
    scope = f"/subscriptions/{subscription_id}"

    # Seed every requested ID so callers always get a complete mapping back.
    costs: dict[str, float] = dict.fromkeys(resource_ids, 0.0)

    # Walk the ID list one slice at a time to stay within API filter limits.
    offset = 0
    while offset < len(resource_ids):
        chunk = resource_ids[offset : offset + _BATCH_SIZE]
        offset += _BATCH_SIZE
        try:
            _query_batch(client, scope, chunk, days, costs)
        except HttpResponseError as exc:
            if exc.status_code not in (403, 404):
                # Unexpected failure: record it and move on to the next slice.
                logger.error(
                    "azure_cost_management_failed",
                    extra={"subscription_id": subscription_id, "error": str(exc)},
                )
                continue

            # Cost data is not accessible for this scope; further batches
            # would fail the same way, so stop here.
            logger.warning(
                "azure_cost_management_unavailable",
                extra={
                    "subscription_id": subscription_id,
                    "error": str(exc),
                    "hint": (
                        "Cost Management requires a paid Azure subscription. "
                        "Free trial accounts return no cost data."
                    ),
                },
            )
            break

    priced = [amount for amount in costs.values() if amount > 0]
    logger.info(
        "azure_cost_query_complete",
        extra={
            "subscription_id": subscription_id,
            "resources_queried": len(resource_ids),
            "resources_with_cost": len(priced),
        },
    )
    return costs
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _query_batch(
    client: CostManagementClient,
    scope: str,
    resource_ids: list[str],
    days: int,
    costs: dict[str, float],
) -> None:
    """
    Query cost for one batch of resource IDs and add the amounts to ``costs``.

    Args:
        client: Cost Management client used to run the usage query.
        scope: Query scope (built by the caller as ``/subscriptions/<id>``).
        resource_ids: Resource IDs placed in the ``In`` filter of this query.
        days: Length of the look-back window ending now (UTC).
        costs: Accumulator keyed by resource ID; updated in place. Only keys
            that already exist are touched, and matching is case-insensitive.

    Raises:
        HttpResponseError: propagated from the query once the retry helper
            gives up.
    """
    end_date = datetime.now(tz=timezone.utc)
    start_date = end_date - timedelta(days=days)

    query = QueryDefinition(
        type="Usage",
        timeframe="Custom",
        time_period=QueryTimePeriod(
            from_property=start_date,
            to=end_date,
        ),
        dataset=QueryDataset(
            granularity="None",
            aggregation={"totalCost": {"name": "PreTaxCost", "function": "Sum"}},  # type: ignore[dict-item]
            grouping=[QueryGrouping(type="Dimension", name="ResourceId")],
            filter=QueryFilter(
                dimensions=QueryComparisonExpression(
                    name="ResourceId",
                    operator="In",
                    values=resource_ids,
                )
            ),
        ),
    )

    result = retry_on_transient(client.query.usage, scope=scope, parameters=query)
    if not result or not result.rows:
        return

    # Resolve column positions from the result metadata instead of trusting a
    # fixed layout.
    # NOTE(review): Azure's published Query-Usage examples list grouping
    # columns before "Currency", so a hard-coded row[2] may be the currency
    # rather than the resource ID — confirm against a live response.
    cost_idx, id_idx = _locate_cost_columns(getattr(result, "columns", None))
    min_len = max(cost_idx, id_idx) + 1

    # One lowercase -> original-key table replaces a scan of ``costs`` per row.
    # setdefault keeps the first key when two keys differ only by case, which
    # is the key the original first-match scan would have credited.
    key_by_lower: dict[str, str] = {}
    for rid in costs:
        key_by_lower.setdefault(rid.lower(), rid)

    for row in result.rows:
        if len(row) < min_len or row[cost_idx] is None:
            continue
        # Azure resource IDs are case-insensitive, so compare lowercased.
        key = key_by_lower.get(str(row[id_idx]).lower())
        if key is not None:
            costs[key] += float(row[cost_idx])


def _locate_cost_columns(columns: Any) -> tuple[int, int]:
    """Return ``(cost_index, resource_id_index)`` for a usage query result.

    Column names are matched case-insensitively. When the metadata is missing
    or does not name a column, the historical positions (cost at 0, resource
    ID at 2) are used.
    """
    cost_idx, id_idx = 0, 2
    names = [str(getattr(col, "name", "") or "").lower() for col in columns or []]

    # The aggregation is aliased "totalCost" over "PreTaxCost"; accept either.
    for candidate in ("totalcost", "pretaxcost"):
        if candidate in names:
            cost_idx = names.index(candidate)
            break
    if "resourceid" in names:
        id_idx = names.index("resourceid")
    return cost_idx, id_idx
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timedelta, timezone
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
from azure.core.exceptions import HttpResponseError
|
|
8
|
+
from azure.identity import DefaultAzureCredential
|
|
9
|
+
from azure.monitor.query import MetricAggregationType, MetricsQueryClient
|
|
10
|
+
|
|
11
|
+
from adapters.azure.retry import retry_on_transient
|
|
12
|
+
from adapters.base import MetricSummary
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
# (MetricName, AggregationType)
|
|
17
|
+
_METRICS: dict[str, list[tuple[str, str]]] = {
|
|
18
|
+
# Virtual Machines
|
|
19
|
+
"microsoft.compute/virtualmachines": [
|
|
20
|
+
("Percentage CPU", "Average"),
|
|
21
|
+
("Network In Total", "Total"),
|
|
22
|
+
("Network Out Total", "Total"),
|
|
23
|
+
],
|
|
24
|
+
# VM Scale Sets
|
|
25
|
+
"microsoft.compute/virtualmachinescalesets": [
|
|
26
|
+
("Percentage CPU", "Average"),
|
|
27
|
+
("Network In Total", "Total"),
|
|
28
|
+
("Network Out Total", "Total"),
|
|
29
|
+
],
|
|
30
|
+
# Managed Disks
|
|
31
|
+
"microsoft.compute/disks": [
|
|
32
|
+
("Composite Disk Read Operations/sec", "Average"),
|
|
33
|
+
("Composite Disk Write Operations/sec", "Average"),
|
|
34
|
+
],
|
|
35
|
+
# Azure SQL Database
|
|
36
|
+
"microsoft.sql/servers/databases": [
|
|
37
|
+
("cpu_percent", "Average"),
|
|
38
|
+
("connection_successful", "Total"),
|
|
39
|
+
("storage_percent", "Average"),
|
|
40
|
+
],
|
|
41
|
+
# Azure SQL Managed Instance
|
|
42
|
+
"microsoft.sql/managedinstances": [
|
|
43
|
+
("avg_cpu_percent", "Average"),
|
|
44
|
+
("storage_space_used_mb", "Average"),
|
|
45
|
+
],
|
|
46
|
+
# App Service Plans
|
|
47
|
+
"microsoft.web/serverfarms": [
|
|
48
|
+
("CpuPercentage", "Average"),
|
|
49
|
+
("MemoryPercentage", "Average"),
|
|
50
|
+
("HttpQueueLength", "Average"),
|
|
51
|
+
],
|
|
52
|
+
# App Services / Function Apps
|
|
53
|
+
"microsoft.web/sites": [
|
|
54
|
+
("CpuTime", "Total"),
|
|
55
|
+
("Requests", "Total"),
|
|
56
|
+
("BytesReceived", "Total"),
|
|
57
|
+
],
|
|
58
|
+
# AKS Clusters
|
|
59
|
+
"microsoft.containerservice/managedclusters": [
|
|
60
|
+
("node_cpu_usage_percentage", "Average"),
|
|
61
|
+
("node_memory_rss_percentage", "Average"),
|
|
62
|
+
("kube_node_status_allocatable_cpu_cores", "Average"),
|
|
63
|
+
],
|
|
64
|
+
# Container Instances
|
|
65
|
+
"microsoft.containerinstance/containergroups": [
|
|
66
|
+
("CpuUsage", "Average"),
|
|
67
|
+
("MemoryUsage", "Average"),
|
|
68
|
+
("NetworkBytesReceivedPerSecond", "Average"),
|
|
69
|
+
],
|
|
70
|
+
# Azure Cache for Redis
|
|
71
|
+
"microsoft.cache/redis": [
|
|
72
|
+
("connectedclients", "Average"),
|
|
73
|
+
("cachehits", "Total"),
|
|
74
|
+
("cachemisses", "Total"),
|
|
75
|
+
],
|
|
76
|
+
# Cosmos DB
|
|
77
|
+
"microsoft.documentdb/databaseaccounts": [
|
|
78
|
+
("TotalRequests", "Total"),
|
|
79
|
+
("NormalizedRUConsumption", "Average"),
|
|
80
|
+
("ServerSideLatency", "Average"),
|
|
81
|
+
],
|
|
82
|
+
# Storage Accounts
|
|
83
|
+
"microsoft.storage/storageaccounts": [
|
|
84
|
+
("Transactions", "Total"),
|
|
85
|
+
("Ingress", "Total"),
|
|
86
|
+
("Egress", "Total"),
|
|
87
|
+
],
|
|
88
|
+
# Azure Kubernetes Service Node Pools
|
|
89
|
+
"microsoft.containerservice/managedclusters/agentpools": [
|
|
90
|
+
("node_cpu_usage_percentage", "Average"),
|
|
91
|
+
("node_memory_rss_percentage", "Average"),
|
|
92
|
+
],
|
|
93
|
+
# Event Hubs
|
|
94
|
+
"microsoft.eventhub/namespaces": [
|
|
95
|
+
("IncomingMessages", "Total"),
|
|
96
|
+
("OutgoingMessages", "Total"),
|
|
97
|
+
("ActiveConnections", "Average"),
|
|
98
|
+
],
|
|
99
|
+
# Service Bus
|
|
100
|
+
"microsoft.servicebus/namespaces": [
|
|
101
|
+
("IncomingMessages", "Total"),
|
|
102
|
+
("OutgoingMessages", "Total"),
|
|
103
|
+
("ActiveConnections", "Average"),
|
|
104
|
+
],
|
|
105
|
+
# Azure Functions (child type of web/sites; tracked separately with execution metrics)
|
|
106
|
+
"microsoft.web/sites/functions": [
|
|
107
|
+
("FunctionExecutionCount", "Total"),
|
|
108
|
+
("FunctionExecutionUnits", "Total"),
|
|
109
|
+
],
|
|
110
|
+
# API Management
|
|
111
|
+
"microsoft.apimanagement/service": [
|
|
112
|
+
("TotalRequests", "Total"),
|
|
113
|
+
("SuccessfulRequests", "Total"),
|
|
114
|
+
("Capacity", "Average"),
|
|
115
|
+
],
|
|
116
|
+
# Application Gateway
|
|
117
|
+
"microsoft.network/applicationgateways": [
|
|
118
|
+
("TotalRequests", "Total"),
|
|
119
|
+
("CurrentConnections", "Average"),
|
|
120
|
+
("Throughput", "Average"),
|
|
121
|
+
],
|
|
122
|
+
# Load Balancers
|
|
123
|
+
"microsoft.network/loadbalancers": [
|
|
124
|
+
("PacketCount", "Total"),
|
|
125
|
+
("ByteCount", "Total"),
|
|
126
|
+
("AllocatedSnatPorts", "Average"),
|
|
127
|
+
],
|
|
128
|
+
# Azure Databricks
|
|
129
|
+
"microsoft.databricks/workspaces": [
|
|
130
|
+
("autoOptimizeClusterUtilization", "Average"),
|
|
131
|
+
("numActiveClusters", "Average"),
|
|
132
|
+
],
|
|
133
|
+
# HDInsight
|
|
134
|
+
"microsoft.hdinsight/clusters": [
|
|
135
|
+
("GatewayRequests", "Total"),
|
|
136
|
+
("CategorizedGatewayRequests", "Total"),
|
|
137
|
+
],
|
|
138
|
+
# Logic Apps
|
|
139
|
+
"microsoft.logic/workflows": [
|
|
140
|
+
("RunsStarted", "Total"),
|
|
141
|
+
("RunsCompleted", "Total"),
|
|
142
|
+
("RunsFailed", "Total"),
|
|
143
|
+
],
|
|
144
|
+
# Cognitive Services / OpenAI
|
|
145
|
+
"microsoft.cognitiveservices/accounts": [
|
|
146
|
+
("TotalCalls", "Total"),
|
|
147
|
+
("TotalErrors", "Total"),
|
|
148
|
+
("Latency", "Average"),
|
|
149
|
+
],
|
|
150
|
+
# Azure Search
|
|
151
|
+
"microsoft.search/searchservices": [
|
|
152
|
+
("SearchQueriesPerSecond", "Average"),
|
|
153
|
+
("ThrottledSearchQueriesPercentage", "Average"),
|
|
154
|
+
],
|
|
155
|
+
# Azure Stream Analytics
|
|
156
|
+
"microsoft.streamanalytics/streamingjobs": [
|
|
157
|
+
("InputEvents", "Total"),
|
|
158
|
+
("OutputEvents", "Total"),
|
|
159
|
+
("ResourceUtilization", "Average"),
|
|
160
|
+
],
|
|
161
|
+
# Azure Data Factory
|
|
162
|
+
"microsoft.datafactory/factories": [
|
|
163
|
+
("PipelineRunsStarted", "Total"),
|
|
164
|
+
("ActivityRunsStarted", "Total"),
|
|
165
|
+
("TriggerRunsStarted", "Total"),
|
|
166
|
+
],
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
_AGGREGATION_MAP = {
|
|
170
|
+
"Average": MetricAggregationType.AVERAGE,
|
|
171
|
+
"Total": MetricAggregationType.TOTAL,
|
|
172
|
+
"Minimum": MetricAggregationType.MINIMUM,
|
|
173
|
+
"Maximum": MetricAggregationType.MAXIMUM,
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
_FALLBACK_METRIC_LIMIT = 5
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def get_metrics(
    resource_id: str,
    resource_type: str,
    days: int = 90,
    credential: Any = None,
) -> MetricSummary:
    """
    Fetch Azure Monitor metrics for a resource.

    Metric names come from the ``_METRICS`` table (keyed by lower-cased
    resource type). For types not in the table, the resource's own metric
    definitions are discovered instead.

    Args:
        resource_id: Resource URI passed to the metrics query.
        resource_type: Resource type; lower-cased before the table lookup.
        days: Length of the look-back window ending now (UTC); data is
            requested at one-day granularity.
        credential: Azure credential; ``DefaultAzureCredential()`` is created
            when this is falsy.

    Returns:
        A ``MetricSummary``. When no metrics are known for the type, or the
        query fails with ``HttpResponseError``, the summary has empty
        ``metrics`` and ``has_data=False``.
    """

    def _no_data() -> MetricSummary:
        return MetricSummary(
            resource_id=resource_id,
            resource_type=resource_type,
            period_days=days,
            metrics={},
            has_data=False,
        )

    metric_defs = _METRICS.get(resource_type.lower())
    if not metric_defs:
        metric_defs = _discover_metrics(resource_id, resource_type, credential)
    if not metric_defs:
        return _no_data()

    cred = credential or DefaultAzureCredential()
    client = MetricsQueryClient(cred, connection_timeout=10, read_timeout=60)

    end_time = datetime.now(tz=timezone.utc)
    start_time = end_time - timedelta(days=days)
    granularity = timedelta(days=1)

    metric_names = [name for name, _ in metric_defs]

    # Ask the service for exactly the aggregations the response parser reads:
    # ``total`` for "Total" metrics and ``average`` for everything else.
    # Without an explicit request the returned data points are not guaranteed
    # to carry both fields, which would make such metrics look empty.
    wanted = sorted(
        {"Total" if agg == "Total" else "Average" for _, agg in metric_defs}
    )
    aggregations = [_AGGREGATION_MAP[label] for label in wanted]

    try:
        response = retry_on_transient(
            client.query_resource,
            resource_uri=resource_id,
            metric_names=metric_names,
            timespan=(start_time, end_time),
            granularity=granularity,
            aggregations=aggregations,
        )
    except HttpResponseError as exc:
        logger.warning(
            "azure_monitor_query_failed",
            extra={"resource_id": resource_id, "error": str(exc)},
        )
        return _no_data()

    return _parse_response(response, metric_defs, resource_id, resource_type, days)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _parse_response(
    response: Any,
    metric_defs: list[tuple[str, str]],
    resource_id: str,
    resource_type: str,
    days: int,
) -> MetricSummary:
    """Collapse a metrics query response into one number per metric.

    For each metric in ``response.metrics`` every data point of every time
    series is read: the ``total`` field when the metric's configured
    aggregation is ``"Total"``, otherwise the ``average`` field (metrics not
    listed in ``metric_defs`` are treated as ``"Average"``). ``None`` readings
    are dropped. "Total" metrics are summed, all others are averaged, and the
    result is rounded to 4 decimals. A metric with no readings maps to
    ``None``.

    ``has_data`` on the returned summary is true when at least one metric
    produced a reading.
    """
    aggregation_by_name = dict(metric_defs)

    summary: dict[str, Any] = {}
    any_readings = False

    for metric in response.metrics:
        metric_name: str = metric.name
        use_total = aggregation_by_name.get(metric_name, "Average") == "Total"

        readings = (
            point.total if use_total else point.average
            for series in metric.timeseries
            for point in series.data
        )
        samples: list[float] = [reading for reading in readings if reading is not None]

        if samples:
            any_readings = True
            if use_total:
                combined = sum(samples)
            else:
                combined = sum(samples) / len(samples)
            summary[metric_name] = round(combined, 4)
        else:
            summary[metric_name] = None

    return MetricSummary(
        resource_id=resource_id,
        resource_type=resource_type,
        period_days=days,
        metrics=summary,
        has_data=any_readings,
    )
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _discover_metrics(
    resource_id: str,
    resource_type: str,
    credential: Any = None,
) -> list[tuple[str, str]]:
    """Auto-discover available metrics for unknown Azure resource types.

    Lists the metric definitions of ``resource_id`` and keeps the first
    ``_FALLBACK_METRIC_LIMIT`` named ones. Each is paired with an aggregation
    guessed from its name: ``"Total"`` when the name contains a counter-like
    keyword, otherwise ``"Average"``.

    Args:
        resource_id: Resource URI whose metric definitions are listed.
        resource_type: Accepted for signature parity; not used in the lookup.
        credential: Azure credential; ``DefaultAzureCredential()`` is created
            when this is falsy.

    Returns:
        ``(metric_name, aggregation)`` pairs. The list is empty, or partial,
        when listing fails with ``HttpResponseError``; that error is logged,
        not raised.
    """
    cred = credential or DefaultAzureCredential()
    client = MetricsQueryClient(cred, connection_timeout=10, read_timeout=60)
    discovered: list[tuple[str, str]] = []
    total_keywords = ("count", "bytes", "requests", "transactions", "total")

    try:
        definitions = retry_on_transient(
            client.list_metric_definitions, resource_uri=resource_id
        )
        for defn in definitions:
            metric_name: str = defn.name or ""
            if not metric_name:
                # A definition without a name cannot be queried; passing an
                # empty metric name along would only break the later request.
                continue

            # Pick aggregation based on metric name heuristics
            lowered = metric_name.lower()
            agg = "Total" if any(kw in lowered for kw in total_keywords) else "Average"
            discovered.append((metric_name, agg))
            if len(discovered) >= _FALLBACK_METRIC_LIMIT:
                break
    except HttpResponseError as exc:
        logger.warning(
            "azure_monitor_list_metrics_failed",
            extra={"resource_id": resource_id, "error": str(exc)},
        )

    return discovered
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import structlog
|
|
6
|
+
from azure.core.exceptions import HttpResponseError
|
|
7
|
+
from azure.identity import DefaultAzureCredential
|
|
8
|
+
from azure.mgmt.resourcegraph import ResourceGraphClient
|
|
9
|
+
from azure.mgmt.resourcegraph.models import QueryRequest, QueryRequestOptions
|
|
10
|
+
|
|
11
|
+
from adapters.azure.retry import retry_on_transient
|
|
12
|
+
from adapters.base import Resource
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
# KQL query — returns all resources with their type, location, tags, and resource group.
|
|
17
|
+
# We exclude resource types that have no billing impact (e.g. locks, role assignments).
|
|
18
|
+
_RESOURCE_QUERY = """
|
|
19
|
+
Resources
|
|
20
|
+
| where type !in~ (
|
|
21
|
+
'microsoft.authorization/roleassignments',
|
|
22
|
+
'microsoft.authorization/roledefinitions',
|
|
23
|
+
'microsoft.authorization/locks',
|
|
24
|
+
'microsoft.resources/deployments',
|
|
25
|
+
'microsoft.resources/tags'
|
|
26
|
+
)
|
|
27
|
+
| project id, name, type, location, resourceGroup, tags, subscriptionId
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
_PAGE_SIZE = 1000 # Resource Graph max per page
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def list_resources(
    subscription_ids: list[str],
    ignore_regions: list[str] | None = None,
    credential: Any = None,
) -> list[Resource]:
    """
    List the resources of the given subscriptions via Azure Resource Graph.

    A single query (``_RESOURCE_QUERY``) is issued across all subscriptions
    and paged through ``_PAGE_SIZE`` rows at a time, following the skip token
    until the service stops returning one.

    Args:
        subscription_ids: Subscriptions the query runs against.
        ignore_regions: Locations to drop, compared case-insensitively.
        credential: Azure credential; ``DefaultAzureCredential()`` is created
            when this is falsy.

    Returns:
        The parsed resources, in the order the pages returned them. Rows that
        the row parser rejects are left out.

    Raises:
        PermissionError: the query failed with HTTP 403.
        HttpResponseError: any other HTTP failure, re-raised unchanged.
    """
    azure_credential = credential or DefaultAzureCredential()
    client = ResourceGraphClient(
        azure_credential, connection_timeout=10, read_timeout=60
    )
    excluded_regions = {region.lower() for region in (ignore_regions or [])}
    found: list[Resource] = []

    request = QueryRequest(
        subscriptions=subscription_ids,
        query=_RESOURCE_QUERY,
        options=QueryRequestOptions(result_format="objectArray", top=_PAGE_SIZE),
    )

    next_token: str | None = None
    more_pages = True

    try:
        while more_pages:
            # From the second page on, resume where the previous page ended.
            if next_token:
                request.options.skip_token = next_token

            page = retry_on_transient(client.resources, request)

            candidates = (
                _parse_resource(item, excluded_regions) for item in page.data or []
            )
            found.extend(item for item in candidates if item)

            # The token is looked up under both attribute spellings and is
            # only trusted when it is a string.
            token = getattr(page, "skip_token", None)
            if not token:
                token = getattr(page, "$skipToken", None)
            next_token = token if isinstance(token, str) else None
            more_pages = bool(next_token)

    except HttpResponseError as exc:
        if exc.status_code != 403:
            raise
        raise PermissionError(
            "Argus service principal is missing Reader role "
            "on the subscription(s). "
            "Assign 'Reader' at the subscription scope."
        ) from exc

    logger.info(
        "resource_graph_query_complete",
        extra={"subscriptions": subscription_ids, "total": len(found)},
    )
    return found
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _parse_resource(raw: dict[str, Any], ignore_set: set[str]) -> Resource | None:
    """Convert one Resource Graph row into a ``Resource``.

    Args:
        raw: Row dict with the projected columns (``id``, ``name``, ``type``,
            ``location``, ``tags``, ...).
        ignore_set: Lower-cased locations to exclude.

    Returns:
        The parsed ``Resource``, or ``None`` when the row has no ID or type,
        or when its location is in ``ignore_set``.
    """
    # A projected column is present in the row even when its value is null,
    # so a ``dict.get`` default alone never applies; treat null as missing.
    resource_id: str = raw.get("id") or ""
    name: str = raw.get("name") or ""
    resource_type: str = raw.get("type") or ""
    location = raw.get("location")
    if location is None:
        location = "global"

    if not resource_id or not resource_type:
        return None
    if location.lower() in ignore_set:
        return None

    # Keys and values are coerced to str; a null ``tags`` value yields {}.
    tags: dict[str, str] = {str(k): str(v) for k, v in (raw.get("tags") or {}).items()}

    return Resource(
        resource_id=resource_id,
        resource_type=resource_type.lower(),
        cloud="azure",
        region=location,
        name=name or None,
        tags=tags,
    )
|
adapters/azure/retry.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from typing import Any, TypeVar
|
|
7
|
+
|
|
8
|
+
import structlog
|
|
9
|
+
from azure.core.exceptions import HttpResponseError
|
|
10
|
+
|
|
11
|
+
logger = structlog.get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
T = TypeVar("T")
|
|
14
|
+
|
|
15
|
+
_MAX_RETRIES = 3
|
|
16
|
+
_BASE_DELAY = 1.0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def retry_on_transient(
    fn: Callable[..., T],
    *args: Any,
    **kwargs: Any,
) -> T:
    """Call ``fn(*args, **kwargs)``, retrying on transient HTTP failures.

    An ``HttpResponseError`` with status 429, 500, 502, 503 or 504 triggers a
    retry, up to ``_MAX_RETRIES`` attempts in total. The wait before a retry
    is the server's ``Retry-After`` value when one is parsed (and non-zero);
    otherwise it is the current backoff delay plus random jitter of up to half
    that delay. The backoff delay starts at ``_BASE_DELAY`` and doubles after
    every retry.

    Returns:
        Whatever ``fn`` returns on the first successful attempt.

    Raises:
        HttpResponseError: for a non-transient status, or when the final
            attempt also fails.
    """
    backoff = _BASE_DELAY
    final_attempt = _MAX_RETRIES - 1

    for attempt in range(_MAX_RETRIES):
        try:
            return fn(*args, **kwargs)
        except HttpResponseError as exc:
            status = exc.status_code or 0
            if status not in (429, 500, 502, 503, 504) or attempt >= final_attempt:
                # Not retryable, or no attempts left: surface the error as-is.
                raise

            server_hint = _parse_retry_after(exc)
            jitter = random.uniform(0, backoff * 0.5)  # noqa: S311
            if server_hint:
                sleep_time = server_hint
            else:
                sleep_time = backoff + jitter

            logger.warning(
                "azure_transient_error_retrying",
                status_code=status,
                attempt=attempt + 1,
                max_retries=_MAX_RETRIES,
                retry_in=round(sleep_time, 1),
            )
            time.sleep(sleep_time)
            backoff *= 2

    raise RuntimeError("Unreachable")  # pragma: no cover
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _parse_retry_after(exc: HttpResponseError) -> float | None:
    """Extract a usable wait time, in seconds, from a ``Retry-After`` header.

    Both header forms are understood: a number of seconds, and an HTTP-date
    (converted to the number of seconds from now).

    Args:
        exc: The HTTP error whose response headers are inspected.

    Returns:
        A finite, non-negative number of seconds, or ``None`` when there is no
        response, no header, the value cannot be parsed, or the value is
        negative / NaN / infinite (including a date that is already past).
        Returning ``None`` for those keeps an unusable value from reaching
        ``time.sleep`` in the caller.
    """
    # Imported locally so this block does not depend on new module imports.
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if exc.response is None:
        return None
    header = exc.response.headers.get("Retry-After")
    if header is None:
        return None

    try:
        seconds = float(header)
    except (TypeError, ValueError):
        # Not a plain number: try the HTTP-date form.
        try:
            when = parsedate_to_datetime(str(header))
        except (TypeError, ValueError):
            return None
        if when.tzinfo is None:
            # Dates parsed without a zone are taken as UTC.
            when = when.replace(tzinfo=timezone.utc)
        seconds = (when - datetime.now(tz=timezone.utc)).total_seconds()

    if not math.isfinite(seconds) or seconds < 0:
        return None
    return seconds
|