argus-cloud-optimizer 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. adapters/__init__.py +0 -0
  2. adapters/aws/__init__.py +0 -0
  3. adapters/aws/adapter.py +85 -0
  4. adapters/aws/auth.py +57 -0
  5. adapters/aws/cloudtrail.py +83 -0
  6. adapters/aws/cloudwatch.py +732 -0
  7. adapters/aws/config.py +9 -0
  8. adapters/aws/cost_explorer.py +116 -0
  9. adapters/aws/resource_explorer.py +186 -0
  10. adapters/aws/retry.py +55 -0
  11. adapters/azure/__init__.py +0 -0
  12. adapters/azure/activity_log.py +159 -0
  13. adapters/azure/adapter.py +117 -0
  14. adapters/azure/cost_management.py +125 -0
  15. adapters/azure/monitor.py +311 -0
  16. adapters/azure/resource_graph.py +113 -0
  17. adapters/azure/retry.py +57 -0
  18. adapters/base.py +105 -0
  19. adapters/gcp/__init__.py +0 -0
  20. adapters/gcp/adapter.py +86 -0
  21. adapters/gcp/asset_inventory.py +116 -0
  22. adapters/gcp/billing.py +118 -0
  23. adapters/gcp/cloud_logging.py +93 -0
  24. adapters/gcp/cloud_monitoring.py +276 -0
  25. adapters/gcp/retry.py +46 -0
  26. ai/__init__.py +0 -0
  27. ai/anthropic.py +174 -0
  28. ai/azure_openai.py +241 -0
  29. ai/base.py +78 -0
  30. ai/bedrock.py +169 -0
  31. ai/vertexai.py +234 -0
  32. argus_cloud_optimizer-0.2.0.dist-info/METADATA +433 -0
  33. argus_cloud_optimizer-0.2.0.dist-info/RECORD +62 -0
  34. argus_cloud_optimizer-0.2.0.dist-info/WHEEL +5 -0
  35. argus_cloud_optimizer-0.2.0.dist-info/entry_points.txt +2 -0
  36. argus_cloud_optimizer-0.2.0.dist-info/licenses/LICENSE +21 -0
  37. argus_cloud_optimizer-0.2.0.dist-info/top_level.txt +4 -0
  38. core/__init__.py +0 -0
  39. core/__version__.py +1 -0
  40. core/agent/__init__.py +0 -0
  41. core/agent/loop.py +390 -0
  42. core/agent/prompts.py +317 -0
  43. core/config.py +235 -0
  44. core/log.py +69 -0
  45. core/models/__init__.py +0 -0
  46. core/models/finding.py +76 -0
  47. core/py.typed +0 -0
  48. core/reports/__init__.py +0 -0
  49. core/reports/comparison.py +49 -0
  50. core/reports/delivery.py +323 -0
  51. core/reports/export.py +111 -0
  52. core/reports/generator.py +168 -0
  53. core/reports/html.py +286 -0
  54. core/reports/multi_cloud.py +162 -0
  55. core/secrets.py +145 -0
  56. core/token_tracker.py +97 -0
  57. core/validation.py +214 -0
  58. entrypoints/__init__.py +0 -0
  59. entrypoints/aws_lambda.py +299 -0
  60. entrypoints/azure_function.py +257 -0
  61. entrypoints/cli.py +156 -0
  62. entrypoints/gcp_cloudrun.py +209 -0
@@ -0,0 +1,125 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timedelta, timezone
4
+ from typing import Any
5
+
6
+ import structlog
7
+ from azure.core.exceptions import HttpResponseError
8
+ from azure.identity import DefaultAzureCredential
9
+ from azure.mgmt.costmanagement import CostManagementClient
10
+ from azure.mgmt.costmanagement.models import (
11
+ QueryComparisonExpression,
12
+ QueryDataset,
13
+ QueryDefinition,
14
+ QueryFilter,
15
+ QueryGrouping,
16
+ QueryTimePeriod,
17
+ )
18
+
19
+ from adapters.azure.retry import retry_on_transient
20
+
21
+ logger = structlog.get_logger(__name__)
22
+
23
# Number of resource IDs sent in a single Cost Management query filter.
# Cost Management API supports up to ~100 resource IDs per filter.
_BATCH_SIZE = 50
24
+
25
+
26
def get_cost(
    subscription_id: str,
    resource_ids: list[str],
    days: int = 30,
    credential: Any = None,
) -> dict[str, float]:
    """
    Look up the estimated cost per resource ID for the trailing ``days`` days.

    The resource IDs are sent to the Azure Cost Management query API in
    chunks of ``_BATCH_SIZE`` so that no single filter grows too large.

    Args:
        subscription_id: Subscription whose scope is queried.
        resource_ids: Resource IDs to price. An empty list short-circuits
            to ``{}`` without creating a client.
        days: Size of the look-back window in days.
        credential: Azure credential; ``DefaultAzureCredential`` is used
            when omitted.

    Returns:
        A mapping with one entry per requested resource ID. IDs for which
        no cost was reported keep the value ``0.0``.

    A 403/404 from the service is logged as a warning and stops further
    batches; any other ``HttpResponseError`` is logged as an error and the
    next batch is still attempted.
    """
    if not resource_ids:
        return {}

    azure_credential = credential or DefaultAzureCredential()
    cost_client = CostManagementClient(
        azure_credential, connection_timeout=10, read_timeout=60
    )
    scope = f"/subscriptions/{subscription_id}"

    # Every requested ID starts at zero so callers always get a complete map.
    totals: dict[str, float] = dict.fromkeys(resource_ids, 0.0)

    offset = 0
    while offset < len(resource_ids):
        chunk = resource_ids[offset : offset + _BATCH_SIZE]
        offset += _BATCH_SIZE
        try:
            _query_batch(cost_client, scope, chunk, days, totals)
        except HttpResponseError as exc:
            if exc.status_code not in (403, 404):
                logger.error(
                    "azure_cost_management_failed",
                    extra={"subscription_id": subscription_id, "error": str(exc)},
                )
                continue
            # Cost data is not available for this subscription at all, so
            # querying the remaining batches would only repeat the failure.
            logger.warning(
                "azure_cost_management_unavailable",
                extra={
                    "subscription_id": subscription_id,
                    "error": str(exc),
                    "hint": (
                        "Cost Management requires a paid Azure subscription. "
                        "Free trial accounts return no cost data."
                    ),
                },
            )
            break

    priced = [amount for amount in totals.values() if amount > 0]
    logger.info(
        "azure_cost_query_complete",
        extra={
            "subscription_id": subscription_id,
            "resources_queried": len(resource_ids),
            "resources_with_cost": len(priced),
        },
    )
    return totals
81
+
82
+
83
def _query_batch(
    client: CostManagementClient,
    scope: str,
    resource_ids: list[str],
    days: int,
    costs: dict[str, float],
) -> None:
    """
    Query Cost Management for one batch of resource IDs and add the results
    to ``costs`` in place.

    Args:
        client: Cost Management client used to run the query.
        scope: Query scope, e.g. ``/subscriptions/<id>``.
        resource_ids: Resource IDs placed in the ``ResourceId In (...)`` filter.
        days: Size of the look-back window in days, ending now (UTC).
        costs: Accumulator keyed by the caller's resource IDs. Matching rows
            are added to the existing value; keys are never added or removed.

    Raises:
        HttpResponseError: Propagated from the query once retries are exhausted.
    """
    end_date = datetime.now(tz=timezone.utc)
    start_date = end_date - timedelta(days=days)

    query = QueryDefinition(
        type="Usage",
        timeframe="Custom",
        time_period=QueryTimePeriod(
            from_property=start_date,
            to=end_date,
        ),
        dataset=QueryDataset(
            granularity="None",
            aggregation={"totalCost": {"name": "PreTaxCost", "function": "Sum"}},  # type: ignore[dict-item]
            grouping=[QueryGrouping(type="Dimension", name="ResourceId")],
            filter=QueryFilter(
                dimensions=QueryComparisonExpression(
                    name="ResourceId",
                    operator="In",
                    values=resource_ids,
                )
            ),
        ),
    )

    result = retry_on_transient(client.query.usage, scope=scope, parameters=query)
    rows = (result.rows if result else None) or []
    if not rows:
        return

    # Resolve column positions from the response metadata instead of assuming
    # a fixed layout: the order of the cost, grouping and currency columns is
    # defined by the service. The previous fixed positions (cost at 0,
    # resource ID at 2) are kept as a fallback when no metadata is present.
    cost_idx, id_idx = 0, 2
    column_names = [
        str(getattr(column, "name", "") or "").lower()
        for column in (getattr(result, "columns", None) or [])
    ]
    cost_column = next(
        (name for name in ("pretaxcost", "totalcost") if name in column_names), None
    )
    if cost_column is not None and "resourceid" in column_names:
        cost_idx = column_names.index(cost_column)
        id_idx = column_names.index("resourceid")
    min_width = max(cost_idx, id_idx) + 1

    # Azure resource IDs are case-insensitive, so match on the lowercase form.
    # Built once per batch; the first key wins if two keys differ only by case.
    key_by_lower: dict[str, str] = {}
    for rid in costs:
        key_by_lower.setdefault(rid.lower(), rid)

    for row in rows:
        if len(row) < min_width:
            continue
        key = key_by_lower.get(str(row[id_idx]).lower())
        if key is not None:
            costs[key] += float(row[cost_idx])
@@ -0,0 +1,311 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timedelta, timezone
4
+ from typing import Any
5
+
6
+ import structlog
7
+ from azure.core.exceptions import HttpResponseError
8
+ from azure.identity import DefaultAzureCredential
9
+ from azure.monitor.query import MetricAggregationType, MetricsQueryClient
10
+
11
+ from adapters.azure.retry import retry_on_transient
12
+ from adapters.base import MetricSummary
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
# Metrics to query per Azure resource type.
# Keys are lowercase resource type strings (looked up with
# ``resource_type.lower()``); each value is a list of
# (MetricName, AggregationType) pairs, where AggregationType is the label
# "Average" or "Total" used when summarising the returned data points.
_METRICS: dict[str, list[tuple[str, str]]] = {
    # Virtual Machines
    "microsoft.compute/virtualmachines": [
        ("Percentage CPU", "Average"),
        ("Network In Total", "Total"),
        ("Network Out Total", "Total"),
    ],
    # VM Scale Sets
    "microsoft.compute/virtualmachinescalesets": [
        ("Percentage CPU", "Average"),
        ("Network In Total", "Total"),
        ("Network Out Total", "Total"),
    ],
    # Managed Disks
    "microsoft.compute/disks": [
        ("Composite Disk Read Operations/sec", "Average"),
        ("Composite Disk Write Operations/sec", "Average"),
    ],
    # Azure SQL Database
    "microsoft.sql/servers/databases": [
        ("cpu_percent", "Average"),
        ("connection_successful", "Total"),
        ("storage_percent", "Average"),
    ],
    # Azure SQL Managed Instance
    "microsoft.sql/managedinstances": [
        ("avg_cpu_percent", "Average"),
        ("storage_space_used_mb", "Average"),
    ],
    # App Service Plans
    "microsoft.web/serverfarms": [
        ("CpuPercentage", "Average"),
        ("MemoryPercentage", "Average"),
        ("HttpQueueLength", "Average"),
    ],
    # App Services / Function Apps
    "microsoft.web/sites": [
        ("CpuTime", "Total"),
        ("Requests", "Total"),
        ("BytesReceived", "Total"),
    ],
    # AKS Clusters
    "microsoft.containerservice/managedclusters": [
        ("node_cpu_usage_percentage", "Average"),
        ("node_memory_rss_percentage", "Average"),
        ("kube_node_status_allocatable_cpu_cores", "Average"),
    ],
    # Container Instances
    "microsoft.containerinstance/containergroups": [
        ("CpuUsage", "Average"),
        ("MemoryUsage", "Average"),
        ("NetworkBytesReceivedPerSecond", "Average"),
    ],
    # Azure Cache for Redis
    "microsoft.cache/redis": [
        ("connectedclients", "Average"),
        ("cachehits", "Total"),
        ("cachemisses", "Total"),
    ],
    # Cosmos DB
    "microsoft.documentdb/databaseaccounts": [
        ("TotalRequests", "Total"),
        ("NormalizedRUConsumption", "Average"),
        ("ServerSideLatency", "Average"),
    ],
    # Storage Accounts
    "microsoft.storage/storageaccounts": [
        ("Transactions", "Total"),
        ("Ingress", "Total"),
        ("Egress", "Total"),
    ],
    # Azure Kubernetes Service Node Pools
    "microsoft.containerservice/managedclusters/agentpools": [
        ("node_cpu_usage_percentage", "Average"),
        ("node_memory_rss_percentage", "Average"),
    ],
    # Event Hubs
    "microsoft.eventhub/namespaces": [
        ("IncomingMessages", "Total"),
        ("OutgoingMessages", "Total"),
        ("ActiveConnections", "Average"),
    ],
    # Service Bus
    "microsoft.servicebus/namespaces": [
        ("IncomingMessages", "Total"),
        ("OutgoingMessages", "Total"),
        ("ActiveConnections", "Average"),
    ],
    # Azure Functions (child type of web/sites; tracks execution metrics
    # rather than the site-level metrics listed for web/sites)
    "microsoft.web/sites/functions": [
        ("FunctionExecutionCount", "Total"),
        ("FunctionExecutionUnits", "Total"),
    ],
    # API Management
    "microsoft.apimanagement/service": [
        ("TotalRequests", "Total"),
        ("SuccessfulRequests", "Total"),
        ("Capacity", "Average"),
    ],
    # Application Gateway
    "microsoft.network/applicationgateways": [
        ("TotalRequests", "Total"),
        ("CurrentConnections", "Average"),
        ("Throughput", "Average"),
    ],
    # Load Balancers
    "microsoft.network/loadbalancers": [
        ("PacketCount", "Total"),
        ("ByteCount", "Total"),
        ("AllocatedSnatPorts", "Average"),
    ],
    # Azure Databricks
    "microsoft.databricks/workspaces": [
        ("autoOptimizeClusterUtilization", "Average"),
        ("numActiveClusters", "Average"),
    ],
    # HDInsight
    "microsoft.hdinsight/clusters": [
        ("GatewayRequests", "Total"),
        ("CategorizedGatewayRequests", "Total"),
    ],
    # Logic Apps
    "microsoft.logic/workflows": [
        ("RunsStarted", "Total"),
        ("RunsCompleted", "Total"),
        ("RunsFailed", "Total"),
    ],
    # Cognitive Services / OpenAI
    "microsoft.cognitiveservices/accounts": [
        ("TotalCalls", "Total"),
        ("TotalErrors", "Total"),
        ("Latency", "Average"),
    ],
    # Azure Search
    "microsoft.search/searchservices": [
        ("SearchQueriesPerSecond", "Average"),
        ("ThrottledSearchQueriesPercentage", "Average"),
    ],
    # Azure Stream Analytics
    "microsoft.streamanalytics/streamingjobs": [
        ("InputEvents", "Total"),
        ("OutputEvents", "Total"),
        ("ResourceUtilization", "Average"),
    ],
    # Azure Data Factory
    "microsoft.datafactory/factories": [
        ("PipelineRunsStarted", "Total"),
        ("ActivityRunsStarted", "Total"),
        ("TriggerRunsStarted", "Total"),
    ],
}
168
+
169
# Translates the aggregation labels used in _METRICS into the
# azure-monitor-query MetricAggregationType enum members.
_AGGREGATION_MAP: dict[str, MetricAggregationType] = {
    "Average": MetricAggregationType.AVERAGE,
    "Total": MetricAggregationType.TOTAL,
    "Minimum": MetricAggregationType.MINIMUM,
    "Maximum": MetricAggregationType.MAXIMUM,
}

# Upper bound on how many auto-discovered metrics are kept for a resource
# type that has no entry in _METRICS.
_FALLBACK_METRIC_LIMIT = 5
177
+
178
+
179
def get_metrics(
    resource_id: str,
    resource_type: str,
    days: int = 90,
    credential: Any = None,
) -> MetricSummary:
    """
    Fetch Azure Monitor metrics for a resource and summarise them.

    Known resource types use the metric list in ``_METRICS``; for any other
    type the available metric definitions are discovered from the resource
    itself.

    Args:
        resource_id: Full Azure resource ID (used as the metrics resource URI).
        resource_type: Azure resource type; matched case-insensitively.
        days: Size of the look-back window in days, ending now (UTC).
        credential: Azure credential; ``DefaultAzureCredential`` is used
            when omitted.

    Returns:
        A ``MetricSummary`` for the resource. ``has_data`` is ``False`` and
        ``metrics`` is empty when no metric definitions could be determined
        or when the query fails with an ``HttpResponseError`` (which is
        logged as a warning rather than raised).
    """
    # Build the credential once so discovery and the metrics query share it.
    cred = credential or DefaultAzureCredential()

    metric_defs = _METRICS.get(resource_type.lower())
    if not metric_defs:
        metric_defs = _discover_metrics(resource_id, resource_type, cred)
    if not metric_defs:
        return MetricSummary(
            resource_id=resource_id,
            resource_type=resource_type,
            period_days=days,
            metrics={},
            has_data=False,
        )

    client = MetricsQueryClient(cred, connection_timeout=10, read_timeout=60)

    end_time = datetime.now(tz=timezone.utc)
    start_time = end_time - timedelta(days=days)
    granularity = timedelta(days=1)

    metric_names = [name for name, _ in metric_defs]

    # Request exactly the aggregations the response parser reads ("Total"
    # metrics read ``.total``, everything else reads ``.average``). Without
    # this the populated fields depend on the service's default aggregation,
    # and the field being read can come back as None for every data point.
    aggregations = list(
        dict.fromkeys(
            _AGGREGATION_MAP["Total" if agg == "Total" else "Average"]
            for _, agg in metric_defs
        )
    )

    try:
        response = retry_on_transient(
            client.query_resource,
            resource_uri=resource_id,
            metric_names=metric_names,
            timespan=(start_time, end_time),
            granularity=granularity,
            aggregations=aggregations,
        )
    except HttpResponseError as exc:
        logger.warning(
            "azure_monitor_query_failed",
            extra={"resource_id": resource_id, "error": str(exc)},
        )
        return MetricSummary(
            resource_id=resource_id,
            resource_type=resource_type,
            period_days=days,
            metrics={},
            has_data=False,
        )

    return _parse_response(response, metric_defs, resource_id, resource_type, days)
232
+
233
+
234
def _parse_response(
    response: Any,
    metric_defs: list[tuple[str, str]],
    resource_id: str,
    resource_type: str,
    days: int,
) -> MetricSummary:
    """
    Collapse a metrics query response into one number per metric.

    For each metric in ``response.metrics`` the data points of all its time
    series are gathered. Metrics declared as "Total" in ``metric_defs`` read
    each point's ``total`` and are summed; all others (including metrics not
    listed in ``metric_defs``) read ``average`` and are averaged. Results are
    rounded to 4 decimal places. A metric with no non-null points is
    recorded as ``None``.

    Returns:
        A ``MetricSummary`` whose ``has_data`` is ``True`` when at least one
        metric produced a value.
    """
    aggregation_by_metric = dict(metric_defs)
    summary: dict[str, Any] = {}
    found_any = False

    for entry in response.metrics:
        metric_name: str = entry.name
        is_total = aggregation_by_metric.get(metric_name, "Average") == "Total"
        field = "total" if is_total else "average"

        samples: list[float] = [
            sample
            for series in entry.timeseries
            for point in series.data
            if (sample := getattr(point, field)) is not None
        ]

        if samples:
            found_any = True
            combined = sum(samples)
            if not is_total:
                combined = combined / len(samples)
            summary[metric_name] = round(combined, 4)
        else:
            summary[metric_name] = None

    return MetricSummary(
        resource_id=resource_id,
        resource_type=resource_type,
        period_days=days,
        metrics=summary,
        has_data=found_any,
    )
273
+
274
+
275
def _discover_metrics(
    resource_id: str,
    resource_type: str,
    credential: Any = None,
) -> list[tuple[str, str]]:
    """
    Auto-discover available metrics for unknown Azure resource types.

    Lists the metric definitions of ``resource_id`` and keeps the first
    ``_FALLBACK_METRIC_LIMIT`` named ones, pairing each with an aggregation
    label chosen from its name.

    Args:
        resource_id: Full Azure resource ID whose definitions are listed.
        resource_type: Resource type of the resource (not used for the lookup).
        credential: Azure credential; ``DefaultAzureCredential`` is used
            when omitted.

    Returns:
        ``(metric_name, aggregation)`` pairs, possibly empty. An
        ``HttpResponseError`` is logged as a warning and whatever was
        collected so far is returned.
    """
    cred = credential or DefaultAzureCredential()
    client = MetricsQueryClient(cred, connection_timeout=10, read_timeout=60)
    discovered: list[tuple[str, str]] = []

    # Name fragments that suggest a cumulative metric; anything else is
    # treated as a level and averaged.
    total_keywords = ("count", "bytes", "requests", "transactions", "total")

    try:
        definitions = retry_on_transient(
            client.list_metric_definitions, resource_uri=resource_id
        )
        for defn in definitions:
            metric_name: str = defn.name or ""
            if not metric_name:
                # A definition without a name cannot be queried; sending an
                # empty metric name would invalidate the whole request.
                continue
            lowered = metric_name.lower()
            # Pick aggregation based on metric name heuristics
            agg = "Total" if any(kw in lowered for kw in total_keywords) else "Average"
            discovered.append((metric_name, agg))
            if len(discovered) >= _FALLBACK_METRIC_LIMIT:
                break
    except HttpResponseError as exc:
        logger.warning(
            "azure_monitor_list_metrics_failed",
            extra={"resource_id": resource_id, "error": str(exc)},
        )

    return discovered
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import structlog
6
+ from azure.core.exceptions import HttpResponseError
7
+ from azure.identity import DefaultAzureCredential
8
+ from azure.mgmt.resourcegraph import ResourceGraphClient
9
+ from azure.mgmt.resourcegraph.models import QueryRequest, QueryRequestOptions
10
+
11
+ from adapters.azure.retry import retry_on_transient
12
+ from adapters.base import Resource
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
# KQL query — returns all resources with their type, location, tags, and resource group.
# We exclude resource types that have no billing impact (e.g. locks, role assignments).
# The projected columns are the keys read when each result row is parsed.
_RESOURCE_QUERY = """
Resources
| where type !in~ (
    'microsoft.authorization/roleassignments',
    'microsoft.authorization/roledefinitions',
    'microsoft.authorization/locks',
    'microsoft.resources/deployments',
    'microsoft.resources/tags'
)
| project id, name, type, location, resourceGroup, tags, subscriptionId
"""

# Rows requested per page (sent as the query's ``top`` option).
_PAGE_SIZE = 1000  # Resource Graph max per page
31
+
32
+
33
def list_resources(
    subscription_ids: list[str],
    ignore_regions: list[str] | None = None,
    credential: Any = None,
) -> list[Resource]:
    """
    List the billable Azure resources in the given subscriptions.

    Runs ``_RESOURCE_QUERY`` through Azure Resource Graph as one
    cross-subscription query and follows the skip token until every page
    has been read.

    Args:
        subscription_ids: Subscriptions to include in the query.
        ignore_regions: Locations to drop from the result (case-insensitive).
        credential: Azure credential; ``DefaultAzureCredential`` is used
            when omitted (Managed Identity in production, az login / env
            vars for local dev).

    Returns:
        The parsed resources, in the order the service returned them.

    Raises:
        PermissionError: The service answered 403 (missing Reader role).
        HttpResponseError: Any other HTTP failure, re-raised unchanged.
    """
    azure_credential = credential or DefaultAzureCredential()
    graph_client = ResourceGraphClient(
        azure_credential, connection_timeout=10, read_timeout=60
    )
    excluded_regions = {region.lower() for region in (ignore_regions or [])}
    found: list[Resource] = []

    request = QueryRequest(
        subscriptions=subscription_ids,
        query=_RESOURCE_QUERY,
        options=QueryRequestOptions(result_format="objectArray", top=_PAGE_SIZE),
    )

    next_token: str | None = None
    more_pages = True

    try:
        while more_pages:
            if next_token:
                request.options.skip_token = next_token

            page = retry_on_transient(graph_client.resources, request)

            candidates = (
                _parse_resource(row, excluded_regions) for row in page.data or []
            )
            found.extend(item for item in candidates if item)

            # The continuation token may be exposed under either attribute
            # name; anything that is not a string is treated as "no more pages".
            token = getattr(page, "skip_token", None)
            if not token:
                token = getattr(page, "$skipToken", None)
            next_token = token if isinstance(token, str) else None
            more_pages = bool(next_token)

    except HttpResponseError as exc:
        if exc.status_code != 403:
            raise
        raise PermissionError(
            "Argus service principal is missing Reader role "
            "on the subscription(s). "
            "Assign 'Reader' at the subscription scope."
        ) from exc

    logger.info(
        "resource_graph_query_complete",
        extra={"subscriptions": subscription_ids, "total": len(found)},
    )
    return found
91
+
92
+
93
+ def _parse_resource(raw: dict[str, Any], ignore_set: set[str]) -> Resource | None:
94
+ resource_id: str = raw.get("id", "")
95
+ name: str = raw.get("name", "")
96
+ resource_type: str = raw.get("type", "")
97
+ location: str = raw.get("location", "global")
98
+
99
+ if not resource_id or not resource_type:
100
+ return None
101
+ if location.lower() in ignore_set:
102
+ return None
103
+
104
+ tags: dict[str, str] = {str(k): str(v) for k, v in (raw.get("tags") or {}).items()}
105
+
106
+ return Resource(
107
+ resource_id=resource_id,
108
+ resource_type=resource_type.lower(),
109
+ cloud="azure",
110
+ region=location,
111
+ name=name or None,
112
+ tags=tags,
113
+ )
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import time
5
+ from collections.abc import Callable
6
+ from typing import Any, TypeVar
7
+
8
+ import structlog
9
+ from azure.core.exceptions import HttpResponseError
10
+
11
+ logger = structlog.get_logger(__name__)
12
+
13
T = TypeVar("T")

# Total number of attempts (the first call plus retries).
_MAX_RETRIES = 3
# Initial backoff in seconds; doubled after every retried attempt.
_BASE_DELAY = 1.0


def retry_on_transient(
    fn: Callable[..., T],
    *args: Any,
    **kwargs: Any,
) -> T:
    """
    Call ``fn(*args, **kwargs)``, retrying on transient Azure HTTP errors.

    An ``HttpResponseError`` with status 429, 500, 502, 503 or 504 is retried
    until ``_MAX_RETRIES`` attempts have been made. The wait before a retry
    is the server's ``Retry-After`` value when one is given, otherwise the
    current backoff plus up to 50% random jitter. Any other error, or a
    transient error on the final attempt, is re-raised unchanged.

    Returns:
        Whatever ``fn`` returns on its first successful call.
    """
    backoff = _BASE_DELAY
    last_attempt = _MAX_RETRIES - 1
    attempt = 0

    while attempt < _MAX_RETRIES:
        try:
            return fn(*args, **kwargs)
        except HttpResponseError as exc:
            status = exc.status_code or 0
            is_transient = status in (429, 500, 502, 503, 504)
            if not is_transient or attempt >= last_attempt:
                raise

            server_hint = _parse_retry_after(exc)
            jitter = random.uniform(0, backoff * 0.5)  # noqa: S311
            if server_hint:
                sleep_time = server_hint
            else:
                sleep_time = backoff + jitter
            logger.warning(
                "azure_transient_error_retrying",
                status_code=status,
                attempt=attempt + 1,
                max_retries=_MAX_RETRIES,
                retry_in=round(sleep_time, 1),
            )
            time.sleep(sleep_time)
            backoff *= 2
        attempt += 1

    raise RuntimeError("Unreachable")  # pragma: no cover
46
+
47
+
48
+ def _parse_retry_after(exc: HttpResponseError) -> float | None:
49
+ if exc.response is None:
50
+ return None
51
+ header = exc.response.headers.get("Retry-After")
52
+ if header is None:
53
+ return None
54
+ try:
55
+ return float(header)
56
+ except ValueError:
57
+ return None