argus-cloud-optimizer 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. adapters/__init__.py +0 -0
  2. adapters/aws/__init__.py +0 -0
  3. adapters/aws/adapter.py +85 -0
  4. adapters/aws/auth.py +57 -0
  5. adapters/aws/cloudtrail.py +83 -0
  6. adapters/aws/cloudwatch.py +732 -0
  7. adapters/aws/config.py +9 -0
  8. adapters/aws/cost_explorer.py +116 -0
  9. adapters/aws/resource_explorer.py +186 -0
  10. adapters/aws/retry.py +55 -0
  11. adapters/azure/__init__.py +0 -0
  12. adapters/azure/activity_log.py +159 -0
  13. adapters/azure/adapter.py +117 -0
  14. adapters/azure/cost_management.py +125 -0
  15. adapters/azure/monitor.py +311 -0
  16. adapters/azure/resource_graph.py +113 -0
  17. adapters/azure/retry.py +57 -0
  18. adapters/base.py +105 -0
  19. adapters/gcp/__init__.py +0 -0
  20. adapters/gcp/adapter.py +86 -0
  21. adapters/gcp/asset_inventory.py +116 -0
  22. adapters/gcp/billing.py +118 -0
  23. adapters/gcp/cloud_logging.py +93 -0
  24. adapters/gcp/cloud_monitoring.py +276 -0
  25. adapters/gcp/retry.py +46 -0
  26. ai/__init__.py +0 -0
  27. ai/anthropic.py +174 -0
  28. ai/azure_openai.py +241 -0
  29. ai/base.py +78 -0
  30. ai/bedrock.py +169 -0
  31. ai/vertexai.py +234 -0
  32. argus_cloud_optimizer-0.2.0.dist-info/METADATA +433 -0
  33. argus_cloud_optimizer-0.2.0.dist-info/RECORD +62 -0
  34. argus_cloud_optimizer-0.2.0.dist-info/WHEEL +5 -0
  35. argus_cloud_optimizer-0.2.0.dist-info/entry_points.txt +2 -0
  36. argus_cloud_optimizer-0.2.0.dist-info/licenses/LICENSE +21 -0
  37. argus_cloud_optimizer-0.2.0.dist-info/top_level.txt +4 -0
  38. core/__init__.py +0 -0
  39. core/__version__.py +1 -0
  40. core/agent/__init__.py +0 -0
  41. core/agent/loop.py +390 -0
  42. core/agent/prompts.py +317 -0
  43. core/config.py +235 -0
  44. core/log.py +69 -0
  45. core/models/__init__.py +0 -0
  46. core/models/finding.py +76 -0
  47. core/py.typed +0 -0
  48. core/reports/__init__.py +0 -0
  49. core/reports/comparison.py +49 -0
  50. core/reports/delivery.py +323 -0
  51. core/reports/export.py +111 -0
  52. core/reports/generator.py +168 -0
  53. core/reports/html.py +286 -0
  54. core/reports/multi_cloud.py +162 -0
  55. core/secrets.py +145 -0
  56. core/token_tracker.py +97 -0
  57. core/validation.py +214 -0
  58. entrypoints/__init__.py +0 -0
  59. entrypoints/aws_lambda.py +299 -0
  60. entrypoints/azure_function.py +257 -0
  61. entrypoints/cli.py +156 -0
  62. entrypoints/gcp_cloudrun.py +209 -0
@@ -0,0 +1,276 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timedelta, timezone
4
+ from typing import Any
5
+
6
+ import structlog
7
+ from google.api_core.exceptions import GoogleAPICallError
8
+ from google.cloud import monitoring_v3
9
+ from google.protobuf.timestamp_pb2 import Timestamp
10
+
11
+ from adapters.base import MetricSummary
12
+ from adapters.gcp.retry import retry_on_transient
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
+ # (MetricType, Stat, label_key_for_resource_filter)
17
+ # Stat: "mean" for utilisation, "sum" for throughput/count.
18
+ # label_key: the monitored-resource label used to filter to this specific resource.
19
+ _METRICS: dict[str, list[tuple[str, str]]] = {
20
+ "compute.googleapis.com/Instance": [
21
+ ("compute.googleapis.com/instance/cpu/utilization", "mean"),
22
+ ("compute.googleapis.com/instance/network/sent_bytes_count", "sum"),
23
+ ("compute.googleapis.com/instance/network/received_bytes_count", "sum"),
24
+ ],
25
+ "compute.googleapis.com/Disk": [
26
+ ("compute.googleapis.com/instance/disk/read_ops_count", "sum"),
27
+ ("compute.googleapis.com/instance/disk/write_ops_count", "sum"),
28
+ ],
29
+ "sql.googleapis.com/Instance": [
30
+ ("cloudsql.googleapis.com/database/cpu/utilization", "mean"),
31
+ ("cloudsql.googleapis.com/database/network/connections", "mean"),
32
+ ("cloudsql.googleapis.com/database/network/received_bytes_count", "sum"),
33
+ ],
34
+ "container.googleapis.com/Cluster": [
35
+ ("kubernetes.io/container/cpu/request_utilization", "mean"),
36
+ ("kubernetes.io/container/memory/request_utilization", "mean"),
37
+ ("kubernetes.io/node/cpu/allocatable_utilization", "mean"),
38
+ ],
39
+ "run.googleapis.com/Service": [
40
+ ("run.googleapis.com/request_count", "sum"),
41
+ ("run.googleapis.com/request_latencies", "mean"),
42
+ ("run.googleapis.com/container/cpu/utilizations", "mean"),
43
+ ],
44
+ "cloudfunctions.googleapis.com/Function": [
45
+ ("cloudfunctions.googleapis.com/function/execution_count", "sum"),
46
+ ("cloudfunctions.googleapis.com/function/execution_times", "mean"),
47
+ ],
48
+ "storage.googleapis.com/Bucket": [
49
+ ("storage.googleapis.com/api/request_count", "sum"),
50
+ ("storage.googleapis.com/network/sent_bytes_count", "sum"),
51
+ ],
52
+ "bigquery.googleapis.com/Dataset": [
53
+ ("bigquery.googleapis.com/storage/table_count", "mean"),
54
+ ("bigquery.googleapis.com/storage/stored_bytes", "mean"),
55
+ ],
56
+ "bigquery.googleapis.com/Table": [
57
+ ("bigquery.googleapis.com/storage/stored_bytes", "mean"),
58
+ ("bigquery.googleapis.com/storage/row_count", "mean"),
59
+ ],
60
+ "redis.googleapis.com/Instance": [
61
+ ("redis.googleapis.com/clients/connected", "mean"),
62
+ ("redis.googleapis.com/stats/cache_hit_ratio", "mean"),
63
+ ("redis.googleapis.com/stats/memory/usage_ratio", "mean"),
64
+ ],
65
+ "spanner.googleapis.com/Instance": [
66
+ ("spanner.googleapis.com/instance/cpu/utilization", "mean"),
67
+ ("spanner.googleapis.com/instance/session_count", "mean"),
68
+ ],
69
+ "pubsub.googleapis.com/Topic": [
70
+ ("pubsub.googleapis.com/topic/send_message_operation_count", "sum"),
71
+ ("pubsub.googleapis.com/topic/byte_cost", "sum"),
72
+ ],
73
+ "pubsub.googleapis.com/Subscription": [
74
+ ("pubsub.googleapis.com/subscription/pull_message_operation_count", "sum"),
75
+ ("pubsub.googleapis.com/subscription/num_undelivered_messages", "mean"),
76
+ ],
77
+ "dataflow.googleapis.com/Job": [
78
+ ("dataflow.googleapis.com/job/data_watermark_age", "mean"),
79
+ ("dataflow.googleapis.com/job/elapsed_time", "mean"),
80
+ ("dataflow.googleapis.com/job/element_count", "sum"),
81
+ ],
82
+ "dataproc.googleapis.com/Cluster": [
83
+ ("dataproc.googleapis.com/cluster/yarn/allocated_memory_percentage", "mean"),
84
+ ("dataproc.googleapis.com/cluster/hdfs/storage_utilization", "mean"),
85
+ ],
86
+ "aiplatform.googleapis.com/Endpoint": [
87
+ ("aiplatform.googleapis.com/prediction/online/request_count", "sum"),
88
+ ("aiplatform.googleapis.com/prediction/online/latencies", "mean"),
89
+ ],
90
+ }
91
+
92
+ _PERIOD_SECONDS = 86400 # daily granularity
93
+ _FALLBACK_METRIC_LIMIT = 5
94
+
95
+
96
def get_metrics(
    project_id: str,
    resource_id: str,
    resource_type: str,
    days: int = 90,
) -> MetricSummary:
    """
    Fetch Cloud Monitoring metrics for a GCP resource.

    Known resource types use the curated list in ``_METRICS``; any other type
    falls back to ``_discover_metrics``. Every metric is queried on its own
    with daily alignment over the last ``days`` days and folded into a single
    number according to its stat ("mean" or "sum").

    Args:
        project_id: Project whose time series are queried.
        resource_id: Identifier of the resource; used to build the resource
            filter and echoed back in the summary.
        resource_type: Asset type used to select metrics and the filter label.
        days: Length of the look-back window, ending now (UTC).

    Returns:
        A ``MetricSummary`` keyed by metric type. A metric maps to ``None``
        when its query failed or returned no points. ``has_data`` is True when
        at least one metric produced points.
    """
    metric_defs = _METRICS.get(resource_type)
    if not metric_defs:
        metric_defs = _discover_metrics(project_id, resource_id, resource_type)
    if not metric_defs:
        return MetricSummary(
            resource_id=resource_id,
            resource_type=resource_type,
            period_days=days,
            metrics={},
            has_data=False,
        )

    client = monitoring_v3.MetricServiceClient()
    project_name = f"projects/{project_id}"

    end_time = datetime.now(tz=timezone.utc)
    start_time = end_time - timedelta(days=days)

    interval = monitoring_v3.TimeInterval(
        start_time=_to_proto_timestamp(start_time),
        end_time=_to_proto_timestamp(end_time),
    )

    resource_filter = _resource_filter(resource_id, resource_type)
    metrics: dict[str, Any] = {}
    has_data = False

    for metric_type, stat in metric_defs:
        filter_str = f'metric.type="{metric_type}"'
        if resource_filter:
            filter_str += f" AND {resource_filter}"

        request = monitoring_v3.ListTimeSeriesRequest(
            name=project_name,
            filter=filter_str,
            interval=interval,
            # The aggregation must match the stat: summing daily *means*
            # does not yield a total for count/throughput metrics.
            aggregation=_aggregation_for(stat),
        )

        try:
            series = list(
                retry_on_transient(client.list_time_series, request=request, timeout=60)
            )
        except GoogleAPICallError as exc:
            # Best effort: one failing metric must not abort the others.
            logger.warning(
                "cloud_monitoring_failed",
                resource_id=resource_id,
                metric=metric_type,
                error=str(exc),
            )
            metrics[metric_type] = None
            continue

        # Aligned points carry either a double or an int64 value; whichever
        # is non-zero is taken (both zero yields 0).
        values: list[float] = [
            point.value.double_value or point.value.int64_value
            for ts in series
            for point in ts.points
        ]

        if not values:
            metrics[metric_type] = None
            continue

        has_data = True
        total = sum(values)
        metrics[metric_type] = round(
            total / len(values) if stat == "mean" else total, 4
        )

    return MetricSummary(
        resource_id=resource_id,
        resource_type=resource_type,
        period_days=days,
        metrics=metrics,
        has_data=has_data,
    )


def _aggregation_for(stat: str) -> monitoring_v3.Aggregation:
    """Return the daily aggregation matching *stat* ("sum" totals, else means)."""
    if stat == "sum":
        aligner = monitoring_v3.Aggregation.Aligner.ALIGN_SUM
        reducer = monitoring_v3.Aggregation.Reducer.REDUCE_SUM
    else:
        aligner = monitoring_v3.Aggregation.Aligner.ALIGN_MEAN
        reducer = monitoring_v3.Aggregation.Reducer.REDUCE_MEAN
    return monitoring_v3.Aggregation(
        alignment_period={"seconds": _PERIOD_SECONDS},
        cross_series_reducer=reducer,
        per_series_aligner=aligner,
    )
188
+
189
+
190
+ def _discover_metrics(
191
+ project_id: str,
192
+ resource_id: str,
193
+ resource_type: str,
194
+ ) -> list[tuple[str, str]]:
195
+ """Auto-discover available metrics for unknown resource types."""
196
+ client = monitoring_v3.MetricServiceClient()
197
+ project_name = f"projects/{project_id}"
198
+
199
+ # Derive a Cloud Monitoring monitored resource type prefix from the asset type.
200
+ # e.g. "compute.googleapis.com/SomeNewThing" -> filter on "compute.googleapis.com"
201
+ service_prefix = (
202
+ resource_type.split("/")[0] if "/" in resource_type else resource_type
203
+ )
204
+
205
+ request = monitoring_v3.ListMetricDescriptorsRequest(
206
+ name=project_name,
207
+ filter=f'metric.type=starts_with("{service_prefix}")',
208
+ )
209
+
210
+ discovered: list[tuple[str, str]] = []
211
+ try:
212
+ for descriptor in client.list_metric_descriptors(request=request, timeout=60):
213
+ metric_type: str = descriptor.type
214
+ stat = (
215
+ "sum"
216
+ if any(
217
+ kw in metric_type.lower()
218
+ for kw in ("count", "bytes", "requests", "operations")
219
+ )
220
+ else "mean"
221
+ )
222
+ discovered.append((metric_type, stat))
223
+ if len(discovered) >= _FALLBACK_METRIC_LIMIT:
224
+ break
225
+ except GoogleAPICallError as exc:
226
+ logger.warning(
227
+ "cloud_monitoring_list_metrics_failed",
228
+ extra={"resource_id": resource_id, "error": str(exc)},
229
+ )
230
+
231
+ return discovered
232
+
233
+
234
+ def _resource_filter(resource_id: str, resource_type: str) -> str:
235
+ """
236
+ Build a Cloud Monitoring filter string to scope metrics to a specific resource.
237
+ Resource IDs are full asset names, e.g.:
238
+ //compute.googleapis.com/projects/p/zones/z/instances/name
239
+ """
240
+ # Extract the short resource name from the full asset name.
241
+ name = resource_id.rstrip("/").split("/")[-1]
242
+
243
+ match resource_type:
244
+ case "compute.googleapis.com/Instance":
245
+ return f'resource.labels.instance_id="{name}"'
246
+ case "sql.googleapis.com/Instance":
247
+ return f'resource.labels.database_id="{name}"'
248
+ case "container.googleapis.com/Cluster":
249
+ return f'resource.labels.cluster_name="{name}"'
250
+ case "run.googleapis.com/Service":
251
+ return f'resource.labels.service_name="{name}"'
252
+ case "cloudfunctions.googleapis.com/Function":
253
+ return f'resource.labels.function_name="{name}"'
254
+ case "storage.googleapis.com/Bucket":
255
+ return f'resource.labels.bucket_name="{name}"'
256
+ case "pubsub.googleapis.com/Topic":
257
+ return f'resource.labels.topic_id="{name}"'
258
+ case "pubsub.googleapis.com/Subscription":
259
+ return f'resource.labels.subscription_id="{name}"'
260
+ case "redis.googleapis.com/Instance":
261
+ return f'resource.labels.instance_id="{name}"'
262
+ case "spanner.googleapis.com/Instance":
263
+ return f'resource.labels.instance_id="{name}"'
264
+ case "dataflow.googleapis.com/Job":
265
+ return f'resource.labels.job_id="{name}"'
266
+ case "dataproc.googleapis.com/Cluster":
267
+ return f'resource.labels.cluster_name="{name}"'
268
+ case "aiplatform.googleapis.com/Endpoint":
269
+ return f'resource.labels.endpoint_id="{name}"'
270
+ return ""
271
+
272
+
273
def _to_proto_timestamp(dt: datetime) -> Timestamp:
    """Convert *dt* into a protobuf ``Timestamp`` message."""
    proto = Timestamp()
    proto.FromDatetime(dt)
    return proto
adapters/gcp/retry.py ADDED
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import time
5
+ from collections.abc import Callable
6
+ from typing import Any, TypeVar
7
+
8
+ import structlog
9
+ from google.api_core.exceptions import (
10
+ ResourceExhausted,
11
+ ServiceUnavailable,
12
+ )
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
T = TypeVar("T")

# Total number of attempts (the first call included) and the initial backoff
# in seconds; the backoff doubles after every failed attempt.
_MAX_RETRIES = 3
_BASE_DELAY = 1.0


def retry_on_transient(
    fn: Callable[..., T],
    *args: Any,
    **kwargs: Any,
) -> T:
    """
    Call ``fn(*args, **kwargs)``, retrying on transient GCP errors.

    ``ResourceExhausted`` and ``ServiceUnavailable`` trigger a retry after an
    exponentially growing delay with up to 50% random jitter. Any other
    exception propagates immediately, and the transient error itself is
    re-raised once ``_MAX_RETRIES`` attempts have been made.

    Returns:
        Whatever ``fn`` returns on its first successful call.
    """
    backoff = _BASE_DELAY
    final_attempt = _MAX_RETRIES - 1
    for attempt in range(_MAX_RETRIES):
        try:
            return fn(*args, **kwargs)
        except (ResourceExhausted, ServiceUnavailable) as exc:
            if attempt >= final_attempt:
                # Out of attempts: surface the original error to the caller.
                raise
            pause = backoff + random.uniform(0, backoff * 0.5)  # noqa: S311
            logger.warning(
                "gcp_transient_error_retrying",
                error_type=type(exc).__name__,
                attempt=attempt + 1,
                max_retries=_MAX_RETRIES,
                retry_in=round(pause, 1),
            )
            time.sleep(pause)
            backoff *= 2
    raise RuntimeError("Unreachable")  # pragma: no cover
ai/__init__.py ADDED
File without changes
ai/anthropic.py ADDED
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import time
5
+ from typing import Any
6
+
7
+ import anthropic as anthropic_sdk
8
+ import structlog
9
+
10
+ from ai.base import AIProvider, AIResponse, Message, Tool, ToolCall
11
+
12
+ logger = structlog.get_logger(__name__)
13
+
14
# Total number of attempts (the first call included) and the initial backoff
# in seconds; the backoff doubles after every failed attempt.
_MAX_RETRIES = 3
_BASE_DELAY = 1.0


class AnthropicProvider(AIProvider):
    """
    AI provider backed by the Anthropic direct API.
    Works on any cloud — no AWS/GCP/Azure auth needed.
    Best for local development and contributors without cloud AI access.
    Configure via ANTHROPIC_API_KEY and optionally ANTHROPIC_MODEL env vars.
    """

    DEFAULT_MODEL = "claude-sonnet-4-6"
    DEFAULT_MAX_TOKENS = 4096
    DEFAULT_TEMPERATURE = 0.0

    def __init__(
        self,
        api_key: str | None = None,
        model: str | None = None,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float | None = None,
    ) -> None:
        """
        Create the SDK client and resolve model settings.

        Explicit arguments win; anything left as ``None`` is taken from the
        ``ai`` section of the application settings.

        Raises:
            EnvironmentError: If no API key is passed and none is configured.
        """
        # Imported here rather than at module level, as in the original.
        from core.config import get_settings

        cfg = get_settings().ai
        resolved_key = api_key or cfg.anthropic_api_key
        if not resolved_key:
            raise EnvironmentError(
                "ANTHROPIC_API_KEY is not set. "
                "Export it or pass api_key= explicitly."
            )
        self._client = anthropic_sdk.Anthropic(api_key=resolved_key, timeout=60.0)
        self._model = model or cfg.resolved_model("anthropic")
        self._max_tokens = max_tokens
        self._temperature = temperature if temperature is not None else cfg.temperature

    def chat(
        self,
        messages: list[Message],
        tools: list[Tool],
        system_prompt: str | None = None,
    ) -> AIResponse:
        """
        Send one conversation turn to the Messages API.

        Args:
            messages: Conversation history in provider-neutral form.
            tools: Tools the model may call.
            system_prompt: Optional system prompt, sent with a cache marker.

        Returns:
            The parsed response: text, tool calls, stop reason, token usage.
        """
        kwargs: dict[str, Any] = {
            "model": self._model,
            "max_tokens": self._max_tokens,
            "temperature": self._temperature,
            "messages": [self._to_anthropic_message(m) for m in messages],
            "tools": [self._to_anthropic_tool(t) for t in tools],
        }
        if system_prompt:
            # cache_control pins the system prompt in Anthropic's prompt cache.
            # After the first iteration it's served from cache — no reprocessing charge.
            # Requires claude-3-5-* or claude-sonnet-4-* models.
            kwargs["system"] = [
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"},
                }
            ]

        response = self._call_with_retry(kwargs)
        return self._parse_response(response)

    # ------------------------------------------------------------------
    # Retry logic
    # ------------------------------------------------------------------

    def _call_with_retry(self, kwargs: dict[str, Any]) -> Any:
        """
        Call ``messages.create``, retrying rate-limit and server errors.

        Uses exponential backoff with up to 50% random jitter. The last
        error is re-raised once ``_MAX_RETRIES`` attempts have been made;
        other exception types propagate immediately.
        """
        delay = _BASE_DELAY
        for attempt in range(_MAX_RETRIES):
            try:
                return self._client.messages.create(**kwargs)
            except (
                anthropic_sdk.RateLimitError,
                anthropic_sdk.InternalServerError,
            ) as exc:
                if attempt < _MAX_RETRIES - 1:
                    jitter = random.uniform(0, delay * 0.5)  # noqa: S311
                    sleep_time = delay + jitter
                    logger.warning(
                        "anthropic_api_retrying",
                        error_type=type(exc).__name__,
                        attempt=attempt + 1,
                        max_retries=_MAX_RETRIES,
                        retry_in=round(sleep_time, 1),
                    )
                    time.sleep(sleep_time)
                    delay *= 2
                else:
                    raise
        raise RuntimeError("Unreachable")  # pragma: no cover

    # ------------------------------------------------------------------
    # Internal conversion helpers
    # ------------------------------------------------------------------

    def _to_anthropic_message(self, msg: Message) -> dict[str, Any]:
        """Convert a provider-neutral message into the Messages API shape."""
        if msg.role == "user":
            if msg.tool_results:
                # Tool results go back to the model as a user turn.
                return {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": tr.tool_call_id,
                            "content": tr.content,
                            **({"is_error": True} if tr.is_error else {}),
                        }
                        for tr in msg.tool_results
                    ],
                }
            return {"role": "user", "content": msg.text or ""}

        # assistant
        content: list[dict[str, Any]] = []
        if msg.text:
            content.append({"type": "text", "text": msg.text})
        for tc in msg.tool_calls:
            content.append(
                {
                    "type": "tool_use",
                    "id": tc.id,
                    "name": tc.name,
                    "input": tc.arguments,
                }
            )
        return {"role": "assistant", "content": content}

    def _to_anthropic_tool(self, tool: Tool) -> dict[str, Any]:
        """Convert a provider-neutral tool definition into the API shape."""
        return {
            "name": tool.name,
            "description": tool.description,
            "input_schema": tool.input_schema,
        }

    def _parse_response(self, response: Any) -> AIResponse:
        """
        Convert an SDK response into an ``AIResponse``.

        Tool-use blocks become ``ToolCall`` objects. All text blocks are
        concatenated in order, so no text is lost when the response contains
        more than one; ``text`` is ``None`` when there is no text block.
        Token counts default to 0 when the response carries no usage data.
        """
        tool_calls: list[ToolCall] = []
        text_parts: list[str] = []

        for block in response.content:
            if block.type == "tool_use":
                tool_calls.append(
                    ToolCall(
                        id=block.id,
                        name=block.name,
                        arguments=dict(block.input),
                    )
                )
            elif block.type == "text":
                text_parts.append(block.text)

        text: str | None = "".join(text_parts) if text_parts else None

        usage = getattr(response, "usage", None)
        return AIResponse(
            stop_reason=response.stop_reason,
            text=text,
            tool_calls=tool_calls,
            input_tokens=getattr(usage, "input_tokens", 0) if usage else 0,
            output_tokens=getattr(usage, "output_tokens", 0) if usage else 0,
        )