nuvu-scan 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -108,6 +108,11 @@ from ..formatters.json import JSONFormatter
108
108
  default="https://nuvu.dev",
109
109
  help="Nuvu Cloud API URL (default: https://nuvu.dev)",
110
110
  )
111
+ @click.option(
112
+ "--list-collectors",
113
+ is_flag=True,
114
+ help="List available collectors for the specified provider and exit.",
115
+ )
111
116
  def scan_command(
112
117
  provider: str,
113
118
  output_format: str,
@@ -344,6 +349,7 @@ def scan_command(
344
349
  "size_bytes": asset.size_bytes,
345
350
  "tags": asset.tags,
346
351
  "cost_estimate_usd": asset.cost_estimate_usd,
352
+ "usage_metrics": asset.usage_metrics, # Include all usage metrics
347
353
  "risk_flags": asset.risk_flags,
348
354
  "ownership_confidence": asset.ownership_confidence or "unknown",
349
355
  "suggested_owner": asset.suggested_owner,
@@ -353,7 +359,8 @@ def scan_command(
353
359
  }
354
360
 
355
361
  # Push to API using the /api/scans/import endpoint
356
- with httpx.Client(timeout=60) as client:
362
+ # Use longer timeout for large scans (2000+ assets can take minutes)
363
+ with httpx.Client(timeout=300) as client:
357
364
  response = client.post(
358
365
  f"{api_url.rstrip('/')}/api/scans/import",
359
366
  json=payload,
@@ -10,9 +10,8 @@ class HTMLFormatter:
10
10
 
11
11
  def format(self, result: ScanResult) -> str:
12
12
  """Format scan result as HTML."""
13
- # Build summary cards (use actual cost if available)
13
+ # Build summary cards (use actual cost from Cost Explorer if available)
14
14
  actual_total = result.summary.get("total_actual_cost_30d")
15
- estimated_assets_total = result.summary.get("estimated_assets_cost_total")
16
15
 
17
16
  # Calculate cost saving opportunities
18
17
  savings_opportunities = self._calculate_savings(result.assets)
@@ -29,17 +28,16 @@ class HTMLFormatter:
29
28
  <div class="summary-card">
30
29
  <h3>Actual 30-Day Cost</h3>
31
30
  <div class="value">${actual_total:,.2f}</div>
32
- </div>
33
- <div class="summary-card">
34
- <h3>Estimated Asset Cost</h3>
35
- <div class="value">${(estimated_assets_total or 0):,.2f}</div>
31
+ <div class="card-note">From AWS Cost Explorer</div>
36
32
  </div>
37
33
  """
38
34
  else:
35
+ # Fallback when Cost Explorer data not available
39
36
  summary_cards += f"""
40
37
  <div class="summary-card">
41
- <h3>Estimated Monthly Cost</h3>
38
+ <h3>Monthly Cost</h3>
42
39
  <div class="value">${result.total_cost_estimate_usd:,.2f}</div>
40
+ <div class="card-note">Add Cost Explorer permissions for accurate data</div>
43
41
  </div>
44
42
  """
45
43
 
@@ -99,6 +97,7 @@ class HTMLFormatter:
99
97
  .summary-card.savings {{ border-left-color: #ff9800; background: #fff8e1; }}
100
98
  .summary-card h3 {{ margin: 0 0 10px 0; color: #666; font-size: 13px; text-transform: uppercase; }}
101
99
  .summary-card .value {{ font-size: 22px; font-weight: bold; color: #333; }}
100
+ .summary-card .card-note {{ font-size: 11px; color: #888; margin-top: 5px; }}
102
101
  table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
103
102
  table.compact {{ font-size: 13px; }}
104
103
  table.compact th, table.compact td {{ padding: 8px; }}
@@ -133,6 +132,12 @@ class HTMLFormatter:
133
132
  <p><strong>Account ID:</strong> {result.account_id}</p>
134
133
  <p><strong>Scan Time:</strong> {result.scan_timestamp}</p>
135
134
 
135
+ <h2>📋 Scan Scope</h2>
136
+ <div class="insight-box info">
137
+ <p><strong>Collectors:</strong> {", ".join(result.scanned_collectors) if result.scanned_collectors else "All (Full Scan)"}</p>
138
+ <p><strong>Regions:</strong> {", ".join(result.scanned_regions[:10]) if result.scanned_regions else "All enabled regions"}{" (+ " + str(len(result.scanned_regions) - 10) + " more)" if len(result.scanned_regions) > 10 else ""}</p>
139
+ </div>
140
+
136
141
  <h2>Executive Summary</h2>
137
142
  <div class="summary">
138
143
  {summary_cards}
@@ -157,7 +162,27 @@ class HTMLFormatter:
157
162
  html += f" <tr><td>{category.replace('_', ' ').title()}</td><td>{count}</td></tr>\n"
158
163
 
159
164
  # All Assets - COLLAPSIBLE
160
- asset_count = len(result.assets)
165
+ # Filter out:
166
+ # - Expired/retired reserved nodes (historical clutter)
167
+ # - Cost summary (it's a summary row, not an asset)
168
+ # They're still counted in the governance summary for context
169
+ display_assets = [
170
+ a
171
+ for a in result.assets
172
+ if not (
173
+ # Exclude expired/retired reserved nodes
174
+ (
175
+ a.asset_type == "redshift_reserved_node"
176
+ and any(
177
+ flag in (a.risk_flags or [])
178
+ for flag in ["reservation_expired", "reservation_retired"]
179
+ )
180
+ )
181
+ # Exclude cost summary pseudo-asset
182
+ or a.asset_type == "cost_summary"
183
+ )
184
+ ]
185
+ asset_count = len(display_assets)
161
186
  html += f""" </table>
162
187
 
163
188
  <button class="collapsible">All Assets <span class="asset-count">({asset_count} items)</span></button>
@@ -175,11 +200,12 @@ class HTMLFormatter:
175
200
  """
176
201
 
177
202
  # Sort assets by cost (descending)
178
- sorted_assets = sorted(result.assets, key=lambda x: x.cost_estimate_usd or 0, reverse=True)
203
+ sorted_assets = sorted(display_assets, key=lambda x: x.cost_estimate_usd or 0, reverse=True)
179
204
 
180
205
  for asset in sorted_assets:
181
206
  owner_class = ""
182
- if asset.ownership_confidence == "unknown":
207
+ # Only show no-owner class if we have no suggested owner at all
208
+ if not asset.suggested_owner and asset.ownership_confidence == "unknown":
183
209
  owner_class = "no-owner"
184
210
 
185
211
  risk_flags_html = ""
@@ -213,8 +239,11 @@ class HTMLFormatter:
213
239
  </script>
214
240
 
215
241
  <div class="footer">
216
- <p>Generated by Nuvu - AWS Data Asset Control</p>
217
- <p>Visit <a href="https://nuvu.dev">https://nuvu.dev</a> for continuous monitoring</p>
242
+ <p><strong>nuvu-scan</strong> The Open Source Cloud Data Scanner</p>
243
+ <p><a href="https://github.com/nuvudev/nuvu-scan" target="_blank">github.com/nuvudev/nuvu-scan</a></p>
244
+ <p style="margin-top: 12px; font-size: 11px; color: #888;">
245
+ Add the governance layer: <a href="https://nuvu.dev" style="color: #666;">Nuvu Cloud</a> — historical tracking • team dashboards • scheduled scans • Slack/email alerts
246
+ </p>
218
247
  </div>
219
248
  </div>
220
249
  </body>
@@ -303,24 +332,61 @@ class HTMLFormatter:
303
332
  </div>
304
333
  """
305
334
 
306
- # Reserved nodes analysis
335
+ # Reserved nodes analysis - compare with provisioned clusters
307
336
  if reserved_nodes:
308
337
  active_reservations = [
309
338
  a for a in reserved_nodes if (a.usage_metrics or {}).get("state") == "active"
310
339
  ]
311
- expired = [a for a in reserved_nodes if "reservation_expired" in (a.risk_flags or [])]
340
+
341
+ # Count total nodes covered by active reservations
342
+ active_reserved_nodes = sum(
343
+ (a.usage_metrics or {}).get("node_count", 0) for a in active_reservations
344
+ )
345
+
346
+ # Count total provisioned cluster nodes
347
+ clusters = [a for a in assets if a.asset_type == "redshift_cluster"]
348
+ total_provisioned_nodes = sum(
349
+ (a.usage_metrics or {}).get("node_count", 0) for a in clusters
350
+ )
351
+
352
+ # Calculate uncovered nodes (potential savings opportunity)
353
+ uncovered_nodes = max(0, total_provisioned_nodes - active_reserved_nodes)
354
+
355
+ # Determine if this is a savings opportunity
356
+ is_savings_opportunity = uncovered_nodes > 0
357
+ box_class = "warning" if is_savings_opportunity else "info"
312
358
 
313
359
  html += f"""
314
- <div class="insight-box info">
315
- <h3>🎫 Reserved Nodes ({len(reserved_nodes)} total)</h3>
360
+ <div class="insight-box {box_class}">
361
+ <h3>🎫 Reserved vs On-Demand Nodes</h3>
316
362
  <ul>
317
- <li><strong>Active Reservations:</strong> {len(active_reservations)}</li>
318
- <li><strong>Expired/Retired:</strong> {len(expired)}</li>
319
- <li><strong>Expiring Soon:</strong> {len(expiring_reservations)}</li>
363
+ <li><strong>Provisioned Cluster Nodes:</strong> {total_provisioned_nodes}</li>
364
+ <li><strong>Active Reserved Nodes:</strong> {active_reserved_nodes} ({len(active_reservations)} reservations)</li>
365
+ <li><strong>Uncovered (On-Demand) Nodes:</strong> {uncovered_nodes}</li>
320
366
  </ul>
321
- </div>
322
367
  """
323
368
 
369
+ if is_savings_opportunity:
370
+ # Reserved pricing typically saves 30-40% vs on-demand
371
+ html += f"""
372
+ <p class="recommendation">💰 <strong>Potential Savings:</strong> {uncovered_nodes} nodes running on-demand pricing. Reserved nodes typically offer 30-40% discount.</p>
373
+ """
374
+ else:
375
+ html += """
376
+ <p class="recommendation">✅ All provisioned nodes are covered by reservations.</p>
377
+ """
378
+
379
+ # Show expiring reservations if any
380
+ if expiring_reservations:
381
+ expiring_nodes = sum(
382
+ (a.usage_metrics or {}).get("node_count", 0) for a in expiring_reservations
383
+ )
384
+ html += f"""
385
+ <p class="recommendation">⚠️ <strong>{len(expiring_reservations)} reservations ({expiring_nodes} nodes) expiring soon.</strong> Plan for renewal to maintain coverage.</p>
386
+ """
387
+
388
+ html += "</div>"
389
+
324
390
  return html
325
391
 
326
392
  def _build_governance_section(self, assets) -> str:
@@ -418,4 +484,59 @@ class HTMLFormatter:
418
484
  html += f"<li><strong>{cluster.name}</strong>: {queues} queues, Auto WLM: {auto_wlm} ({flags})</li>"
419
485
  html += "</ul></div>"
420
486
 
487
+ # Add cluster performance section
488
+ clusters_with_metrics = [
489
+ a
490
+ for a in clusters
491
+ if (a.usage_metrics or {}).get("cpu_utilization_max_24h") is not None
492
+ ]
493
+ if clusters_with_metrics:
494
+ html += """
495
+ <div class="insight-box info">
496
+ <h3>📊 Cluster Performance (Last 24h)</h3>
497
+ <table class="compact">
498
+ <tr><th>Cluster</th><th>CPU Max</th><th>CPU Avg</th><th>Queries</th><th>Disk Used</th><th>Recommendation</th></tr>
499
+ """
500
+ for cluster in clusters_with_metrics[:10]:
501
+ metrics = cluster.usage_metrics or {}
502
+ cpu_max = metrics.get("cpu_utilization_max_24h", 0)
503
+ cpu_avg = metrics.get("cpu_utilization_avg_24h", 0)
504
+ queries = metrics.get("queries_completed_24h", 0)
505
+ disk = metrics.get("disk_space_used_percent", 0)
506
+ rec = metrics.get("performance_recommendation", "-")
507
+ html += (
508
+ f"<tr><td>{cluster.name}</td><td>{cpu_max:.1f}%</td>"
509
+ f"<td>{cpu_avg:.1f}%</td><td>{queries}</td>"
510
+ f"<td>{disk:.1f}%</td><td>{rec if rec else '-'}</td></tr>"
511
+ )
512
+ html += "</table></div>"
513
+
514
+ # Add serverless workgroup performance section
515
+ serverless_wgs = [a for a in assets if a.asset_type == "redshift_serverless_workgroup"]
516
+ serverless_with_metrics = [
517
+ a for a in serverless_wgs if (a.usage_metrics or {}).get("rpu_max_7d") is not None
518
+ ]
519
+ if serverless_with_metrics:
520
+ html += """
521
+ <div class="insight-box info">
522
+ <h3>🚀 Serverless Workgroup Utilization</h3>
523
+ <table class="compact">
524
+ <tr><th>Workgroup</th><th>Base RPU</th><th>Max RPU (7d)</th><th>Avg RPU (7d)</th><th>Queries (24h)</th><th>Recommendation</th></tr>
525
+ """
526
+ for wg in serverless_with_metrics[:10]:
527
+ metrics = wg.usage_metrics or {}
528
+ base = metrics.get("base_capacity", 0)
529
+ rpu_max = metrics.get("rpu_max_7d", 0)
530
+ rpu_avg = metrics.get("rpu_avg_7d", 0)
531
+ queries = metrics.get("queries_completed_24h", 0) + metrics.get(
532
+ "queries_failed_24h", 0
533
+ )
534
+ rec = metrics.get("utilization_recommendation", "-")
535
+ html += (
536
+ f"<tr><td>{wg.name}</td><td>{base}</td>"
537
+ f"<td>{rpu_max:.1f}</td><td>{rpu_avg:.1f}</td>"
538
+ f"<td>{queries}</td><td>{rec if rec else '-'}</td></tr>"
539
+ )
540
+ html += "</table></div>"
541
+
421
542
  return html
nuvu_scan/core/base.py CHANGED
@@ -88,10 +88,17 @@ class ScanResult:
88
88
  assets: list[Asset]
89
89
  total_cost_estimate_usd: float
90
90
  summary: dict[str, Any] = None
91
+ # Scan scope metadata
92
+ scanned_regions: list[str] = None
93
+ scanned_collectors: list[str] = None
91
94
 
92
95
  def __post_init__(self):
93
96
  if self.summary is None:
94
97
  self.summary = {}
98
+ if self.scanned_regions is None:
99
+ self.scanned_regions = []
100
+ if self.scanned_collectors is None:
101
+ self.scanned_collectors = []
95
102
 
96
103
 
97
104
  class CloudProviderScan(ABC):
@@ -173,6 +180,12 @@ class CloudProviderScan(ABC):
173
180
  # Build summary
174
181
  summary = self._build_summary(assets)
175
182
 
183
+ # Get scanned regions from assets
184
+ scanned_regions = sorted(set(asset.region for asset in assets if asset.region))
185
+
186
+ # Get scanned collectors from config
187
+ scanned_collectors = self.config.collectors if self.config.collectors else []
188
+
176
189
  return ScanResult(
177
190
  provider=self.provider,
178
191
  account_id=self.config.account_id or "unknown",
@@ -180,6 +193,8 @@ class CloudProviderScan(ABC):
180
193
  assets=assets,
181
194
  total_cost_estimate_usd=total_cost,
182
195
  summary=summary,
196
+ scanned_regions=scanned_regions,
197
+ scanned_collectors=scanned_collectors,
183
198
  )
184
199
 
185
200
  def _build_summary(self, assets: list[Asset]) -> dict[str, Any]:
@@ -219,6 +234,21 @@ class CloudProviderScan(ABC):
219
234
  if asset.risk_flags:
220
235
  risky_count += 1
221
236
 
237
+ # Find cost summary asset if present
238
+ actual_costs_30d = {}
239
+ total_actual_cost_30d = None
240
+ for asset in assets:
241
+ if asset.asset_type == "cost_summary":
242
+ usage = asset.usage_metrics or {}
243
+ actual_costs_30d = usage.get("actual_costs_30d", {})
244
+ total_actual_cost_30d = usage.get("total_actual_cost_30d")
245
+ break
246
+
247
+ # Calculate estimated asset costs (excluding cost_summary)
248
+ estimated_assets_total = sum(
249
+ asset.cost_estimate_usd or 0 for asset in assets if asset.asset_type != "cost_summary"
250
+ )
251
+
222
252
  return {
223
253
  "total_assets": total_assets,
224
254
  "assets_by_category": assets_by_category,
@@ -226,4 +256,8 @@ class CloudProviderScan(ABC):
226
256
  "unused_count": unused_count,
227
257
  "no_owner_count": no_owner_count,
228
258
  "risky_count": risky_count,
259
+ # Cost data
260
+ "actual_costs_30d": actual_costs_30d,
261
+ "total_actual_cost_30d": total_actual_cost_30d,
262
+ "estimated_assets_cost_total": estimated_assets_total,
229
263
  }
@@ -236,17 +236,55 @@ class AWSScanner(CloudProviderScan):
236
236
  continue
237
237
 
238
238
  # Add a summary asset with actual costs from Cost Explorer
239
+ # Only include costs for services related to the scanned collectors
240
+ print("Fetching cost data from AWS Cost Explorer...", file=sys.stderr)
239
241
  try:
240
242
  from datetime import datetime, timedelta
241
243
 
242
244
  end_date = datetime.utcnow()
243
245
  start_date = end_date - timedelta(days=30)
244
246
  service_costs = self.cost_explorer.get_service_costs(start_date, end_date)
247
+ print(" → Cost data retrieved", file=sys.stderr)
245
248
 
246
249
  if service_costs:
247
- total_actual_cost = sum(service_costs.values())
250
+ # Map collectors to AWS service names in Cost Explorer
251
+ collector_to_services = {
252
+ "s3": ["Amazon Simple Storage Service"],
253
+ "glue": ["AWS Glue"],
254
+ "athena": ["Amazon Athena"],
255
+ "redshift": ["Amazon Redshift"],
256
+ "iam": [], # IAM is free
257
+ "mwaa": ["Amazon Managed Workflows for Apache Airflow"],
258
+ }
259
+
260
+ # Filter costs based on active collectors
261
+ active_collector_names = (
262
+ [name.lower() for name in self.config.collectors]
263
+ if self.config.collectors
264
+ else list(collector_to_services.keys())
265
+ )
266
+
267
+ # Build list of relevant AWS service names
268
+ relevant_services = set()
269
+ for collector_name in active_collector_names:
270
+ services = collector_to_services.get(collector_name, [])
271
+ relevant_services.update(services)
272
+
273
+ # Filter service_costs to only include relevant services
274
+ if self.config.collectors: # Only filter if specific collectors requested
275
+ filtered_costs = {
276
+ svc: cost for svc, cost in service_costs.items() if svc in relevant_services
277
+ }
278
+ total_actual_cost = sum(filtered_costs.values())
279
+ display_costs = filtered_costs
280
+ scope_note = f"Filtered to collectors: {', '.join(self.config.collectors)}"
281
+ else:
282
+ # Full scan - show all costs
283
+ total_actual_cost = sum(service_costs.values())
284
+ display_costs = service_costs
285
+ scope_note = "Full scan - all services"
286
+
248
287
  # Use the actual 30-day cost as monthly estimate
249
- # This represents the actual spend, not an extrapolation
250
288
  monthly_estimate = total_actual_cost
251
289
 
252
290
  # Create a summary asset
@@ -257,7 +295,7 @@ class AWSScanner(CloudProviderScan):
257
295
  service="Cost Explorer",
258
296
  region="global",
259
297
  arn="arn:aws:ce::cost-summary",
260
- name="AWS Cost Summary (Last 30 Days)",
298
+ name=f"AWS Cost Summary - {scope_note}",
261
299
  created_at=None,
262
300
  last_activity_at=datetime.utcnow().isoformat(),
263
301
  tags={},
@@ -266,10 +304,11 @@ class AWSScanner(CloudProviderScan):
266
304
  ownership_confidence="unknown",
267
305
  suggested_owner=None,
268
306
  usage_metrics={
269
- "actual_costs_30d": service_costs,
307
+ "actual_costs_30d": display_costs,
270
308
  "total_actual_cost_30d": total_actual_cost,
271
309
  "estimated_monthly_cost": monthly_estimate,
272
- "note": "Actual costs from AWS Cost Explorer API for the last 30 days. This represents real spend, not estimates. Note: Some costs shown are for services that are not data assets (e.g., domain registration, email services, DNS). Individual asset costs below may be estimates based on resource usage.",
310
+ "scope": scope_note,
311
+ "note": "Actual costs from AWS Cost Explorer API for the last 30 days.",
273
312
  },
274
313
  )
275
314
  all_assets.append(cost_summary_asset)
@@ -300,39 +339,16 @@ class AWSScanner(CloudProviderScan):
300
339
  def get_cost_estimate(self, asset: Asset) -> float:
301
340
  """Estimate monthly cost for an AWS asset.
302
341
 
303
- First tries to get actual cost from Cost Explorer API.
304
- Falls back to collector-based estimates if Cost Explorer data is not available.
342
+ Uses collector-based estimates for individual assets.
343
+ Service-level actual costs from Cost Explorer are already included
344
+ in the cost_summary asset and used for reporting.
305
345
  """
306
- # First, try to get actual cost from Cost Explorer API
307
- try:
308
- # Map service names to Cost Explorer service names
309
- service_mapping = {
310
- "S3": "Amazon Simple Storage Service",
311
- "Athena": "Amazon Athena",
312
- "Glue": "AWS Glue",
313
- "Redshift": "Amazon Redshift",
314
- "MWAA": "Amazon Managed Workflows for Apache Airflow",
315
- }
316
-
317
- cost_explorer_service = service_mapping.get(asset.service)
318
- if cost_explorer_service:
319
- # Get service-level cost from Cost Explorer (last 30 days actual cost)
320
- service_cost = self.cost_explorer.get_monthly_cost_for_service(
321
- cost_explorer_service
322
- )
323
- if service_cost > 0:
324
- # We have actual service-level cost from Cost Explorer
325
- # For now, we'll still use collector estimates for individual assets
326
- # because Cost Explorer doesn't provide per-resource costs without tags
327
- # But we could potentially distribute service cost across assets proportionally
328
- # For now, prefer collector estimates which are more accurate per-resource
329
- pass # Continue to collector-based estimation
330
-
331
- except Exception:
332
- # If Cost Explorer fails, fall back to collector-based estimation
333
- pass
346
+ # Use the cost already set by the collector during collection
347
+ # This avoids making Cost Explorer API calls for each asset
348
+ if asset.cost_estimate_usd is not None and asset.cost_estimate_usd > 0:
349
+ return asset.cost_estimate_usd
334
350
 
335
- # Delegate to appropriate collector based on service for detailed estimation
351
+ # Delegate to appropriate collector based on service for estimation
336
352
  for collector in self.collectors:
337
353
  if hasattr(collector, "get_cost_estimate"):
338
354
  try: