PyPI - nuvu-scan - Versions diffs - 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl - Mend

nuvu-scan 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

nuvu_scan/cli/commands/scan.py +8 -1
nuvu_scan/cli/formatters/html.py +141 -20
nuvu_scan/core/base.py +34 -0
nuvu_scan/core/providers/aws/aws_scanner.py +52 -36
nuvu_scan/core/providers/aws/collectors/athena.py +102 -67
nuvu_scan/core/providers/aws/collectors/glue.py +104 -34
nuvu_scan/core/providers/aws/collectors/mwaa.py +10 -5
nuvu_scan/core/providers/aws/collectors/redshift.py +381 -18
{nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/METADATA +41 -30
{nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/RECORD +12 -12
{nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/WHEEL +0 -0
{nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/entry_points.txt +0 -0

nuvu_scan/cli/commands/scan.py CHANGED Viewed

@@ -108,6 +108,11 @@ from ..formatters.json import JSONFormatter
     default="https://nuvu.dev",
     help="Nuvu Cloud API URL (default: https://nuvu.dev)",
 )
+@click.option(
+    "--list-collectors",
+    is_flag=True,
+    help="List available collectors for the specified provider and exit.",
+)
 def scan_command(
     provider: str,
     output_format: str,
@@ -344,6 +349,7 @@ def scan_command(
                         "size_bytes": asset.size_bytes,
                         "tags": asset.tags,
                         "cost_estimate_usd": asset.cost_estimate_usd,
+                        "usage_metrics": asset.usage_metrics,  # Include all usage metrics
                         "risk_flags": asset.risk_flags,
                         "ownership_confidence": asset.ownership_confidence or "unknown",
                         "suggested_owner": asset.suggested_owner,
@@ -353,7 +359,8 @@ def scan_command(
             }
             # Push to API using the /api/scans/import endpoint
-            with httpx.Client(timeout=60) as client:
+            # Use longer timeout for large scans (2000+ assets can take minutes)
+            with httpx.Client(timeout=300) as client:
                 response = client.post(
                     f"{api_url.rstrip('/')}/api/scans/import",
                     json=payload,

nuvu_scan/cli/formatters/html.py CHANGED Viewed

@@ -10,9 +10,8 @@ class HTMLFormatter:
     def format(self, result: ScanResult) -> str:
         """Format scan result as HTML."""
-        # Build summary cards (use actual cost if available)
+        # Build summary cards (use actual cost from Cost Explorer if available)
         actual_total = result.summary.get("total_actual_cost_30d")
-        estimated_assets_total = result.summary.get("estimated_assets_cost_total")
         # Calculate cost saving opportunities
         savings_opportunities = self._calculate_savings(result.assets)
@@ -29,17 +28,16 @@ class HTMLFormatter:
             <div class="summary-card">
                 <h3>Actual 30-Day Cost</h3>
                 <div class="value">${actual_total:,.2f}</div>
-            </div>
-            <div class="summary-card">
-                <h3>Estimated Asset Cost</h3>
-                <div class="value">${(estimated_assets_total or 0):,.2f}</div>
+                <div class="card-note">From AWS Cost Explorer</div>
             </div>
             """
         else:
+            # Fallback when Cost Explorer data not available
             summary_cards += f"""
             <div class="summary-card">
-                <h3>Estimated Monthly Cost</h3>
+                <h3>Monthly Cost</h3>
                 <div class="value">${result.total_cost_estimate_usd:,.2f}</div>
+                <div class="card-note">Add Cost Explorer permissions for accurate data</div>
             </div>
             """
@@ -99,6 +97,7 @@ class HTMLFormatter:
         .summary-card.savings {{ border-left-color: #ff9800; background: #fff8e1; }}
         .summary-card h3 {{ margin: 0 0 10px 0; color: #666; font-size: 13px; text-transform: uppercase; }}
         .summary-card .value {{ font-size: 22px; font-weight: bold; color: #333; }}
+        .summary-card .card-note {{ font-size: 11px; color: #888; margin-top: 5px; }}
         table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
         table.compact {{ font-size: 13px; }}
         table.compact th, table.compact td {{ padding: 8px; }}
@@ -133,6 +132,12 @@ class HTMLFormatter:
         <p><strong>Account ID:</strong> {result.account_id}</p>
         <p><strong>Scan Time:</strong> {result.scan_timestamp}</p>
+        <h2>📋 Scan Scope</h2>
+        <div class="insight-box info">
+            <p><strong>Collectors:</strong> {", ".join(result.scanned_collectors) if result.scanned_collectors else "All (Full Scan)"}</p>
+            <p><strong>Regions:</strong> {", ".join(result.scanned_regions[:10]) if result.scanned_regions else "All enabled regions"}{" (+ " + str(len(result.scanned_regions) - 10) + " more)" if len(result.scanned_regions) > 10 else ""}</p>
+        </div>
         <h2>Executive Summary</h2>
         <div class="summary">
 {summary_cards}
@@ -157,7 +162,27 @@ class HTMLFormatter:
             html += f"            <tr><td>{category.replace('_', ' ').title()}</td><td>{count}</td></tr>\n"
         # All Assets - COLLAPSIBLE
-        asset_count = len(result.assets)
+        # Filter out:
+        # - Expired/retired reserved nodes (historical clutter)
+        # - Cost summary (it's a summary row, not an asset)
+        # They're still counted in the governance summary for context
+        display_assets = [
+            a
+            for a in result.assets
+            if not (
+                # Exclude expired/retired reserved nodes
+                (
+                    a.asset_type == "redshift_reserved_node"
+                    and any(
+                        flag in (a.risk_flags or [])
+                        for flag in ["reservation_expired", "reservation_retired"]
+                    )
+                )
+                # Exclude cost summary pseudo-asset
+                or a.asset_type == "cost_summary"
+            )
+        ]
+        asset_count = len(display_assets)
         html += f"""        </table>
         <button class="collapsible">All Assets <span class="asset-count">({asset_count} items)</span></button>
@@ -175,11 +200,12 @@ class HTMLFormatter:
 """
         # Sort assets by cost (descending)
-        sorted_assets = sorted(result.assets, key=lambda x: x.cost_estimate_usd or 0, reverse=True)
+        sorted_assets = sorted(display_assets, key=lambda x: x.cost_estimate_usd or 0, reverse=True)
         for asset in sorted_assets:
             owner_class = ""
-            if asset.ownership_confidence == "unknown":
+            # Only show no-owner class if we have no suggested owner at all
+            if not asset.suggested_owner and asset.ownership_confidence == "unknown":
                 owner_class = "no-owner"
             risk_flags_html = ""
@@ -213,8 +239,11 @@ class HTMLFormatter:
         </script>
         <div class="footer">
-            <p>Generated by Nuvu - AWS Data Asset Control</p>
-            <p>Visit <a href="https://nuvu.dev">https://nuvu.dev</a> for continuous monitoring</p>
+            <p><strong>nuvu-scan</strong> — The Open Source Cloud Data Scanner</p>
+            <p><a href="https://github.com/nuvudev/nuvu-scan" target="_blank">github.com/nuvudev/nuvu-scan</a></p>
+            <p style="margin-top: 12px; font-size: 11px; color: #888;">
+                Add the governance layer: <a href="https://nuvu.dev" style="color: #666;">Nuvu Cloud</a> — historical tracking • team dashboards • scheduled scans • Slack/email alerts
+            </p>
         </div>
     </div>
 </body>
@@ -303,24 +332,61 @@ class HTMLFormatter:
         </div>
             """
-        # Reserved nodes analysis
+        # Reserved nodes analysis - compare with provisioned clusters
         if reserved_nodes:
             active_reservations = [
                 a for a in reserved_nodes if (a.usage_metrics or {}).get("state") == "active"
             ]
-            expired = [a for a in reserved_nodes if "reservation_expired" in (a.risk_flags or [])]
+            # Count total nodes covered by active reservations
+            active_reserved_nodes = sum(
+                (a.usage_metrics or {}).get("node_count", 0) for a in active_reservations
+            )
+            # Count total provisioned cluster nodes
+            clusters = [a for a in assets if a.asset_type == "redshift_cluster"]
+            total_provisioned_nodes = sum(
+                (a.usage_metrics or {}).get("node_count", 0) for a in clusters
+            )
+            # Calculate uncovered nodes (potential savings opportunity)
+            uncovered_nodes = max(0, total_provisioned_nodes - active_reserved_nodes)
+            # Determine if this is a savings opportunity
+            is_savings_opportunity = uncovered_nodes > 0
+            box_class = "warning" if is_savings_opportunity else "info"
             html += f"""
-        <div class="insight-box info">
-            <h3>🎫 Reserved Nodes ({len(reserved_nodes)} total)</h3>
+        <div class="insight-box {box_class}">
+            <h3>🎫 Reserved vs On-Demand Nodes</h3>
             <ul>
-                <li><strong>Active Reservations:</strong> {len(active_reservations)}</li>
-                <li><strong>Expired/Retired:</strong> {len(expired)}</li>
-                <li><strong>Expiring Soon:</strong> {len(expiring_reservations)}</li>
+                <li><strong>Provisioned Cluster Nodes:</strong> {total_provisioned_nodes}</li>
+                <li><strong>Active Reserved Nodes:</strong> {active_reserved_nodes} ({len(active_reservations)} reservations)</li>
+                <li><strong>Uncovered (On-Demand) Nodes:</strong> {uncovered_nodes}</li>
             </ul>
-        </div>
             """
+            if is_savings_opportunity:
+                # Reserved pricing typically saves 30-40% vs on-demand
+                html += f"""
+            <p class="recommendation">💰 <strong>Potential Savings:</strong> {uncovered_nodes} nodes running on-demand pricing. Reserved nodes typically offer 30-40% discount.</p>
+            """
+            else:
+                html += """
+            <p class="recommendation">✅ All provisioned nodes are covered by reservations.</p>
+            """
+            # Show expiring reservations if any
+            if expiring_reservations:
+                expiring_nodes = sum(
+                    (a.usage_metrics or {}).get("node_count", 0) for a in expiring_reservations
+                )
+                html += f"""
+            <p class="recommendation">⚠️ <strong>{len(expiring_reservations)} reservations ({expiring_nodes} nodes) expiring soon.</strong> Plan for renewal to maintain coverage.</p>
+            """
+            html += "</div>"
         return html
     def _build_governance_section(self, assets) -> str:
@@ -418,4 +484,59 @@ class HTMLFormatter:
                 html += f"<li><strong>{cluster.name}</strong>: {queues} queues, Auto WLM: {auto_wlm} ({flags})</li>"
             html += "</ul></div>"
+        # Add cluster performance section
+        clusters_with_metrics = [
+            a
+            for a in clusters
+            if (a.usage_metrics or {}).get("cpu_utilization_max_24h") is not None
+        ]
+        if clusters_with_metrics:
+            html += """
+        <div class="insight-box info">
+            <h3>📊 Cluster Performance (Last 24h)</h3>
+            <table class="compact">
+                <tr><th>Cluster</th><th>CPU Max</th><th>CPU Avg</th><th>Queries</th><th>Disk Used</th><th>Recommendation</th></tr>
+            """
+            for cluster in clusters_with_metrics[:10]:
+                metrics = cluster.usage_metrics or {}
+                cpu_max = metrics.get("cpu_utilization_max_24h", 0)
+                cpu_avg = metrics.get("cpu_utilization_avg_24h", 0)
+                queries = metrics.get("queries_completed_24h", 0)
+                disk = metrics.get("disk_space_used_percent", 0)
+                rec = metrics.get("performance_recommendation", "-")
+                html += (
+                    f"<tr><td>{cluster.name}</td><td>{cpu_max:.1f}%</td>"
+                    f"<td>{cpu_avg:.1f}%</td><td>{queries}</td>"
+                    f"<td>{disk:.1f}%</td><td>{rec if rec else '-'}</td></tr>"
+                )
+            html += "</table></div>"
+        # Add serverless workgroup performance section
+        serverless_wgs = [a for a in assets if a.asset_type == "redshift_serverless_workgroup"]
+        serverless_with_metrics = [
+            a for a in serverless_wgs if (a.usage_metrics or {}).get("rpu_max_7d") is not None
+        ]
+        if serverless_with_metrics:
+            html += """
+        <div class="insight-box info">
+            <h3>🚀 Serverless Workgroup Utilization</h3>
+            <table class="compact">
+                <tr><th>Workgroup</th><th>Base RPU</th><th>Max RPU (7d)</th><th>Avg RPU (7d)</th><th>Queries (24h)</th><th>Recommendation</th></tr>
+            """
+            for wg in serverless_with_metrics[:10]:
+                metrics = wg.usage_metrics or {}
+                base = metrics.get("base_capacity", 0)
+                rpu_max = metrics.get("rpu_max_7d", 0)
+                rpu_avg = metrics.get("rpu_avg_7d", 0)
+                queries = metrics.get("queries_completed_24h", 0) + metrics.get(
+                    "queries_failed_24h", 0
+                )
+                rec = metrics.get("utilization_recommendation", "-")
+                html += (
+                    f"<tr><td>{wg.name}</td><td>{base}</td>"
+                    f"<td>{rpu_max:.1f}</td><td>{rpu_avg:.1f}</td>"
+                    f"<td>{queries}</td><td>{rec if rec else '-'}</td></tr>"
+                )
+            html += "</table></div>"
         return html

nuvu_scan/core/base.py CHANGED Viewed

@@ -88,10 +88,17 @@ class ScanResult:
     assets: list[Asset]
     total_cost_estimate_usd: float
     summary: dict[str, Any] = None
+    # Scan scope metadata
+    scanned_regions: list[str] = None
+    scanned_collectors: list[str] = None
     def __post_init__(self):
         if self.summary is None:
             self.summary = {}
+        if self.scanned_regions is None:
+            self.scanned_regions = []
+        if self.scanned_collectors is None:
+            self.scanned_collectors = []
 class CloudProviderScan(ABC):
@@ -173,6 +180,12 @@ class CloudProviderScan(ABC):
         # Build summary
         summary = self._build_summary(assets)
+        # Get scanned regions from assets
+        scanned_regions = sorted(set(asset.region for asset in assets if asset.region))
+        # Get scanned collectors from config
+        scanned_collectors = self.config.collectors if self.config.collectors else []
         return ScanResult(
             provider=self.provider,
             account_id=self.config.account_id or "unknown",
@@ -180,6 +193,8 @@ class CloudProviderScan(ABC):
             assets=assets,
             total_cost_estimate_usd=total_cost,
             summary=summary,
+            scanned_regions=scanned_regions,
+            scanned_collectors=scanned_collectors,
         )
     def _build_summary(self, assets: list[Asset]) -> dict[str, Any]:
@@ -219,6 +234,21 @@ class CloudProviderScan(ABC):
             if asset.risk_flags:
                 risky_count += 1
+        # Find cost summary asset if present
+        actual_costs_30d = {}
+        total_actual_cost_30d = None
+        for asset in assets:
+            if asset.asset_type == "cost_summary":
+                usage = asset.usage_metrics or {}
+                actual_costs_30d = usage.get("actual_costs_30d", {})
+                total_actual_cost_30d = usage.get("total_actual_cost_30d")
+                break
+        # Calculate estimated asset costs (excluding cost_summary)
+        estimated_assets_total = sum(
+            asset.cost_estimate_usd or 0 for asset in assets if asset.asset_type != "cost_summary"
+        )
         return {
             "total_assets": total_assets,
             "assets_by_category": assets_by_category,
@@ -226,4 +256,8 @@ class CloudProviderScan(ABC):
             "unused_count": unused_count,
             "no_owner_count": no_owner_count,
             "risky_count": risky_count,
+            # Cost data
+            "actual_costs_30d": actual_costs_30d,
+            "total_actual_cost_30d": total_actual_cost_30d,
+            "estimated_assets_cost_total": estimated_assets_total,
         }

nuvu_scan/core/providers/aws/aws_scanner.py CHANGED Viewed

@@ -236,17 +236,55 @@ class AWSScanner(CloudProviderScan):
                 continue
         # Add a summary asset with actual costs from Cost Explorer
+        # Only include costs for services related to the scanned collectors
+        print("Fetching cost data from AWS Cost Explorer...", file=sys.stderr)
         try:
             from datetime import datetime, timedelta
             end_date = datetime.utcnow()
             start_date = end_date - timedelta(days=30)
             service_costs = self.cost_explorer.get_service_costs(start_date, end_date)
+            print("  → Cost data retrieved", file=sys.stderr)
             if service_costs:
-                total_actual_cost = sum(service_costs.values())
+                # Map collectors to AWS service names in Cost Explorer
+                collector_to_services = {
+                    "s3": ["Amazon Simple Storage Service"],
+                    "glue": ["AWS Glue"],
+                    "athena": ["Amazon Athena"],
+                    "redshift": ["Amazon Redshift"],
+                    "iam": [],  # IAM is free
+                    "mwaa": ["Amazon Managed Workflows for Apache Airflow"],
+                }
+                # Filter costs based on active collectors
+                active_collector_names = (
+                    [name.lower() for name in self.config.collectors]
+                    if self.config.collectors
+                    else list(collector_to_services.keys())
+                )
+                # Build list of relevant AWS service names
+                relevant_services = set()
+                for collector_name in active_collector_names:
+                    services = collector_to_services.get(collector_name, [])
+                    relevant_services.update(services)
+                # Filter service_costs to only include relevant services
+                if self.config.collectors:  # Only filter if specific collectors requested
+                    filtered_costs = {
+                        svc: cost for svc, cost in service_costs.items() if svc in relevant_services
+                    }
+                    total_actual_cost = sum(filtered_costs.values())
+                    display_costs = filtered_costs
+                    scope_note = f"Filtered to collectors: {', '.join(self.config.collectors)}"
+                else:
+                    # Full scan - show all costs
+                    total_actual_cost = sum(service_costs.values())
+                    display_costs = service_costs
+                    scope_note = "Full scan - all services"
                 # Use the actual 30-day cost as monthly estimate
-                # This represents the actual spend, not an extrapolation
                 monthly_estimate = total_actual_cost
                 # Create a summary asset
@@ -257,7 +295,7 @@ class AWSScanner(CloudProviderScan):
                     service="Cost Explorer",
                     region="global",
                     arn="arn:aws:ce::cost-summary",
-                    name="AWS Cost Summary (Last 30 Days)",
+                    name=f"AWS Cost Summary - {scope_note}",
                     created_at=None,
                     last_activity_at=datetime.utcnow().isoformat(),
                     tags={},
@@ -266,10 +304,11 @@ class AWSScanner(CloudProviderScan):
                     ownership_confidence="unknown",
                     suggested_owner=None,
                     usage_metrics={
-                        "actual_costs_30d": service_costs,
+                        "actual_costs_30d": display_costs,
                         "total_actual_cost_30d": total_actual_cost,
                         "estimated_monthly_cost": monthly_estimate,
-                        "note": "Actual costs from AWS Cost Explorer API for the last 30 days. This represents real spend, not estimates. Note: Some costs shown are for services that are not data assets (e.g., domain registration, email services, DNS). Individual asset costs below may be estimates based on resource usage.",
+                        "scope": scope_note,
+                        "note": "Actual costs from AWS Cost Explorer API for the last 30 days.",
                     },
                 )
                 all_assets.append(cost_summary_asset)
@@ -300,39 +339,16 @@ class AWSScanner(CloudProviderScan):
     def get_cost_estimate(self, asset: Asset) -> float:
         """Estimate monthly cost for an AWS asset.
-        First tries to get actual cost from Cost Explorer API.
-        Falls back to collector-based estimates if Cost Explorer data is not available.
+        Uses collector-based estimates for individual assets.
+        Service-level actual costs from Cost Explorer are already included
+        in the cost_summary asset and used for reporting.
         """
-        # First, try to get actual cost from Cost Explorer API
-        try:
-            # Map service names to Cost Explorer service names
-            service_mapping = {
-                "S3": "Amazon Simple Storage Service",
-                "Athena": "Amazon Athena",
-                "Glue": "AWS Glue",
-                "Redshift": "Amazon Redshift",
-                "MWAA": "Amazon Managed Workflows for Apache Airflow",
-            }
-            cost_explorer_service = service_mapping.get(asset.service)
-            if cost_explorer_service:
-                # Get service-level cost from Cost Explorer (last 30 days actual cost)
-                service_cost = self.cost_explorer.get_monthly_cost_for_service(
-                    cost_explorer_service
-                )
-                if service_cost > 0:
-                    # We have actual service-level cost from Cost Explorer
-                    # For now, we'll still use collector estimates for individual assets
-                    # because Cost Explorer doesn't provide per-resource costs without tags
-                    # But we could potentially distribute service cost across assets proportionally
-                    # For now, prefer collector estimates which are more accurate per-resource
-                    pass  # Continue to collector-based estimation
-        except Exception:
-            # If Cost Explorer fails, fall back to collector-based estimation
-            pass
+        # Use the cost already set by the collector during collection
+        # This avoids making Cost Explorer API calls for each asset
+        if asset.cost_estimate_usd is not None and asset.cost_estimate_usd > 0:
+            return asset.cost_estimate_usd
-        # Delegate to appropriate collector based on service for detailed estimation
+        # Delegate to appropriate collector based on service for estimation
         for collector in self.collectors:
             if hasattr(collector, "get_cost_estimate"):
                 try:

nuvu-scan 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl

nuvu-scan 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl