nuvu-scan 1.3.8__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nuvu_scan/__init__.py +1 -1
- nuvu_scan/cli/commands/scan.py +32 -0
- nuvu_scan/cli/formatters/html.py +262 -10
- nuvu_scan/cli/main.py +2 -1
- nuvu_scan/core/base.py +6 -0
- nuvu_scan/core/providers/aws/aws_scanner.py +41 -15
- nuvu_scan/core/providers/aws/collectors/athena.py +3 -0
- nuvu_scan/core/providers/aws/collectors/glue.py +420 -15
- nuvu_scan/core/providers/aws/collectors/iam.py +9 -0
- nuvu_scan/core/providers/aws/collectors/redshift.py +718 -40
- nuvu_scan/core/providers/gcp/gcp_scanner.py +42 -10
- {nuvu_scan-1.3.8.dist-info → nuvu_scan-2.0.0.dist-info}/METADATA +58 -14
- {nuvu_scan-1.3.8.dist-info → nuvu_scan-2.0.0.dist-info}/RECORD +15 -15
- {nuvu_scan-1.3.8.dist-info → nuvu_scan-2.0.0.dist-info}/entry_points.txt +1 -0
- {nuvu_scan-1.3.8.dist-info → nuvu_scan-2.0.0.dist-info}/WHEEL +0 -0
nuvu_scan/__init__.py
CHANGED
nuvu_scan/cli/commands/scan.py
CHANGED
|
@@ -42,6 +42,15 @@ from ..formatters.json import JSONFormatter
|
|
|
42
42
|
multiple=True,
|
|
43
43
|
help="Cloud provider region(s) to scan (can be specified multiple times, default: all regions)",
|
|
44
44
|
)
|
|
45
|
+
@click.option(
|
|
46
|
+
"--collectors",
|
|
47
|
+
"-c",
|
|
48
|
+
multiple=True,
|
|
49
|
+
help="Specific collector(s) to run (can be specified multiple times). "
|
|
50
|
+
"AWS: s3, glue, athena, redshift, iam, mwaa. "
|
|
51
|
+
"GCP: gcs, bigquery, dataproc, pubsub, iam, gemini. "
|
|
52
|
+
"Default: all collectors.",
|
|
53
|
+
)
|
|
45
54
|
@click.option(
|
|
46
55
|
"--access-key-id",
|
|
47
56
|
envvar="AWS_ACCESS_KEY_ID",
|
|
@@ -103,11 +112,17 @@ from ..formatters.json import JSONFormatter
|
|
|
103
112
|
envvar="NUVU_API_KEY",
|
|
104
113
|
help="Nuvu Cloud API key (from dashboard account settings)",
|
|
105
114
|
)
|
|
115
|
+
@click.option(
|
|
116
|
+
"--list-collectors",
|
|
117
|
+
is_flag=True,
|
|
118
|
+
help="List available collectors for the specified provider and exit",
|
|
119
|
+
)
|
|
106
120
|
def scan_command(
|
|
107
121
|
provider: str,
|
|
108
122
|
output_format: str,
|
|
109
123
|
output_file: str | None,
|
|
110
124
|
region: tuple,
|
|
125
|
+
collectors: tuple,
|
|
111
126
|
access_key_id: str | None,
|
|
112
127
|
secret_access_key: str | None,
|
|
113
128
|
session_token: str | None,
|
|
@@ -121,9 +136,25 @@ def scan_command(
|
|
|
121
136
|
push: bool,
|
|
122
137
|
nuvu_cloud_url: str | None,
|
|
123
138
|
api_key: str | None,
|
|
139
|
+
list_collectors: bool,
|
|
124
140
|
):
|
|
125
141
|
"""Scan cloud provider for data assets."""
|
|
126
142
|
|
|
143
|
+
# Handle --list-collectors flag
|
|
144
|
+
if list_collectors:
|
|
145
|
+
if provider == "aws":
|
|
146
|
+
available = AWSScanner.get_available_collectors()
|
|
147
|
+
elif provider == "gcp":
|
|
148
|
+
available = GCPScanner.get_available_collectors()
|
|
149
|
+
else:
|
|
150
|
+
click.echo(f"Unknown provider: {provider}", err=True)
|
|
151
|
+
sys.exit(1)
|
|
152
|
+
|
|
153
|
+
click.echo(f"Available collectors for {provider.upper()}:")
|
|
154
|
+
for name in sorted(available):
|
|
155
|
+
click.echo(f" - {name}")
|
|
156
|
+
return
|
|
157
|
+
|
|
127
158
|
# Build credentials based on provider
|
|
128
159
|
credentials = {}
|
|
129
160
|
account_id = None
|
|
@@ -206,6 +237,7 @@ def scan_command(
|
|
|
206
237
|
credentials=credentials,
|
|
207
238
|
regions=list(region) if region else None,
|
|
208
239
|
account_id=account_id,
|
|
240
|
+
collectors=list(collectors) if collectors else None,
|
|
209
241
|
)
|
|
210
242
|
|
|
211
243
|
# Get scanner instance
|
nuvu_scan/cli/formatters/html.py
CHANGED
|
@@ -14,6 +14,9 @@ class HTMLFormatter:
|
|
|
14
14
|
actual_total = result.summary.get("total_actual_cost_30d")
|
|
15
15
|
estimated_assets_total = result.summary.get("estimated_assets_cost_total")
|
|
16
16
|
|
|
17
|
+
# Calculate cost saving opportunities
|
|
18
|
+
savings_opportunities = self._calculate_savings(result.assets)
|
|
19
|
+
|
|
17
20
|
summary_cards = f"""
|
|
18
21
|
<div class="summary-card">
|
|
19
22
|
<h3>Total Assets</h3>
|
|
@@ -43,18 +46,27 @@ class HTMLFormatter:
|
|
|
43
46
|
summary_cards += f"""
|
|
44
47
|
<div class="summary-card">
|
|
45
48
|
<h3>Unused Assets</h3>
|
|
46
|
-
<div class="value">{result.summary.get(
|
|
49
|
+
<div class="value">{result.summary.get("unused_count", 0)}</div>
|
|
47
50
|
</div>
|
|
48
51
|
<div class="summary-card">
|
|
49
52
|
<h3>No Owner</h3>
|
|
50
|
-
<div class="value">{result.summary.get(
|
|
53
|
+
<div class="value">{result.summary.get("no_owner_count", 0)}</div>
|
|
51
54
|
</div>
|
|
52
55
|
<div class="summary-card">
|
|
53
56
|
<h3>Risky Assets</h3>
|
|
54
|
-
<div class="value">{result.summary.get(
|
|
57
|
+
<div class="value">{result.summary.get("risky_count", 0)}</div>
|
|
55
58
|
</div>
|
|
56
59
|
"""
|
|
57
60
|
|
|
61
|
+
# Add savings opportunity card if significant
|
|
62
|
+
if savings_opportunities["total_potential_savings"] > 100:
|
|
63
|
+
summary_cards += f"""
|
|
64
|
+
<div class="summary-card savings">
|
|
65
|
+
<h3>💰 Potential Savings</h3>
|
|
66
|
+
<div class="value">${savings_opportunities["total_potential_savings"]:,.2f}/mo</div>
|
|
67
|
+
</div>
|
|
68
|
+
"""
|
|
69
|
+
|
|
58
70
|
# Build service costs table if available
|
|
59
71
|
service_costs_html = ""
|
|
60
72
|
service_costs = result.summary.get("actual_costs_30d", {})
|
|
@@ -79,14 +91,17 @@ class HTMLFormatter:
|
|
|
79
91
|
<title>Nuvu Scan Report - {result.provider.upper()}</title>
|
|
80
92
|
<style>
|
|
81
93
|
body {{ font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }}
|
|
82
|
-
.container {{ max-width:
|
|
94
|
+
.container {{ max-width: 1400px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
|
|
83
95
|
h1 {{ color: #333; border-bottom: 3px solid #4CAF50; padding-bottom: 10px; }}
|
|
84
96
|
h2 {{ color: #555; margin-top: 30px; }}
|
|
85
|
-
.summary {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(
|
|
97
|
+
.summary {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 15px; margin: 20px 0; }}
|
|
86
98
|
.summary-card {{ background: #f9f9f9; padding: 15px; border-radius: 5px; border-left: 4px solid #4CAF50; }}
|
|
87
|
-
.summary-card
|
|
88
|
-
.summary-card
|
|
99
|
+
.summary-card.savings {{ border-left-color: #ff9800; background: #fff8e1; }}
|
|
100
|
+
.summary-card h3 {{ margin: 0 0 10px 0; color: #666; font-size: 13px; text-transform: uppercase; }}
|
|
101
|
+
.summary-card .value {{ font-size: 22px; font-weight: bold; color: #333; }}
|
|
89
102
|
table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
|
|
103
|
+
table.compact {{ font-size: 13px; }}
|
|
104
|
+
table.compact th, table.compact td {{ padding: 8px; }}
|
|
90
105
|
th, td {{ padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }}
|
|
91
106
|
th {{ background: #4CAF50; color: white; font-weight: bold; }}
|
|
92
107
|
tr:hover {{ background: #f5f5f5; }}
|
|
@@ -94,6 +109,21 @@ class HTMLFormatter:
|
|
|
94
109
|
.unused {{ color: #ff8800; font-weight: bold; }}
|
|
95
110
|
.no-owner {{ color: #ff4444; font-weight: bold; }}
|
|
96
111
|
.footer {{ margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd; color: #666; font-size: 12px; text-align: center; }}
|
|
112
|
+
.insight-box {{ padding: 15px; border-radius: 8px; margin: 15px 0; }}
|
|
113
|
+
.insight-box h3 {{ margin-top: 0; }}
|
|
114
|
+
.insight-box.warning {{ background: #fff8e1; border-left: 4px solid #ff9800; }}
|
|
115
|
+
.insight-box.alert {{ background: #ffebee; border-left: 4px solid #f44336; }}
|
|
116
|
+
.insight-box.info {{ background: #e3f2fd; border-left: 4px solid #2196f3; }}
|
|
117
|
+
.insight-box.success {{ background: #e8f5e9; border-left: 4px solid #4caf50; }}
|
|
118
|
+
.recommendation {{ font-style: italic; color: #666; margin-top: 10px; }}
|
|
119
|
+
/* Collapsible sections */
|
|
120
|
+
.collapsible {{ cursor: pointer; padding: 15px; width: 100%; border: none; text-align: left; outline: none; font-size: 18px; font-weight: bold; background: #f5f5f5; border-radius: 5px; margin-top: 20px; color: #555; display: flex; justify-content: space-between; align-items: center; }}
|
|
121
|
+
.collapsible:hover {{ background: #eee; }}
|
|
122
|
+
.collapsible:after {{ content: '▼'; font-size: 12px; color: #888; }}
|
|
123
|
+
.collapsible.active:after {{ content: '▲'; }}
|
|
124
|
+
.collapsible-content {{ display: none; overflow: hidden; padding: 0; }}
|
|
125
|
+
.collapsible-content.show {{ display: block; }}
|
|
126
|
+
.asset-count {{ font-size: 14px; font-weight: normal; color: #888; }}
|
|
97
127
|
</style>
|
|
98
128
|
</head>
|
|
99
129
|
<body>
|
|
@@ -108,7 +138,16 @@ class HTMLFormatter:
|
|
|
108
138
|
{summary_cards}
|
|
109
139
|
</div>
|
|
110
140
|
{service_costs_html}
|
|
141
|
+
"""
|
|
142
|
+
|
|
143
|
+
# Add Cost Optimization Section FIRST (before Assets by Category)
|
|
144
|
+
html += self._build_cost_optimization_section(result.assets)
|
|
111
145
|
|
|
146
|
+
# Add Governance Insights Section SECOND
|
|
147
|
+
html += self._build_governance_section(result.assets)
|
|
148
|
+
|
|
149
|
+
# Assets by Category
|
|
150
|
+
html += """
|
|
112
151
|
<h2>Assets by Category</h2>
|
|
113
152
|
<table>
|
|
114
153
|
<tr><th>Category</th><th>Count</th></tr>
|
|
@@ -117,9 +156,12 @@ class HTMLFormatter:
|
|
|
117
156
|
for category, count in result.summary.get("assets_by_category", {}).items():
|
|
118
157
|
html += f" <tr><td>{category.replace('_', ' ').title()}</td><td>{count}</td></tr>\n"
|
|
119
158
|
|
|
120
|
-
|
|
159
|
+
# All Assets - COLLAPSIBLE
|
|
160
|
+
asset_count = len(result.assets)
|
|
161
|
+
html += f""" </table>
|
|
121
162
|
|
|
122
|
-
<
|
|
163
|
+
<button class="collapsible">All Assets <span class="asset-count">({asset_count} items)</span></button>
|
|
164
|
+
<div class="collapsible-content">
|
|
123
165
|
<table>
|
|
124
166
|
<tr>
|
|
125
167
|
<th>Name</th>
|
|
@@ -151,12 +193,24 @@ class HTMLFormatter:
|
|
|
151
193
|
<td>{asset.asset_type}</td>
|
|
152
194
|
<td>{asset.region}</td>
|
|
153
195
|
<td>${asset.cost_estimate_usd or 0:.2f}</td>
|
|
154
|
-
<td class="{owner_class}">{asset.suggested_owner or
|
|
196
|
+
<td class="{owner_class}">{asset.suggested_owner or "Unknown"}</td>
|
|
155
197
|
<td>{risk_flags_html}</td>
|
|
156
198
|
</tr>
|
|
157
199
|
"""
|
|
158
200
|
|
|
159
201
|
html += """ </table>
|
|
202
|
+
</div>
|
|
203
|
+
|
|
204
|
+
<script>
|
|
205
|
+
var coll = document.getElementsByClassName("collapsible");
|
|
206
|
+
for (var i = 0; i < coll.length; i++) {
|
|
207
|
+
coll[i].addEventListener("click", function() {
|
|
208
|
+
this.classList.toggle("active");
|
|
209
|
+
var content = this.nextElementSibling;
|
|
210
|
+
content.classList.toggle("show");
|
|
211
|
+
});
|
|
212
|
+
}
|
|
213
|
+
</script>
|
|
160
214
|
|
|
161
215
|
<div class="footer">
|
|
162
216
|
<p>Generated by Nuvu - AWS Data Asset Control</p>
|
|
@@ -167,3 +221,201 @@ class HTMLFormatter:
|
|
|
167
221
|
</html>"""
|
|
168
222
|
|
|
169
223
|
return html
|
|
224
|
+
|
|
225
|
+
def _calculate_savings(self, assets) -> dict:
    """Calculate potential cost savings from scanned assets.

    Three categories contribute to the total:

    * ``old_manual_snapshots`` - estimated cost of ``redshift_snapshot``
      assets whose ``snapshot_type`` metric is ``"manual"`` and which carry
      the ``old_snapshot`` risk flag.
    * ``reservation_opportunities`` - the
      ``potential_reservation_savings_usd`` metric of ``redshift_cluster``
      assets.
    * ``stale_crawlers`` - estimated cost of ``glue_crawler`` assets
      carrying the ``stale_crawler`` risk flag.

    Args:
        assets: Iterable of asset objects exposing ``asset_type``,
            ``usage_metrics``, ``risk_flags`` and ``cost_estimate_usd``.
            ``None`` is treated as an empty collection.

    Returns:
        Dict with one amount per category plus ``total_potential_savings``.
        ``unused_etl_jobs`` is part of the result shape but is never
        populated here and is not included in the total.
    """
    savings = {
        "old_manual_snapshots": 0,
        "stale_crawlers": 0,
        "unused_etl_jobs": 0,
        "reservation_opportunities": 0,
        "total_potential_savings": 0,
    }

    for asset in assets or []:
        metrics = asset.usage_metrics or {}
        flags = asset.risk_flags or []
        cost = asset.cost_estimate_usd or 0
        kind = asset.asset_type

        if kind == "redshift_snapshot":
            # Only manual snapshots are counted; automated snapshots are
            # treated as free within their retention period.
            if metrics.get("snapshot_type") == "manual" and "old_snapshot" in flags:
                savings["old_manual_snapshots"] += cost
        elif kind == "redshift_cluster":
            # ``or 0`` (rather than a .get default) also covers a metric that
            # is present but null, which would otherwise raise TypeError.
            savings["reservation_opportunities"] += (
                metrics.get("potential_reservation_savings_usd") or 0
            )
        elif kind == "glue_crawler":
            if "stale_crawler" in flags:
                savings["stale_crawlers"] += cost

    savings["total_potential_savings"] = (
        savings["old_manual_snapshots"]
        + savings["reservation_opportunities"]
        + savings["stale_crawlers"]
    )

    return savings
|
|
261
|
+
|
|
262
|
+
def _build_cost_optimization_section(self, assets) -> str:
    """Build the "Cost Optimization Opportunities" HTML fragment.

    Up to three insight boxes are emitted, each only when relevant assets
    exist:

    * Redshift snapshots - counts of automated and manual snapshots, the
      manual snapshot size/cost, and the cost of manual snapshots flagged
      ``old_snapshot``.
    * Reserved nodes - counts of reservations whose ``state`` metric is
      ``"active"``, and of those flagged ``reservation_expired`` or
      ``reservation_expiring_soon``.
    * Reservation opportunities - ``redshift_cluster`` assets whose
      ``potential_reservation_savings_usd`` metric is positive. The same
      metric feeds the summary card's potential-savings total, so it is
      surfaced here to keep the report self-explanatory.

    Args:
        assets: Iterable of asset objects exposing ``asset_type``,
            ``usage_metrics``, ``risk_flags``, ``cost_estimate_usd`` and
            ``size_bytes``. ``None`` is treated as empty.

    Returns:
        HTML string, or ``""`` when none of the boxes apply.
    """
    # Materialise once: the collection is scanned several times below and
    # may be a one-shot iterator.
    assets = list(assets or [])

    def metrics_of(asset):
        return asset.usage_metrics or {}

    def has_flag(asset, flag):
        return flag in (asset.risk_flags or [])

    snapshots = [a for a in assets if a.asset_type == "redshift_snapshot"]
    manual_snapshots = [
        a for a in snapshots if metrics_of(a).get("snapshot_type") == "manual"
    ]
    auto_snapshots = [
        a for a in snapshots if metrics_of(a).get("snapshot_type") == "automated"
    ]
    old_manual_snapshots = [a for a in manual_snapshots if has_flag(a, "old_snapshot")]

    reserved_nodes = [a for a in assets if a.asset_type == "redshift_reserved_node"]
    expiring_reservations = [
        a for a in reserved_nodes if has_flag(a, "reservation_expiring_soon")
    ]

    # Per-cluster reservation savings; null or missing metrics count as 0.
    cluster_savings = [
        metrics_of(a).get("potential_reservation_savings_usd") or 0
        for a in assets
        if a.asset_type == "redshift_cluster"
    ]
    cluster_savings = [amount for amount in cluster_savings if amount > 0]

    if not snapshots and not reserved_nodes and not cluster_savings:
        return ""

    parts = [
        """
    <h2>💰 Cost Optimization Opportunities</h2>
"""
    ]

    # Snapshot analysis - only manual snapshots are chargeable.
    if snapshots:
        manual_snapshot_cost = sum(a.cost_estimate_usd or 0 for a in manual_snapshots)
        old_manual_cost = sum(a.cost_estimate_usd or 0 for a in old_manual_snapshots)
        manual_size = sum((a.size_bytes or 0) / (1024**4) for a in manual_snapshots)  # TB

        parts.append(
            f"""
    <div class="insight-box warning">
        <h3>📦 Redshift Snapshots</h3>
        <ul>
            <li><strong>Automated Snapshots:</strong> {len(auto_snapshots)} (included in cluster cost)</li>
            <li><strong>Manual Snapshots:</strong> {len(manual_snapshots)} ({manual_size:.2f} TB)</li>
            <li><strong>Manual Snapshot Cost:</strong> ${manual_snapshot_cost:,.2f}/mo</li>
            <li><strong>Old Manual Snapshots (>90 days):</strong> {len(old_manual_snapshots)} (${old_manual_cost:,.2f}/mo potential savings)</li>
        </ul>
        <p class="recommendation">💡 Review old manual snapshots - automated snapshots are retained per retention policy at no extra charge.</p>
    </div>
"""
        )

    # Reserved nodes analysis.
    if reserved_nodes:
        active_reservations = [
            a for a in reserved_nodes if metrics_of(a).get("state") == "active"
        ]
        expired = [a for a in reserved_nodes if has_flag(a, "reservation_expired")]

        parts.append(
            f"""
    <div class="insight-box info">
        <h3>🎫 Reserved Nodes ({len(reserved_nodes)} total)</h3>
        <ul>
            <li><strong>Active Reservations:</strong> {len(active_reservations)}</li>
            <li><strong>Expired/Retired:</strong> {len(expired)}</li>
            <li><strong>Expiring Soon:</strong> {len(expiring_reservations)}</li>
        </ul>
    </div>
"""
        )

    # Clusters that would be cheaper under a reservation.
    if cluster_savings:
        parts.append(
            f"""
    <div class="insight-box success">
        <h3>💵 Reservation Opportunities</h3>
        <ul>
            <li><strong>Clusters with Reservation Savings Potential:</strong> {len(cluster_savings)} (${sum(cluster_savings):,.2f}/mo potential savings)</li>
        </ul>
    </div>
"""
        )

    return "".join(parts)
|
|
325
|
+
|
|
326
|
+
def _build_governance_section(self, assets) -> str:
    """Build the "Governance Insights" HTML fragment.

    Up to four findings are reported, each only when at least one asset
    matches:

    * ``glue_crawler`` assets flagged ``stale_crawler`` or ``never_run``
    * ``glue_job`` assets flagged ``stale_job`` or ``never_run``
    * ``redshift_datashare`` assets flagged ``cross_account_sharing``
    * ``redshift_cluster`` assets flagged ``default_wlm_only`` or
      ``unlimited_wlm_queue``

    Tables list at most 10 assets and the WLM list at most 5; the headings
    always show the full count.

    Args:
        assets: Iterable of asset objects exposing ``asset_type``, ``name``,
            ``risk_flags`` and ``usage_metrics``. ``None`` is treated as
            empty.

    Returns:
        HTML string, or ``""`` when no finding applies.
    """
    # Function-scope import: the asset names, flags and account ids below
    # are interpolated into markup, so they must be escaped.
    from html import escape

    # Materialise once: the collection is scanned several times below and
    # may be a one-shot iterator.
    assets = list(assets or [])

    def of_type(asset_type):
        return [a for a in assets if a.asset_type == asset_type]

    def has_any_flag(asset, *wanted):
        flags = asset.risk_flags or []
        return any(flag in flags for flag in wanted)

    def flag_text(asset, only_containing=None):
        flags = asset.risk_flags or []
        if only_containing is not None:
            flags = [f for f in flags if only_containing in f]
        return escape(", ".join(flags))

    def last_run_text(asset):
        # A missing or null metric means the asset never ran; avoid
        # rendering "Never days ago" / "None days ago".
        days = (asset.usage_metrics or {}).get("days_since_last_run")
        return "Never" if days is None else escape(f"{days} days ago")

    def stale_box(title, stale):
        rows = "".join(
            f"<tr><td>{escape(str(a.name))}</td>"
            f"<td>{last_run_text(a)}</td>"
            f"<td>{flag_text(a)}</td></tr>"
            for a in stale[:10]
        )
        return f"""
    <div class="insight-box warning">
        <h3>{title} ({len(stale)})</h3>
        <table class="compact">
            <tr><th>Name</th><th>Last Run</th><th>Issue</th></tr>
{rows}</table></div>"""

    stale_crawlers = [
        a for a in of_type("glue_crawler") if has_any_flag(a, "stale_crawler", "never_run")
    ]
    stale_jobs = [
        a for a in of_type("glue_job") if has_any_flag(a, "stale_job", "never_run")
    ]
    cross_account_shares = [
        a for a in of_type("redshift_datashare") if has_any_flag(a, "cross_account_sharing")
    ]
    wlm_issues = [
        a
        for a in of_type("redshift_cluster")
        if has_any_flag(a, "default_wlm_only", "unlimited_wlm_queue")
    ]

    if not (stale_crawlers or stale_jobs or cross_account_shares or wlm_issues):
        return ""

    parts = [
        """
    <h2>🔍 Governance Insights</h2>
"""
    ]

    if stale_crawlers:
        parts.append(stale_box("🕷️ Stale/Unused Glue Crawlers", stale_crawlers))

    if stale_jobs:
        parts.append(stale_box("⚙️ Stale/Unused Glue ETL Jobs", stale_jobs))

    if cross_account_shares:
        rows = []
        for share in cross_account_shares[:10]:
            consumers = (share.usage_metrics or {}).get("consumers") or []
            # Show at most three consumer accounts; "?" stands in for a
            # missing or null account id.
            consumer_ids = ", ".join(
                str(c.get("account_id") or "?") for c in consumers[:3]
            )
            rows.append(
                f"<tr><td>{escape(str(share.name))}</td>"
                f"<td>{escape(consumer_ids)}</td>"
                f"<td>{flag_text(share)}</td></tr>"
            )
        parts.append(
            f"""
    <div class="insight-box alert">
        <h3>🔗 Cross-Account Data Shares ({len(cross_account_shares)})</h3>
        <p>Data is being shared outside this AWS account. Review for security compliance.</p>
        <table class="compact">
            <tr><th>Share Name</th><th>Consumer Account</th><th>Flags</th></tr>
{"".join(rows)}</table></div>"""
        )

    if wlm_issues:
        items = []
        for cluster in wlm_issues[:5]:
            metrics = cluster.usage_metrics or {}
            queues = metrics.get("wlm_queue_count", 0)
            auto_wlm = "Yes" if metrics.get("wlm_auto_wlm") else "No"
            # Only the WLM-related flags are relevant in this box.
            flags = flag_text(cluster, only_containing="wlm")
            items.append(
                f"<li><strong>{escape(str(cluster.name))}</strong>: "
                f"{queues} queues, Auto WLM: {auto_wlm} ({flags})</li>"
            )
        parts.append(
            f"""
    <div class="insight-box info">
        <h3>⚡ WLM Configuration Review ({len(wlm_issues)} clusters)</h3>
        <p>Some clusters may benefit from WLM tuning:</p>
        <ul>
{"".join(items)}</ul></div>"""
        )

    return "".join(parts)
|
nuvu_scan/cli/main.py
CHANGED
|
@@ -7,11 +7,12 @@ Usage:
|
|
|
7
7
|
|
|
8
8
|
import click
|
|
9
9
|
|
|
10
|
+
from .. import __version__
|
|
10
11
|
from .commands.scan import scan_command
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
@click.group()
|
|
14
|
-
@click.version_option(version="
|
|
15
|
+
@click.version_option(version=__version__, prog_name="nuvu-scan")
|
|
15
16
|
def cli():
|
|
16
17
|
"""Nuvu - Multi-Cloud Data Asset Control CLI."""
|
|
17
18
|
pass
|
nuvu_scan/core/base.py
CHANGED
|
@@ -21,10 +21,13 @@ class NormalizedCategory(str, Enum):
|
|
|
21
21
|
ML_TRAINING = "ml_training"
|
|
22
22
|
DATA_CATALOG = "data_catalog"
|
|
23
23
|
DATA_INTEGRATION = "data_integration"
|
|
24
|
+
DATA_PIPELINE = "data_pipeline" # ETL jobs, crawlers, workflows
|
|
25
|
+
DATA_SHARING = "data_sharing" # Datashares, cross-account sharing
|
|
24
26
|
QUERY_ENGINE = "query_engine"
|
|
25
27
|
SEARCH = "search"
|
|
26
28
|
DATABASE = "database"
|
|
27
29
|
SECURITY = "security"
|
|
30
|
+
BILLING = "billing"
|
|
28
31
|
|
|
29
32
|
|
|
30
33
|
@dataclass
|
|
@@ -66,10 +69,13 @@ class ScanConfig:
|
|
|
66
69
|
credentials: dict[str, Any] # Provider-specific credentials
|
|
67
70
|
regions: list[str] = None # None means all regions
|
|
68
71
|
account_id: str | None = None
|
|
72
|
+
collectors: list[str] = None # None means all collectors, otherwise filter by name
|
|
69
73
|
|
|
70
74
|
def __post_init__(self):
|
|
71
75
|
if self.regions is None:
|
|
72
76
|
self.regions = []
|
|
77
|
+
if self.collectors is None:
|
|
78
|
+
self.collectors = []
|
|
73
79
|
|
|
74
80
|
|
|
75
81
|
@dataclass
|
|
@@ -177,7 +177,7 @@ class AWSScanner(CloudProviderScan):
|
|
|
177
177
|
region_name=credentials.get("region", "us-east-1"),
|
|
178
178
|
)
|
|
179
179
|
except ClientError as e:
|
|
180
|
-
raise ValueError(f"Failed to assume role {role_arn}: {str(e)}")
|
|
180
|
+
raise ValueError(f"Failed to assume role {role_arn}: {str(e)}") from e
|
|
181
181
|
|
|
182
182
|
def _resolve_regions(self) -> list[str]:
|
|
183
183
|
"""Resolve regions to scan. If none provided, scan all enabled regions."""
|
|
@@ -201,23 +201,49 @@ class AWSScanner(CloudProviderScan):
|
|
|
201
201
|
# If we can't get account ID, return "unknown"
|
|
202
202
|
return "unknown"
|
|
203
203
|
|
|
204
|
+
# Map of collector names to their classes for filtering
|
|
205
|
+
COLLECTOR_MAP = {
|
|
206
|
+
"s3": S3Collector,
|
|
207
|
+
"glue": GlueCollector,
|
|
208
|
+
"athena": AthenaCollector,
|
|
209
|
+
"redshift": RedshiftCollector,
|
|
210
|
+
"iam": IAMCollector,
|
|
211
|
+
"mwaa": MWAACollector,
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
@classmethod
|
|
215
|
+
def get_available_collectors(cls) -> list[str]:
|
|
216
|
+
"""Return list of available collector names."""
|
|
217
|
+
return list(cls.COLLECTOR_MAP.keys())
|
|
218
|
+
|
|
204
219
|
def _initialize_collectors(self) -> list:
|
|
205
|
-
"""Initialize
|
|
220
|
+
"""Initialize AWS service collectors based on config."""
|
|
206
221
|
collectors = []
|
|
207
222
|
|
|
208
|
-
#
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
collectors
|
|
223
|
+
# Get requested collectors from config
|
|
224
|
+
requested = self.config.collectors if self.config.collectors else []
|
|
225
|
+
|
|
226
|
+
# Normalize to lowercase
|
|
227
|
+
requested_lower = [c.lower() for c in requested]
|
|
228
|
+
|
|
229
|
+
# If no specific collectors requested, use all
|
|
230
|
+
if not requested_lower:
|
|
231
|
+
for collector_cls in self.COLLECTOR_MAP.values():
|
|
232
|
+
collectors.append(collector_cls(self.session, self.config.regions))
|
|
233
|
+
else:
|
|
234
|
+
# Filter to only requested collectors
|
|
235
|
+
for name, collector_cls in self.COLLECTOR_MAP.items():
|
|
236
|
+
if name in requested_lower:
|
|
237
|
+
collectors.append(collector_cls(self.session, self.config.regions))
|
|
238
|
+
|
|
239
|
+
# Warn about unknown collectors
|
|
240
|
+
known = set(self.COLLECTOR_MAP.keys())
|
|
241
|
+
unknown = set(requested_lower) - known
|
|
242
|
+
if unknown:
|
|
243
|
+
import sys
|
|
215
244
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
# collectors.append(EMRCollector(self.session, self.config.regions))
|
|
219
|
-
# collectors.append(SageMakerCollector(self.session, self.config.regions))
|
|
220
|
-
# etc.
|
|
245
|
+
print(f"Warning: Unknown collectors ignored: {', '.join(unknown)}", file=sys.stderr)
|
|
246
|
+
print(f"Available collectors: {', '.join(sorted(known))}", file=sys.stderr)
|
|
221
247
|
|
|
222
248
|
return collectors
|
|
223
249
|
|
|
@@ -264,7 +290,7 @@ class AWSScanner(CloudProviderScan):
|
|
|
264
290
|
cost_summary_asset = Asset(
|
|
265
291
|
provider="aws",
|
|
266
292
|
asset_type="cost_summary",
|
|
267
|
-
normalized_category=NormalizedCategory.
|
|
293
|
+
normalized_category=NormalizedCategory.BILLING,
|
|
268
294
|
service="Cost Explorer",
|
|
269
295
|
region="global",
|
|
270
296
|
arn="arn:aws:ce::cost-summary",
|
|
@@ -23,10 +23,13 @@ class AthenaCollector:
|
|
|
23
23
|
|
|
24
24
|
def collect(self) -> list[Asset]:
|
|
25
25
|
"""Collect Athena workgroups."""
|
|
26
|
+
import sys
|
|
27
|
+
|
|
26
28
|
assets = []
|
|
27
29
|
|
|
28
30
|
try:
|
|
29
31
|
# List workgroups
|
|
32
|
+
print(" → Listing Athena workgroups...", file=sys.stderr)
|
|
30
33
|
response = self.athena_client.list_work_groups()
|
|
31
34
|
|
|
32
35
|
for wg_info in response.get("WorkGroups", []):
|