nuvu-scan 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nuvu_scan/cli/commands/scan.py +8 -1
- nuvu_scan/cli/formatters/html.py +141 -20
- nuvu_scan/core/base.py +34 -0
- nuvu_scan/core/providers/aws/aws_scanner.py +52 -36
- nuvu_scan/core/providers/aws/collectors/athena.py +102 -67
- nuvu_scan/core/providers/aws/collectors/glue.py +104 -34
- nuvu_scan/core/providers/aws/collectors/mwaa.py +10 -5
- nuvu_scan/core/providers/aws/collectors/redshift.py +381 -18
- {nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/METADATA +41 -30
- {nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/RECORD +12 -12
- {nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/WHEEL +0 -0
- {nuvu_scan-2.0.2.dist-info → nuvu_scan-2.1.2.dist-info}/entry_points.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Amazon Athena collector.
|
|
3
3
|
|
|
4
|
-
Collects Athena workgroups and query history.
|
|
4
|
+
Collects Athena workgroups and query history across all regions.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from datetime import datetime,
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
8
|
from typing import Any
|
|
9
9
|
|
|
10
10
|
import boto3
|
|
@@ -14,90 +14,121 @@ from nuvu_scan.core.base import Asset, NormalizedCategory
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class AthenaCollector:
|
|
17
|
-
"""Collects Amazon Athena resources."""
|
|
17
|
+
"""Collects Amazon Athena resources across all regions."""
|
|
18
18
|
|
|
19
19
|
def __init__(self, session: boto3.Session, regions: list[str] | None = None):
|
|
20
20
|
self.session = session
|
|
21
|
-
self.regions = regions or [
|
|
22
|
-
self.athena_client = session.client("athena", region_name="us-east-1")
|
|
21
|
+
self.regions = regions or []
|
|
23
22
|
|
|
24
23
|
def collect(self) -> list[Asset]:
    """Collect Athena workgroups from every region being scanned.

    Uses ``self.regions`` when it is non-empty, otherwise the list returned
    by ``self._get_all_regions()``. For each region every page of
    ``list_work_groups`` is followed via ``NextToken`` so that workgroups
    beyond the first page are not dropped.

    Risk flags attached to each asset:
        * ``idle_workgroup`` - ``idle_days`` from the query stats exceeds 90.
        * ``high_failure_rate`` - failed queries exceed half of all queries.

    Progress and permission warnings are written to stderr.

    Returns:
        One ``Asset`` per workgroup whose details could be fetched.
    """
    import sys

    assets = []

    # If no regions specified, get all enabled regions
    regions_to_check = self.regions if self.regions else self._get_all_regions()

    print(
        f" → Checking {len(regions_to_check)} regions for Athena workgroups...",
        file=sys.stderr,
    )

    for region in regions_to_check:
        try:
            athena_client = self.session.client("athena", region_name=region)

            next_token = None
            while True:
                # Only send NextToken once the service has handed one back.
                page_kwargs = {"NextToken": next_token} if next_token else {}
                response = athena_client.list_work_groups(**page_kwargs)

                for wg_info in response.get("WorkGroups", []):
                    wg_name = wg_info["Name"]

                    try:
                        # Get workgroup details
                        wg_details = athena_client.get_work_group(WorkGroup=wg_name)

                        # Get query statistics
                        query_stats = self._get_query_stats(athena_client, wg_name)

                        risk_flags = []
                        if query_stats.get("idle_days", 0) > 90:
                            risk_flags.append("idle_workgroup")
                        if (
                            query_stats.get("failed_queries", 0)
                            > query_stats.get("total_queries", 1) * 0.5
                        ):
                            risk_flags.append("high_failure_rate")

                        creation_time = wg_details.get("WorkGroup", {}).get("CreationTime")

                        assets.append(
                            Asset(
                                provider="aws",
                                asset_type="athena_workgroup",
                                normalized_category=NormalizedCategory.QUERY_ENGINE,
                                service="Athena",
                                region=region,
                                # NOTE(review): the account-id segment is left
                                # empty here; confirm whether consumers need
                                # the fully qualified workgroup ARN.
                                arn=f"arn:aws:athena:{region}::workgroup/{wg_name}",
                                name=wg_name,
                                created_at=(
                                    creation_time.isoformat() if creation_time else None
                                ),
                                last_activity_at=query_stats.get("last_query_time"),
                                risk_flags=risk_flags,
                                usage_metrics={
                                    **query_stats,
                                    "last_used": query_stats.get("last_query_time"),
                                    "days_since_last_use": query_stats.get("idle_days"),
                                },
                            )
                        )
                    except ClientError:
                        # One unreadable workgroup must not abort the region.
                        continue

                next_token = response.get("NextToken")
                if not next_token:
                    break

        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "Unknown")
            if error_code == "AccessDeniedException":
                print(
                    f" ⚠️ No permission to list Athena workgroups in {region}. "
                    "Add 'athena:ListWorkGroups' to IAM policy.",
                    file=sys.stderr,
                )
            # Skip other errors silently (region not enabled, etc.)

    if assets:
        print(f" → Found {len(assets)} Athena workgroups", file=sys.stderr)
    return assets
|
|
85
102
|
|
|
86
|
-
def
|
|
103
|
+
def _get_all_regions(self) -> list[str]:
|
|
104
|
+
"""Get all enabled AWS regions."""
|
|
105
|
+
try:
|
|
106
|
+
ec2 = self.session.client("ec2", region_name="us-east-1")
|
|
107
|
+
response = ec2.describe_regions(AllRegions=False)
|
|
108
|
+
return [r["RegionName"] for r in response.get("Regions", [])]
|
|
109
|
+
except ClientError:
|
|
110
|
+
# Fallback to common regions
|
|
111
|
+
return ["us-east-1", "us-west-2", "eu-west-1", "ap-southeast-1"]
|
|
112
|
+
|
|
113
|
+
def _get_query_stats(self, athena_client, workgroup_name: str) -> dict[str, Any]:
|
|
87
114
|
"""Get query statistics for a workgroup."""
|
|
88
115
|
stats = {"total_queries": 0, "failed_queries": 0, "last_query_time": None, "idle_days": 0}
|
|
89
116
|
|
|
90
117
|
try:
|
|
91
|
-
# List recent queries
|
|
92
|
-
paginator =
|
|
93
|
-
datetime.utcnow() - timedelta(days=90)
|
|
118
|
+
# List recent queries (limit to avoid long scan times)
|
|
119
|
+
paginator = athena_client.get_paginator("list_query_executions")
|
|
94
120
|
|
|
95
|
-
|
|
121
|
+
query_count = 0
|
|
122
|
+
for page in paginator.paginate(
|
|
123
|
+
WorkGroup=workgroup_name, PaginationConfig={"MaxItems": 100}
|
|
124
|
+
):
|
|
96
125
|
for query_id in page.get("QueryExecutionIds", []):
|
|
126
|
+
query_count += 1
|
|
127
|
+
if query_count > 50: # Limit for performance
|
|
128
|
+
break
|
|
129
|
+
|
|
97
130
|
try:
|
|
98
|
-
query_info =
|
|
99
|
-
QueryExecutionId=query_id
|
|
100
|
-
)
|
|
131
|
+
query_info = athena_client.get_query_execution(QueryExecutionId=query_id)
|
|
101
132
|
execution = query_info.get("QueryExecution", {})
|
|
102
133
|
status = execution.get("Status", {})
|
|
103
134
|
|
|
@@ -107,7 +138,7 @@ class AthenaCollector:
|
|
|
107
138
|
stats["failed_queries"] += 1
|
|
108
139
|
|
|
109
140
|
# Get last query time
|
|
110
|
-
completion_time =
|
|
141
|
+
completion_time = status.get("CompletionDateTime")
|
|
111
142
|
if completion_time:
|
|
112
143
|
if (
|
|
113
144
|
not stats["last_query_time"]
|
|
@@ -117,10 +148,14 @@ class AthenaCollector:
|
|
|
117
148
|
except ClientError:
|
|
118
149
|
continue
|
|
119
150
|
|
|
151
|
+
if query_count > 50:
|
|
152
|
+
break
|
|
153
|
+
|
|
120
154
|
# Calculate idle days
|
|
121
155
|
if stats["last_query_time"]:
|
|
122
156
|
last_query = datetime.fromisoformat(stats["last_query_time"].replace("Z", "+00:00"))
|
|
123
|
-
|
|
157
|
+
now = datetime.now(timezone.utc)
|
|
158
|
+
stats["idle_days"] = (now - last_query).days
|
|
124
159
|
else:
|
|
125
160
|
stats["idle_days"] = 999 # Never used
|
|
126
161
|
|
|
@@ -14,48 +14,79 @@ from nuvu_scan.core.base import Asset, NormalizedCategory
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class GlueCollector:
|
|
17
|
-
"""Collects AWS Glue Data Catalog resources."""
|
|
17
|
+
"""Collects AWS Glue Data Catalog resources across all regions."""
|
|
18
18
|
|
|
19
19
|
def __init__(self, session: boto3.Session, regions: list[str] | None = None):
|
|
20
20
|
self.session = session
|
|
21
|
-
self.regions = regions or [
|
|
22
|
-
|
|
23
|
-
self.glue_client =
|
|
21
|
+
self.regions = regions or []
|
|
22
|
+
# These will be set per-region during collection
|
|
23
|
+
self.glue_client = None
|
|
24
|
+
self.region = None
|
|
24
25
|
# Cache crawler run times to associate with tables
|
|
25
26
|
self._crawler_last_runs: dict[str, datetime | None] = {}
|
|
26
27
|
self._db_to_crawler: dict[str, str] = {}
|
|
27
28
|
|
|
29
|
+
def _get_all_regions(self) -> list[str]:
|
|
30
|
+
"""Get all enabled AWS regions."""
|
|
31
|
+
try:
|
|
32
|
+
ec2 = self.session.client("ec2", region_name="us-east-1")
|
|
33
|
+
response = ec2.describe_regions(AllRegions=False)
|
|
34
|
+
return [r["RegionName"] for r in response.get("Regions", [])]
|
|
35
|
+
except ClientError:
|
|
36
|
+
# Fallback to common regions
|
|
37
|
+
return ["us-east-1", "us-west-2", "eu-west-1", "ap-southeast-1"]
|
|
38
|
+
|
|
28
39
|
def collect(self) -> list[Asset]:
    """Collect Glue crawlers, jobs, connections and databases/tables.

    Uses ``self.regions`` when it is non-empty, otherwise the list returned
    by ``self._get_all_regions()``. For each region the Glue client and the
    crawler caches on ``self`` are reset before the four collectors run.
    A progress line and four per-kind totals are written to stderr.

    Returns:
        All assets from every region, in collection order.
    """
    import sys

    target_regions = self.regions if self.regions else self._get_all_regions()

    print(
        f" → Checking {len(target_regions)} regions for Glue resources...", file=sys.stderr
    )

    # (label, collector) pairs, run in this order for every region.
    # NOTE(review): crawlers run first and databases/tables last; presumably
    # the table stage reads the crawler caches filled earlier - confirm
    # before reordering.
    stages = (
        ("crawlers", self._collect_crawlers),
        ("jobs", self._collect_jobs),
        ("connections", self._collect_connections),
        ("databases/tables", self._collect_databases_and_tables),
    )
    found = {label: 0 for label, _ in stages}
    collected = []

    for region_name in target_regions:
        # The collectors read their client and region from instance state.
        self.region = region_name
        self.glue_client = self.session.client("glue", region_name=region_name)
        # Start each region with empty crawler caches.
        self._crawler_last_runs = {}
        self._db_to_crawler = {}

        for label, run_stage in stages:
            batch = run_stage()
            collected.extend(batch)
            found[label] += len(batch)

    for label, count in found.items():
        print(f" → Found {count} {label}", file=sys.stderr)

    return collected
|
|
59
90
|
|
|
60
91
|
def _collect_crawlers(self) -> list[Asset]:
|
|
61
92
|
"""Collect Glue Crawlers with detailed status."""
|
|
@@ -152,7 +183,17 @@ class GlueCollector:
|
|
|
152
183
|
)
|
|
153
184
|
|
|
154
185
|
except ClientError as e:
|
|
155
|
-
|
|
186
|
+
import sys
|
|
187
|
+
|
|
188
|
+
error_code = e.response.get("Error", {}).get("Code", "Unknown")
|
|
189
|
+
if error_code == "AccessDeniedException":
|
|
190
|
+
print(
|
|
191
|
+
" ⚠️ No permission to list Glue crawlers. "
|
|
192
|
+
"Add 'glue:GetCrawlers' to IAM policy.",
|
|
193
|
+
file=sys.stderr,
|
|
194
|
+
)
|
|
195
|
+
else:
|
|
196
|
+
print(f" ⚠️ Error collecting Glue crawlers: {e}", file=sys.stderr)
|
|
156
197
|
|
|
157
198
|
return assets
|
|
158
199
|
|
|
@@ -248,7 +289,16 @@ class GlueCollector:
|
|
|
248
289
|
)
|
|
249
290
|
|
|
250
291
|
except ClientError as e:
|
|
251
|
-
|
|
292
|
+
import sys
|
|
293
|
+
|
|
294
|
+
error_code = e.response.get("Error", {}).get("Code", "Unknown")
|
|
295
|
+
if error_code == "AccessDeniedException":
|
|
296
|
+
print(
|
|
297
|
+
" ⚠️ No permission to list Glue jobs. Add 'glue:GetJobs' to IAM policy.",
|
|
298
|
+
file=sys.stderr,
|
|
299
|
+
)
|
|
300
|
+
else:
|
|
301
|
+
print(f" ⚠️ Error collecting Glue jobs: {e}", file=sys.stderr)
|
|
252
302
|
|
|
253
303
|
return assets
|
|
254
304
|
|
|
@@ -307,7 +357,17 @@ class GlueCollector:
|
|
|
307
357
|
)
|
|
308
358
|
|
|
309
359
|
except ClientError as e:
|
|
310
|
-
|
|
360
|
+
import sys
|
|
361
|
+
|
|
362
|
+
error_code = e.response.get("Error", {}).get("Code", "Unknown")
|
|
363
|
+
if error_code == "AccessDeniedException":
|
|
364
|
+
print(
|
|
365
|
+
" ⚠️ No permission to list Glue connections. "
|
|
366
|
+
"Add 'glue:GetConnections' to IAM policy.",
|
|
367
|
+
file=sys.stderr,
|
|
368
|
+
)
|
|
369
|
+
else:
|
|
370
|
+
print(f" ⚠️ Error collecting Glue connections: {e}", file=sys.stderr)
|
|
311
371
|
|
|
312
372
|
return assets
|
|
313
373
|
|
|
@@ -452,7 +512,17 @@ class GlueCollector:
|
|
|
452
512
|
pass
|
|
453
513
|
|
|
454
514
|
except ClientError as e:
|
|
455
|
-
|
|
515
|
+
import sys
|
|
516
|
+
|
|
517
|
+
error_code = e.response.get("Error", {}).get("Code", "Unknown")
|
|
518
|
+
if error_code == "AccessDeniedException":
|
|
519
|
+
print(
|
|
520
|
+
" ⚠️ No permission to list Glue databases/tables. "
|
|
521
|
+
"Add 'glue:GetDatabases' and 'glue:GetTables' to IAM policy.",
|
|
522
|
+
file=sys.stderr,
|
|
523
|
+
)
|
|
524
|
+
else:
|
|
525
|
+
print(f" ⚠️ Error collecting Glue resources: {e}", file=sys.stderr)
|
|
456
526
|
|
|
457
527
|
return assets
|
|
458
528
|
|
|
@@ -91,8 +91,8 @@ class MWAACollector:
|
|
|
91
91
|
environment_class, min_workers, max_workers
|
|
92
92
|
)
|
|
93
93
|
|
|
94
|
-
# Check for risk flags
|
|
95
|
-
risk_flags = self._check_risks(environment, tags)
|
|
94
|
+
# Check for risk flags (pass ownership for proper no_owner detection)
|
|
95
|
+
risk_flags = self._check_risks(environment, tags, ownership)
|
|
96
96
|
|
|
97
97
|
# Create asset
|
|
98
98
|
asset = Asset(
|
|
@@ -251,7 +251,9 @@ class MWAACollector:
|
|
|
251
251
|
|
|
252
252
|
return base_monthly + worker_monthly
|
|
253
253
|
|
|
254
|
-
def _check_risks(
|
|
254
|
+
def _check_risks(
|
|
255
|
+
self, environment: dict, tags: dict[str, str], ownership: dict | None = None
|
|
256
|
+
) -> list[str]:
|
|
255
257
|
"""Check for risk flags in MWAA environment."""
|
|
256
258
|
risks = []
|
|
257
259
|
|
|
@@ -260,8 +262,11 @@ class MWAACollector:
|
|
|
260
262
|
if status in ["CREATING", "DELETING", "UPDATING"]:
|
|
261
263
|
risks.append("environment_in_transition")
|
|
262
264
|
|
|
263
|
-
# Check for missing owner
|
|
264
|
-
if
|
|
265
|
+
# Check for missing owner - only flag if we couldn't infer owner at all
|
|
266
|
+
if ownership:
|
|
267
|
+
if not ownership.get("owner") and ownership.get("confidence") == "unknown":
|
|
268
|
+
risks.append("no_owner")
|
|
269
|
+
elif not tags.get("owner") and not tags.get("Owner"):
|
|
265
270
|
risks.append("no_owner")
|
|
266
271
|
|
|
267
272
|
# Check for public access (if webserver is publicly accessible)
|