nuvu-scan 2.0.2__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  """
2
2
  Amazon Athena collector.
3
3
 
4
- Collects Athena workgroups and query history.
4
+ Collects Athena workgroups and query history across all regions.
5
5
  """
6
6
 
7
- from datetime import datetime, timedelta
7
+ from datetime import datetime, timezone
8
8
  from typing import Any
9
9
 
10
10
  import boto3
@@ -14,90 +14,121 @@ from nuvu_scan.core.base import Asset, NormalizedCategory
14
14
 
15
15
 
16
16
  class AthenaCollector:
17
- """Collects Amazon Athena resources."""
17
+ """Collects Amazon Athena resources across all regions."""
18
18
 
19
19
  def __init__(self, session: boto3.Session, regions: list[str] | None = None):
20
20
  self.session = session
21
- self.regions = regions or ["us-east-1"]
22
- self.athena_client = session.client("athena", region_name="us-east-1")
21
+ self.regions = regions or []
23
22
 
24
23
  def collect(self) -> list[Asset]:
25
- """Collect Athena workgroups."""
24
+ """Collect Athena workgroups from all regions."""
26
25
  import sys
27
26
 
28
27
  assets = []
29
28
 
30
- try:
31
- # List workgroups
32
- print(" → Listing Athena workgroups...", file=sys.stderr)
33
- response = self.athena_client.list_work_groups()
34
-
35
- for wg_info in response.get("WorkGroups", []):
36
- wg_name = wg_info["Name"]
37
-
38
- try:
39
- # Get workgroup details
40
- wg_details = self.athena_client.get_work_group(WorkGroup=wg_name)
41
- wg_details.get("WorkGroup", {}).get("Configuration", {})
42
-
43
- # Get query statistics
44
- query_stats = self._get_query_stats(wg_name)
45
-
46
- risk_flags = []
47
- if query_stats.get("idle_days", 0) > 90:
48
- risk_flags.append("idle_workgroup")
49
- if (
50
- query_stats.get("failed_queries", 0)
51
- > query_stats.get("total_queries", 1) * 0.5
52
- ):
53
- risk_flags.append("high_failure_rate")
54
-
55
- assets.append(
56
- Asset(
57
- provider="aws",
58
- asset_type="athena_workgroup",
59
- normalized_category=NormalizedCategory.QUERY_ENGINE,
60
- service="Athena",
61
- region="us-east-1",
62
- arn=f"arn:aws:athena:us-east-1::workgroup/{wg_name}",
63
- name=wg_name,
64
- created_at=(
65
- wg_details.get("WorkGroup", {}).get("CreationTime", "").isoformat()
66
- if wg_details.get("WorkGroup", {}).get("CreationTime")
67
- else None
68
- ),
69
- last_activity_at=query_stats.get("last_query_time"),
70
- risk_flags=risk_flags,
71
- usage_metrics={
72
- **query_stats,
73
- "last_used": query_stats.get("last_query_time"),
74
- "days_since_last_use": query_stats.get("idle_days"),
75
- },
29
+ # If no regions specified, get all enabled regions
30
+ regions_to_check = self.regions if self.regions else self._get_all_regions()
31
+
32
+ print(
33
+ f" → Checking {len(regions_to_check)} regions for Athena workgroups...",
34
+ file=sys.stderr,
35
+ )
36
+
37
+ for region in regions_to_check:
38
+ try:
39
+ athena_client = self.session.client("athena", region_name=region)
40
+ response = athena_client.list_work_groups()
41
+
42
+ for wg_info in response.get("WorkGroups", []):
43
+ wg_name = wg_info["Name"]
44
+
45
+ try:
46
+ # Get workgroup details
47
+ wg_details = athena_client.get_work_group(WorkGroup=wg_name)
48
+
49
+ # Get query statistics
50
+ query_stats = self._get_query_stats(athena_client, wg_name)
51
+
52
+ risk_flags = []
53
+ if query_stats.get("idle_days", 0) > 90:
54
+ risk_flags.append("idle_workgroup")
55
+ if (
56
+ query_stats.get("failed_queries", 0)
57
+ > query_stats.get("total_queries", 1) * 0.5
58
+ ):
59
+ risk_flags.append("high_failure_rate")
60
+
61
+ assets.append(
62
+ Asset(
63
+ provider="aws",
64
+ asset_type="athena_workgroup",
65
+ normalized_category=NormalizedCategory.QUERY_ENGINE,
66
+ service="Athena",
67
+ region=region,
68
+ arn=f"arn:aws:athena:{region}::workgroup/{wg_name}",
69
+ name=wg_name,
70
+ created_at=(
71
+ wg_details.get("WorkGroup", {})
72
+ .get("CreationTime", "")
73
+ .isoformat()
74
+ if wg_details.get("WorkGroup", {}).get("CreationTime")
75
+ else None
76
+ ),
77
+ last_activity_at=query_stats.get("last_query_time"),
78
+ risk_flags=risk_flags,
79
+ usage_metrics={
80
+ **query_stats,
81
+ "last_used": query_stats.get("last_query_time"),
82
+ "days_since_last_use": query_stats.get("idle_days"),
83
+ },
84
+ )
76
85
  )
77
- )
78
- except ClientError:
79
- continue
86
+ except ClientError:
87
+ continue
80
88
 
81
- except ClientError as e:
82
- print(f"Error collecting Athena resources: {e}")
89
+ except ClientError as e:
90
+ error_code = e.response.get("Error", {}).get("Code", "Unknown")
91
+ if error_code == "AccessDeniedException":
92
+ print(
93
+ f" ⚠️ No permission to list Athena workgroups in {region}. "
94
+ "Add 'athena:ListWorkGroups' to IAM policy.",
95
+ file=sys.stderr,
96
+ )
97
+ # Skip other errors silently (region not enabled, etc.)
83
98
 
99
+ if assets:
100
+ print(f" → Found {len(assets)} Athena workgroups", file=sys.stderr)
84
101
  return assets
85
102
 
86
- def _get_query_stats(self, workgroup_name: str) -> dict[str, Any]:
103
+ def _get_all_regions(self) -> list[str]:
104
+ """Get all enabled AWS regions."""
105
+ try:
106
+ ec2 = self.session.client("ec2", region_name="us-east-1")
107
+ response = ec2.describe_regions(AllRegions=False)
108
+ return [r["RegionName"] for r in response.get("Regions", [])]
109
+ except ClientError:
110
+ # Fallback to common regions
111
+ return ["us-east-1", "us-west-2", "eu-west-1", "ap-southeast-1"]
112
+
113
+ def _get_query_stats(self, athena_client, workgroup_name: str) -> dict[str, Any]:
87
114
  """Get query statistics for a workgroup."""
88
115
  stats = {"total_queries": 0, "failed_queries": 0, "last_query_time": None, "idle_days": 0}
89
116
 
90
117
  try:
91
- # List recent queries
92
- paginator = self.athena_client.get_paginator("list_query_executions")
93
- datetime.utcnow() - timedelta(days=90)
118
+ # List recent queries (limit to avoid long scan times)
119
+ paginator = athena_client.get_paginator("list_query_executions")
94
120
 
95
- for page in paginator.paginate(WorkGroup=workgroup_name):
121
+ query_count = 0
122
+ for page in paginator.paginate(
123
+ WorkGroup=workgroup_name, PaginationConfig={"MaxItems": 100}
124
+ ):
96
125
  for query_id in page.get("QueryExecutionIds", []):
126
+ query_count += 1
127
+ if query_count > 50: # Limit for performance
128
+ break
129
+
97
130
  try:
98
- query_info = self.athena_client.get_query_execution(
99
- QueryExecutionId=query_id
100
- )
131
+ query_info = athena_client.get_query_execution(QueryExecutionId=query_id)
101
132
  execution = query_info.get("QueryExecution", {})
102
133
  status = execution.get("Status", {})
103
134
 
@@ -107,7 +138,7 @@ class AthenaCollector:
107
138
  stats["failed_queries"] += 1
108
139
 
109
140
  # Get last query time
110
- completion_time = execution.get("Status", {}).get("CompletionDateTime")
141
+ completion_time = status.get("CompletionDateTime")
111
142
  if completion_time:
112
143
  if (
113
144
  not stats["last_query_time"]
@@ -117,10 +148,14 @@ class AthenaCollector:
117
148
  except ClientError:
118
149
  continue
119
150
 
151
+ if query_count > 50:
152
+ break
153
+
120
154
  # Calculate idle days
121
155
  if stats["last_query_time"]:
122
156
  last_query = datetime.fromisoformat(stats["last_query_time"].replace("Z", "+00:00"))
123
- stats["idle_days"] = (datetime.utcnow() - last_query.replace(tzinfo=None)).days
157
+ now = datetime.now(timezone.utc)
158
+ stats["idle_days"] = (now - last_query).days
124
159
  else:
125
160
  stats["idle_days"] = 999 # Never used
126
161
 
@@ -14,48 +14,79 @@ from nuvu_scan.core.base import Asset, NormalizedCategory
14
14
 
15
15
 
16
16
  class GlueCollector:
17
- """Collects AWS Glue Data Catalog resources."""
17
+ """Collects AWS Glue Data Catalog resources across all regions."""
18
18
 
19
19
  def __init__(self, session: boto3.Session, regions: list[str] | None = None):
20
20
  self.session = session
21
- self.regions = regions or ["us-east-1"] # Glue is regional but catalog is global
22
- self.region = self.regions[0] if self.regions else "us-east-1"
23
- self.glue_client = session.client("glue", region_name=self.region)
21
+ self.regions = regions or []
22
+ # These will be set per-region during collection
23
+ self.glue_client = None
24
+ self.region = None
24
25
  # Cache crawler run times to associate with tables
25
26
  self._crawler_last_runs: dict[str, datetime | None] = {}
26
27
  self._db_to_crawler: dict[str, str] = {}
27
28
 
29
+ def _get_all_regions(self) -> list[str]:
30
+ """Get all enabled AWS regions."""
31
+ try:
32
+ ec2 = self.session.client("ec2", region_name="us-east-1")
33
+ response = ec2.describe_regions(AllRegions=False)
34
+ return [r["RegionName"] for r in response.get("Regions", [])]
35
+ except ClientError:
36
+ # Fallback to common regions
37
+ return ["us-east-1", "us-west-2", "eu-west-1", "ap-southeast-1"]
38
+
28
39
  def collect(self) -> list[Asset]:
29
- """Collect all Glue resources: databases, tables, crawlers, jobs, connections."""
40
+ """Collect all Glue resources across all regions."""
30
41
  import sys
31
42
 
32
- assets = []
43
+ all_assets = []
33
44
 
34
- # First, collect crawlers to build database-to-crawler mapping
35
- print(" → Collecting Glue crawlers...", file=sys.stderr)
36
- crawler_assets = self._collect_crawlers()
37
- assets.extend(crawler_assets)
38
- print(f" → Found {len(crawler_assets)} crawlers", file=sys.stderr)
39
-
40
- # Collect ETL jobs
41
- print(" → Collecting Glue ETL jobs...", file=sys.stderr)
42
- job_assets = self._collect_jobs()
43
- assets.extend(job_assets)
44
- print(f" → Found {len(job_assets)} jobs", file=sys.stderr)
45
-
46
- # Collect connections
47
- print(" → Collecting Glue connections...", file=sys.stderr)
48
- conn_assets = self._collect_connections()
49
- assets.extend(conn_assets)
50
- print(f" → Found {len(conn_assets)} connections", file=sys.stderr)
51
-
52
- # Collect databases and tables (using crawler info for last_activity)
53
- print(" → Collecting Glue databases and tables...", file=sys.stderr)
54
- db_assets = self._collect_databases_and_tables()
55
- assets.extend(db_assets)
56
- print(f" → Found {len(db_assets)} databases/tables", file=sys.stderr)
45
+ # If no regions specified, get all enabled regions
46
+ regions_to_check = self.regions if self.regions else self._get_all_regions()
57
47
 
58
- return assets
48
+ print(
49
+ f" → Checking {len(regions_to_check)} regions for Glue resources...", file=sys.stderr
50
+ )
51
+
52
+ total_crawlers = 0
53
+ total_jobs = 0
54
+ total_connections = 0
55
+ total_dbs = 0
56
+
57
+ for region in regions_to_check:
58
+ # Set up client for this region
59
+ self.region = region
60
+ self.glue_client = self.session.client("glue", region_name=region)
61
+ self._crawler_last_runs = {}
62
+ self._db_to_crawler = {}
63
+
64
+ # Collect crawlers
65
+ crawler_assets = self._collect_crawlers()
66
+ all_assets.extend(crawler_assets)
67
+ total_crawlers += len(crawler_assets)
68
+
69
+ # Collect ETL jobs
70
+ job_assets = self._collect_jobs()
71
+ all_assets.extend(job_assets)
72
+ total_jobs += len(job_assets)
73
+
74
+ # Collect connections
75
+ conn_assets = self._collect_connections()
76
+ all_assets.extend(conn_assets)
77
+ total_connections += len(conn_assets)
78
+
79
+ # Collect databases and tables
80
+ db_assets = self._collect_databases_and_tables()
81
+ all_assets.extend(db_assets)
82
+ total_dbs += len(db_assets)
83
+
84
+ print(f" → Found {total_crawlers} crawlers", file=sys.stderr)
85
+ print(f" → Found {total_jobs} jobs", file=sys.stderr)
86
+ print(f" → Found {total_connections} connections", file=sys.stderr)
87
+ print(f" → Found {total_dbs} databases/tables", file=sys.stderr)
88
+
89
+ return all_assets
59
90
 
60
91
  def _collect_crawlers(self) -> list[Asset]:
61
92
  """Collect Glue Crawlers with detailed status."""
@@ -152,7 +183,17 @@ class GlueCollector:
152
183
  )
153
184
 
154
185
  except ClientError as e:
155
- print(f"Error collecting Glue crawlers: {e}")
186
+ import sys
187
+
188
+ error_code = e.response.get("Error", {}).get("Code", "Unknown")
189
+ if error_code == "AccessDeniedException":
190
+ print(
191
+ " ⚠️ No permission to list Glue crawlers. "
192
+ "Add 'glue:GetCrawlers' to IAM policy.",
193
+ file=sys.stderr,
194
+ )
195
+ else:
196
+ print(f" ⚠️ Error collecting Glue crawlers: {e}", file=sys.stderr)
156
197
 
157
198
  return assets
158
199
 
@@ -248,7 +289,16 @@ class GlueCollector:
248
289
  )
249
290
 
250
291
  except ClientError as e:
251
- print(f"Error collecting Glue jobs: {e}")
292
+ import sys
293
+
294
+ error_code = e.response.get("Error", {}).get("Code", "Unknown")
295
+ if error_code == "AccessDeniedException":
296
+ print(
297
+ " ⚠️ No permission to list Glue jobs. Add 'glue:GetJobs' to IAM policy.",
298
+ file=sys.stderr,
299
+ )
300
+ else:
301
+ print(f" ⚠️ Error collecting Glue jobs: {e}", file=sys.stderr)
252
302
 
253
303
  return assets
254
304
 
@@ -307,7 +357,17 @@ class GlueCollector:
307
357
  )
308
358
 
309
359
  except ClientError as e:
310
- print(f"Error collecting Glue connections: {e}")
360
+ import sys
361
+
362
+ error_code = e.response.get("Error", {}).get("Code", "Unknown")
363
+ if error_code == "AccessDeniedException":
364
+ print(
365
+ " ⚠️ No permission to list Glue connections. "
366
+ "Add 'glue:GetConnections' to IAM policy.",
367
+ file=sys.stderr,
368
+ )
369
+ else:
370
+ print(f" ⚠️ Error collecting Glue connections: {e}", file=sys.stderr)
311
371
 
312
372
  return assets
313
373
 
@@ -452,7 +512,17 @@ class GlueCollector:
452
512
  pass
453
513
 
454
514
  except ClientError as e:
455
- print(f"Error collecting Glue resources: {e}")
515
+ import sys
516
+
517
+ error_code = e.response.get("Error", {}).get("Code", "Unknown")
518
+ if error_code == "AccessDeniedException":
519
+ print(
520
+ " ⚠️ No permission to list Glue databases/tables. "
521
+ "Add 'glue:GetDatabases' and 'glue:GetTables' to IAM policy.",
522
+ file=sys.stderr,
523
+ )
524
+ else:
525
+ print(f" ⚠️ Error collecting Glue resources: {e}", file=sys.stderr)
456
526
 
457
527
  return assets
458
528
 
@@ -91,8 +91,8 @@ class MWAACollector:
91
91
  environment_class, min_workers, max_workers
92
92
  )
93
93
 
94
- # Check for risk flags
95
- risk_flags = self._check_risks(environment, tags)
94
+ # Check for risk flags (pass ownership for proper no_owner detection)
95
+ risk_flags = self._check_risks(environment, tags, ownership)
96
96
 
97
97
  # Create asset
98
98
  asset = Asset(
@@ -251,7 +251,9 @@ class MWAACollector:
251
251
 
252
252
  return base_monthly + worker_monthly
253
253
 
254
- def _check_risks(self, environment: dict, tags: dict[str, str]) -> list[str]:
254
+ def _check_risks(
255
+ self, environment: dict, tags: dict[str, str], ownership: dict | None = None
256
+ ) -> list[str]:
255
257
  """Check for risk flags in MWAA environment."""
256
258
  risks = []
257
259
 
@@ -260,8 +262,11 @@ class MWAACollector:
260
262
  if status in ["CREATING", "DELETING", "UPDATING"]:
261
263
  risks.append("environment_in_transition")
262
264
 
263
- # Check for missing owner
264
- if not tags.get("owner") and not tags.get("Owner"):
265
+ # Check for missing owner - only flag if we couldn't infer owner at all
266
+ if ownership:
267
+ if not ownership.get("owner") and ownership.get("confidence") == "unknown":
268
+ risks.append("no_owner")
269
+ elif not tags.get("owner") and not tags.get("Owner"):
265
270
  risks.append("no_owner")
266
271
 
267
272
  # Check for public access (if webserver is publicly accessible)