nuvu-scan 1.3.7__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
 """
 AWS Glue Data Catalog collector.
 
-Collects Glue databases, tables, and jobs.
+Collects Glue databases, tables, crawlers, ETL jobs, and connections.
 """
 
+from datetime import datetime, timezone
 from typing import Any
 
 import boto3
@@ -18,10 +19,300 @@ class GlueCollector:
     def __init__(self, session: boto3.Session, regions: list[str] | None = None):
         self.session = session
         self.regions = regions or ["us-east-1"]  # Glue is regional but catalog is global
-        self.glue_client = session.client("glue", region_name="us-east-1")
+        self.region = self.regions[0] if self.regions else "us-east-1"
+        self.glue_client = session.client("glue", region_name=self.region)
+        # Cache crawler run times to associate with tables
+        self._crawler_last_runs: dict[str, datetime | None] = {}
+        self._db_to_crawler: dict[str, str] = {}
 
     def collect(self) -> list[Asset]:
-        """Collect Glue databases and tables."""
+        """Collect all Glue resources: databases, tables, crawlers, jobs, connections."""
+        import sys
+
+        assets = []
+
+        # First, collect crawlers to build database-to-crawler mapping
+        print(" → Collecting Glue crawlers...", file=sys.stderr)
+        crawler_assets = self._collect_crawlers()
+        assets.extend(crawler_assets)
+        print(f" → Found {len(crawler_assets)} crawlers", file=sys.stderr)
+
+        # Collect ETL jobs
+        print(" → Collecting Glue ETL jobs...", file=sys.stderr)
+        job_assets = self._collect_jobs()
+        assets.extend(job_assets)
+        print(f" → Found {len(job_assets)} jobs", file=sys.stderr)
+
+        # Collect connections
+        print(" → Collecting Glue connections...", file=sys.stderr)
+        conn_assets = self._collect_connections()
+        assets.extend(conn_assets)
+        print(f" → Found {len(conn_assets)} connections", file=sys.stderr)
+
+        # Collect databases and tables (using crawler info for last_activity)
+        print(" → Collecting Glue databases and tables...", file=sys.stderr)
+        db_assets = self._collect_databases_and_tables()
+        assets.extend(db_assets)
+        print(f" → Found {len(db_assets)} databases/tables", file=sys.stderr)
+
+        return assets
+
+    def _collect_crawlers(self) -> list[Asset]:
+        """Collect Glue Crawlers with detailed status."""
+        assets = []
+
+        try:
+            # List all crawlers
+            paginator = self.glue_client.get_paginator("get_crawlers")
+
+            for page in paginator.paginate():
+                for crawler in page.get("Crawlers", []):
+                    crawler_name = crawler.get("Name", "")
+                    db_name = crawler.get("DatabaseName", "")
+                    state = crawler.get("State", "UNKNOWN")
+
+                    # Map database to crawler for later use
+                    if db_name:
+                        self._db_to_crawler[db_name] = crawler_name
+
+                    # Get last crawl info
+                    last_crawl = crawler.get("LastCrawl", {})
+                    last_crawl_status = last_crawl.get("Status", "UNKNOWN")
+                    last_crawl_time = last_crawl.get("StartTime")
+
+                    if last_crawl_time:
+                        self._crawler_last_runs[crawler_name] = last_crawl_time
+
+                    # Get schedule info
+                    schedule = crawler.get("Schedule", {})
+                    schedule_expr = schedule.get("ScheduleExpression") if schedule else None
+                    schedule_state = schedule.get("State") if schedule else None
+
+                    # Determine if crawler is stale (no schedule OR hasn't run in 90+ days)
+                    risk_flags = []
+                    days_since_last_run = None
+
+                    if last_crawl_time:
+                        if isinstance(last_crawl_time, datetime):
+                            last_dt = last_crawl_time
+                        else:
+                            last_dt = datetime.fromisoformat(
+                                str(last_crawl_time).replace("Z", "+00:00")
+                            )
+                        now = datetime.now(timezone.utc)
+                        days_since_last_run = (now - last_dt).days
+
+                        if days_since_last_run > 90:
+                            risk_flags.append("stale_crawler")
+
+                    # No schedule and not recently run = potentially abandoned
+                    if not schedule_expr and (
+                        days_since_last_run is None or days_since_last_run > 30
+                    ):
+                        risk_flags.append("no_schedule")
+
+                    # Never run
+                    if not last_crawl_time:
+                        risk_flags.append("never_run")
+
+                    tags = self._get_crawler_tags(crawler_name)
+                    ownership = self._infer_ownership(tags, crawler_name)
+
+                    assets.append(
+                        Asset(
+                            provider="aws",
+                            asset_type="glue_crawler",
+                            normalized_category=NormalizedCategory.DATA_PIPELINE,
+                            service="Glue",
+                            region=self.region,
+                            arn=f"arn:aws:glue:{self.region}::crawler/{crawler_name}",
+                            name=crawler_name,
+                            created_at=crawler.get("CreationTime").isoformat()
+                            if crawler.get("CreationTime")
+                            else None,
+                            tags=tags,
+                            risk_flags=risk_flags,
+                            ownership_confidence=ownership["confidence"],
+                            suggested_owner=ownership["owner"],
+                            last_activity_at=last_crawl_time.isoformat()
+                            if last_crawl_time
+                            else None,
+                            usage_metrics={
+                                "state": state,
+                                "last_crawl_status": last_crawl_status,
+                                "schedule_expression": schedule_expr,
+                                "schedule_state": schedule_state,
+                                "database_name": db_name,
+                                "days_since_last_run": days_since_last_run,
+                                "tables_created": last_crawl.get("TablesCreated", 0),
+                                "tables_updated": last_crawl.get("TablesUpdated", 0),
+                                "tables_deleted": last_crawl.get("TablesDeleted", 0),
+                            },
+                        )
+                    )
+
+        except ClientError as e:
+            print(f"Error collecting Glue crawlers: {e}")
+
+        return assets
+
+    def _collect_jobs(self) -> list[Asset]:
+        """Collect Glue ETL Jobs with run history."""
+        assets = []
+
+        try:
+            # List all jobs
+            paginator = self.glue_client.get_paginator("get_jobs")
+
+            for page in paginator.paginate():
+                for job in page.get("Jobs", []):
+                    job_name = job.get("Name", "")
+
+                    # Get last job run
+                    last_run = None
+                    last_run_status = None
+                    days_since_last_run = None
+
+                    try:
+                        runs_response = self.glue_client.get_job_runs(
+                            JobName=job_name, MaxResults=1
+                        )
+                        runs = runs_response.get("JobRuns", [])
+                        if runs:
+                            last_run = runs[0].get("StartedOn")
+                            last_run_status = runs[0].get("JobRunState", "UNKNOWN")
+
+                            if last_run:
+                                if isinstance(last_run, datetime):
+                                    last_dt = last_run
+                                else:
+                                    last_dt = datetime.fromisoformat(
+                                        str(last_run).replace("Z", "+00:00")
+                                    )
+                                now = datetime.now(timezone.utc)
+                                days_since_last_run = (now - last_dt).days
+                    except ClientError:
+                        pass
+
+                    # Determine risk flags
+                    risk_flags = []
+                    if days_since_last_run is not None and days_since_last_run > 90:
+                        risk_flags.append("stale_job")
+                    if last_run is None:
+                        risk_flags.append("never_run")
+                    if last_run_status in ["FAILED", "ERROR", "TIMEOUT"]:
+                        risk_flags.append("failed_job")
+
+                    tags = self._get_job_tags(job_name)
+                    ownership = self._infer_ownership(tags, job_name)
+
+                    # Estimate cost based on DPU allocation
+                    allocated_capacity = (
+                        job.get("AllocatedCapacity", 0) or job.get("MaxCapacity", 0) or 2
+                    )
+                    # Glue ETL: ~$0.44/DPU-hour, assume average 1 hour run per day for active jobs
+                    estimated_monthly_cost = 0.0
+                    if days_since_last_run is not None and days_since_last_run < 30:
+                        # Active job, estimate based on recent usage
+                        runs_per_month = 30 if days_since_last_run < 7 else 4
+                        estimated_monthly_cost = allocated_capacity * 0.44 * runs_per_month
+
+                    assets.append(
+                        Asset(
+                            provider="aws",
+                            asset_type="glue_job",
+                            normalized_category=NormalizedCategory.DATA_PIPELINE,
+                            service="Glue",
+                            region=self.region,
+                            arn=f"arn:aws:glue:{self.region}::job/{job_name}",
+                            name=job_name,
+                            created_at=job.get("CreatedOn").isoformat()
+                            if job.get("CreatedOn")
+                            else None,
+                            tags=tags,
+                            risk_flags=risk_flags,
+                            ownership_confidence=ownership["confidence"],
+                            suggested_owner=ownership["owner"],
+                            last_activity_at=last_run.isoformat() if last_run else None,
+                            cost_estimate_usd=estimated_monthly_cost,
+                            usage_metrics={
+                                "job_type": job.get("Command", {}).get("Name", "unknown"),
+                                "glue_version": job.get("GlueVersion", "unknown"),
+                                "allocated_capacity": allocated_capacity,
+                                "max_retries": job.get("MaxRetries", 0),
+                                "timeout_minutes": job.get("Timeout"),
+                                "last_run_status": last_run_status,
+                                "days_since_last_run": days_since_last_run,
+                            },
+                        )
+                    )
+
+        except ClientError as e:
+            print(f"Error collecting Glue jobs: {e}")
+
+        return assets
+
+    def _collect_connections(self) -> list[Asset]:
+        """Collect Glue Connections (JDBC, etc.)."""
+        assets = []
+
+        try:
+            response = self.glue_client.get_connections()
+
+            for conn in response.get("ConnectionList", []):
+                conn_name = conn.get("Name", "")
+                conn_type = conn.get("ConnectionType", "UNKNOWN")
+
+                # Get connection properties for governance insights
+                conn_props = conn.get("ConnectionProperties", {})
+                jdbc_url = conn_props.get("JDBC_CONNECTION_URL", "")
+
+                # Detect external data sources
+                risk_flags = []
+                if "redshift" in jdbc_url.lower():
+                    # Redshift connection
+                    pass
+                elif "rds" in jdbc_url.lower() or "aurora" in jdbc_url.lower():
+                    # RDS/Aurora connection
+                    pass
+                elif jdbc_url and not any(x in jdbc_url.lower() for x in ["amazonaws.com", "aws"]):
+                    # External (non-AWS) database connection
+                    risk_flags.append("external_connection")
+
+                # Check if connection has last tested time
+                last_updated = conn.get("LastUpdatedTime")
+
+                assets.append(
+                    Asset(
+                        provider="aws",
+                        asset_type="glue_connection",
+                        normalized_category=NormalizedCategory.DATA_PIPELINE,
+                        service="Glue",
+                        region=self.region,
+                        arn=f"arn:aws:glue:{self.region}::connection/{conn_name}",
+                        name=conn_name,
+                        created_at=conn.get("CreationTime").isoformat()
+                        if conn.get("CreationTime")
+                        else None,
+                        risk_flags=risk_flags,
+                        last_activity_at=last_updated.isoformat() if last_updated else None,
+                        usage_metrics={
+                            "connection_type": conn_type,
+                            "jdbc_url_masked": self._mask_jdbc_url(jdbc_url) if jdbc_url else None,
+                            "physical_connection_requirements": bool(
+                                conn.get("PhysicalConnectionRequirements")
+                            ),
+                        },
+                    )
+                )
+
+        except ClientError as e:
+            print(f"Error collecting Glue connections: {e}")
+
+        return assets
+
+    def _collect_databases_and_tables(self) -> list[Asset]:
+        """Collect Glue databases and tables with improved activity tracking."""
         assets = []
 
         try:
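
Reviewer note on the new job-cost heuristic above: a job is priced at its allocated DPUs × the hard-coded $0.44/DPU-hour rate × an assumed run count (30 runs/month if it ran within the last week, otherwise 4), under the inline comment's assumption of roughly one 1-hour run per invocation. A standalone sketch of that arithmetic, with illustrative numbers only (the function name here is ours, not the package's):

    def estimate_glue_job_monthly_cost(
        allocated_dpus: float, days_since_last_run: int | None
    ) -> float:
        DPU_HOUR_RATE = 0.44  # rate hard-coded by the collector
        if days_since_last_run is None or days_since_last_run >= 30:
            return 0.0  # treated as inactive
        runs_per_month = 30 if days_since_last_run < 7 else 4
        return allocated_dpus * DPU_HOUR_RATE * runs_per_month

    print(estimate_glue_job_monthly_cost(10, 3))   # 132.0 (10 DPUs, ~daily cadence)
    print(estimate_glue_job_monthly_cost(10, 20))  # 17.6  (10 DPUs, ~weekly cadence)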
@@ -32,17 +323,45 @@ class GlueCollector:
             for db_info in page.get("DatabaseList", []):
                 db_name = db_info["Name"]
 
+                # Get last activity from associated crawler
+                last_activity = None
+                crawler_name = self._db_to_crawler.get(db_name)
+                if crawler_name and crawler_name in self._crawler_last_runs:
+                    crawler_time = self._crawler_last_runs[crawler_name]
+                    if crawler_time:
+                        last_activity = (
+                            crawler_time.isoformat()
+                            if isinstance(crawler_time, datetime)
+                            else str(crawler_time)
+                        )
+
                 # Create database asset
                 tags = self._get_tags(f"database/{db_name}")
                 ownership = self._infer_ownership(tags, db_name)
 
+                # Count tables in this database
+                table_count = 0
+                try:
+                    table_paginator = self.glue_client.get_paginator("get_tables")
+                    for table_page in table_paginator.paginate(DatabaseName=db_name):
+                        table_count += len(table_page.get("TableList", []))
+                except ClientError:
+                    pass
+
+                # Detect stale databases (no tables or no recent crawler activity)
+                risk_flags = []
+                if table_count == 0:
+                    risk_flags.append("empty_database")
+                if not crawler_name:
+                    risk_flags.append("no_crawler")
+
                 assets.append(
                     Asset(
                         provider="aws",
                         asset_type="glue_database",
                         normalized_category=NormalizedCategory.DATA_CATALOG,
                         service="Glue",
-                        region="us-east-1",  # Glue catalog is in us-east-1
+                        region=self.region,
                         arn=db_info.get("CatalogId", "") + "::" + db_name,
                         name=db_name,
                         created_at=(
@@ -51,13 +370,15 @@ class GlueCollector:
                             else None
                         ),
                         tags=tags,
+                        risk_flags=risk_flags,
                         ownership_confidence=ownership["confidence"],
                         suggested_owner=ownership["owner"],
-                        last_activity_at=None,
+                        last_activity_at=last_activity,
                         usage_metrics={
-                            "table_count": 0,
-                            "last_used": None,
-                            "days_since_last_use": None,
+                            "table_count": table_count,
+                            "associated_crawler": crawler_name,
+                            "last_used": last_activity,
+                            "days_since_last_use": self._calculate_days_since(last_activity),
                         },
                     )
                 )
@@ -71,19 +392,33 @@ class GlueCollector:
                         table_tags = self._get_tags(f"table/{db_name}/{table_name}")
                         table_ownership = self._infer_ownership(table_tags, table_name)
 
+                        # Get table update time as activity indicator
+                        table_updated = table_info.get("UpdateTime") or table_info.get(
+                            "CreateTime"
+                        )
+                        table_activity = (
+                            table_updated.isoformat() if table_updated else last_activity
+                        )
+
                         # Check if table is empty/unused
                         partition_count = len(table_info.get("PartitionKeys", []))
+                        storage = table_info.get("StorageDescriptor", {})
+
                         risk_flags = []
-                        if partition_count == 0 and not table_info.get("StorageDescriptor"):
+                        if partition_count == 0 and not storage:
                             risk_flags.append("empty_table")
 
+                        # Check for external tables (Spectrum)
+                        location = storage.get("Location", "") if storage else ""
+                        is_external = location.startswith("s3://") if location else False
+
                         assets.append(
                             Asset(
                                 provider="aws",
                                 asset_type="glue_table",
                                 normalized_category=NormalizedCategory.DATA_CATALOG,
                                 service="Glue",
-                                region="us-east-1",
+                                region=self.region,
                                 arn=f"{db_info.get('CatalogId', '')}::{db_name}::{table_name}",
                                 name=f"{db_name}.{table_name}",
                                 created_at=(
@@ -95,11 +430,21 @@ class GlueCollector:
                                 risk_flags=risk_flags,
                                 ownership_confidence=table_ownership["confidence"],
                                 suggested_owner=table_ownership["owner"],
-                                last_activity_at=None,  # Glue tables don't track last access
+                                last_activity_at=table_activity,
                                 usage_metrics={
                                     "partition_count": partition_count,
-                                    "last_used": None,
-                                    "days_since_last_use": None,
+                                    "is_external": is_external,
+                                    "table_type": table_info.get("TableType", ""),
+                                    "input_format": storage.get("InputFormat", "")
+                                    if storage
+                                    else "",
+                                    "location": location[:50] + "..."
+                                    if len(location) > 50
+                                    else location,
+                                    "last_used": table_activity,
+                                    "days_since_last_use": self._calculate_days_since(
+                                        table_activity
+                                    ),
                                 },
                             )
                         )
@@ -120,6 +465,32 @@ class GlueCollector:
         except ClientError:
             return {}
 
+    def _get_crawler_tags(self, crawler_name: str) -> dict[str, str]:
+        """Get tags for a Glue crawler."""
+        try:
+            arn = f"arn:aws:glue:{self.region}:{self._get_account_id()}:crawler/{crawler_name}"
+            response = self.glue_client.get_tags(ResourceArn=arn)
+            return response.get("Tags", {})
+        except ClientError:
+            return {}
+
+    def _get_job_tags(self, job_name: str) -> dict[str, str]:
+        """Get tags for a Glue job."""
+        try:
+            arn = f"arn:aws:glue:{self.region}:{self._get_account_id()}:job/{job_name}"
+            response = self.glue_client.get_tags(ResourceArn=arn)
+            return response.get("Tags", {})
+        except ClientError:
+            return {}
+
+    def _get_account_id(self) -> str:
+        """Get AWS account ID."""
+        try:
+            sts = self.session.client("sts")
+            return sts.get_caller_identity()["Account"]
+        except ClientError:
+            return ""
+
     def _infer_ownership(self, tags: dict[str, str], name: str) -> dict[str, str]:
         """Infer ownership from tags."""
         owner = None
@@ -131,18 +502,52 @@ class GlueCollector:
         elif "team" in tags:
             owner = tags["team"]
             confidence = "medium"
+        elif "Owner" in tags:
+            owner = tags["Owner"]
+            confidence = "high"
+        elif "Team" in tags:
+            owner = tags["Team"]
+            confidence = "medium"
 
         return {"owner": owner, "confidence": confidence}
 
+    def _mask_jdbc_url(self, jdbc_url: str) -> str:
+        """Mask sensitive parts of JDBC URL."""
+        import re
+
+        # Mask password in JDBC URL
+        masked = re.sub(r"password=[^&;]+", "password=***", jdbc_url, flags=re.IGNORECASE)
+        masked = re.sub(r":[^:@]+@", ":***@", masked)
+        return masked
+
+    def _calculate_days_since(self, timestamp: str | None) -> int | None:
+        """Calculate days since a timestamp."""
+        if not timestamp:
+            return None
+        try:
+            dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
+            now = datetime.now(timezone.utc)
+            return (now - dt).days
+        except Exception:
+            return None
+
     def get_usage_metrics(self, asset: Asset) -> dict[str, Any]:
         """Get usage metrics for Glue asset."""
         return asset.usage_metrics or {}
 
     def get_cost_estimate(self, asset: Asset) -> float:
         """Estimate cost for Glue asset."""
+        if asset.cost_estimate_usd:
+            return asset.cost_estimate_usd
+
         # Glue Data Catalog: $1 per 100,000 objects per month
-        # Tables and partitions count as objects
         if asset.asset_type == "glue_table":
-            # Approximate: $0.01 per table per month
             return 0.01
+        elif asset.asset_type == "glue_database":
+            return 0.005
+        elif asset.asset_type == "glue_crawler":
+            # Crawlers: $0.44/DPU-hour, assume minimal usage if not active
+            return 0.50  # Minimal monthly estimate
+        elif asset.asset_type == "glue_connection":
+            return 0.0  # Connections are free
         return 0.0
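
Reviewer note on the new _mask_jdbc_url helper: it makes two regex passes, one over `password=` query parameters and one over `user:password@` credentials embedded in the URL authority. A quick illustration with a made-up URL (the two regexes below are copied verbatim from the method body):

    import re

    url = "jdbc:postgresql://admin:s3cret@db.internal:5432/sales?password=hunter2"
    masked = re.sub(r"password=[^&;]+", "password=***", url, flags=re.IGNORECASE)
    masked = re.sub(r":[^:@]+@", ":***@", masked)
    print(masked)
    # jdbc:postgresql://admin:***@db.internal:5432/sales?password=***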
@@ -25,16 +25,24 @@ class IAMCollector:
 
     def collect(self) -> list[Asset]:
         """Collect IAM roles with data-access permissions."""
+        import sys
+
         assets = []
 
         try:
             # List all IAM roles
+            print(" → Listing IAM roles...", file=sys.stderr)
             paginator = self.iam_client.get_paginator("list_roles")
             roles = []
 
             for page in paginator.paginate():
                 roles.extend(page.get("Roles", []))
 
+            print(
+                f" → Found {len(roles)} roles, checking data-access permissions...",
+                file=sys.stderr,
+            )
+            data_roles_count = 0
             for role in roles:
                 try:
                     role_name = role["RoleName"]
@@ -126,6 +134,7 @@ class IAMCollector:
                     )
 
                     assets.append(asset)
+                    data_roles_count += 1
 
                 except ClientError as e:
                     # Skip roles we can't access
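
Taken together, the 2.0.0 collectors keep the same calling convention as 1.3.7: construct with a boto3 session and call collect(). A minimal usage sketch, assuming a hypothetical import path and Asset attributes mirroring the constructor keywords shown in the diff (the package layout itself is not part of this diff):

    import boto3
    from nuvu_scan.collectors.glue import GlueCollector  # hypothetical module path

    session = boto3.Session()  # default credential/region resolution
    collector = GlueCollector(session, regions=["us-east-1"])
    for asset in collector.collect():
        # attribute names assumed to match the Asset(...) keywords above
        print(asset.asset_type, asset.name, asset.risk_flags)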