nuvu-scan 1.3.7-py3-none-any.whl → 2.0.0-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- nuvu_scan/__init__.py +1 -1
- nuvu_scan/cli/commands/scan.py +94 -0
- nuvu_scan/cli/formatters/html.py +262 -10
- nuvu_scan/cli/main.py +2 -1
- nuvu_scan/core/base.py +6 -0
- nuvu_scan/core/providers/aws/aws_scanner.py +55 -15
- nuvu_scan/core/providers/aws/collectors/athena.py +3 -0
- nuvu_scan/core/providers/aws/collectors/glue.py +420 -15
- nuvu_scan/core/providers/aws/collectors/iam.py +9 -0
- nuvu_scan/core/providers/aws/collectors/redshift.py +718 -40
- nuvu_scan/core/providers/gcp/gcp_scanner.py +42 -10
- {nuvu_scan-1.3.7.dist-info → nuvu_scan-2.0.0.dist-info}/METADATA +86 -20
- {nuvu_scan-1.3.7.dist-info → nuvu_scan-2.0.0.dist-info}/RECORD +15 -15
- {nuvu_scan-1.3.7.dist-info → nuvu_scan-2.0.0.dist-info}/entry_points.txt +1 -0
- {nuvu_scan-1.3.7.dist-info → nuvu_scan-2.0.0.dist-info}/WHEEL +0 -0
nuvu_scan/core/providers/aws/collectors/glue.py

```diff
@@ -1,9 +1,10 @@
 """
 AWS Glue Data Catalog collector.
 
-Collects Glue databases, tables, and
+Collects Glue databases, tables, crawlers, ETL jobs, and connections.
 """
 
+from datetime import datetime, timezone
 from typing import Any
 
 import boto3
```
```diff
@@ -18,10 +19,300 @@ class GlueCollector:
     def __init__(self, session: boto3.Session, regions: list[str] | None = None):
         self.session = session
         self.regions = regions or ["us-east-1"]  # Glue is regional but catalog is global
-        self.
+        self.region = self.regions[0] if self.regions else "us-east-1"
+        self.glue_client = session.client("glue", region_name=self.region)
+        # Cache crawler run times to associate with tables
+        self._crawler_last_runs: dict[str, datetime | None] = {}
+        self._db_to_crawler: dict[str, str] = {}
 
     def collect(self) -> list[Asset]:
-        """Collect Glue databases
+        """Collect all Glue resources: databases, tables, crawlers, jobs, connections."""
+        import sys
+
+        assets = []
+
+        # First, collect crawlers to build database-to-crawler mapping
+        print(" → Collecting Glue crawlers...", file=sys.stderr)
+        crawler_assets = self._collect_crawlers()
+        assets.extend(crawler_assets)
+        print(f" → Found {len(crawler_assets)} crawlers", file=sys.stderr)
+
+        # Collect ETL jobs
+        print(" → Collecting Glue ETL jobs...", file=sys.stderr)
+        job_assets = self._collect_jobs()
+        assets.extend(job_assets)
+        print(f" → Found {len(job_assets)} jobs", file=sys.stderr)
+
+        # Collect connections
+        print(" → Collecting Glue connections...", file=sys.stderr)
+        conn_assets = self._collect_connections()
+        assets.extend(conn_assets)
+        print(f" → Found {len(conn_assets)} connections", file=sys.stderr)
+
+        # Collect databases and tables (using crawler info for last_activity)
+        print(" → Collecting Glue databases and tables...", file=sys.stderr)
+        db_assets = self._collect_databases_and_tables()
+        assets.extend(db_assets)
+        print(f" → Found {len(db_assets)} databases/tables", file=sys.stderr)
+
+        return assets
+
+    def _collect_crawlers(self) -> list[Asset]:
+        """Collect Glue Crawlers with detailed status."""
+        assets = []
+
+        try:
+            # List all crawlers
+            paginator = self.glue_client.get_paginator("get_crawlers")
+
+            for page in paginator.paginate():
+                for crawler in page.get("Crawlers", []):
+                    crawler_name = crawler.get("Name", "")
+                    db_name = crawler.get("DatabaseName", "")
+                    state = crawler.get("State", "UNKNOWN")
+
+                    # Map database to crawler for later use
+                    if db_name:
+                        self._db_to_crawler[db_name] = crawler_name
+
+                    # Get last crawl info
+                    last_crawl = crawler.get("LastCrawl", {})
+                    last_crawl_status = last_crawl.get("Status", "UNKNOWN")
+                    last_crawl_time = last_crawl.get("StartTime")
+
+                    if last_crawl_time:
+                        self._crawler_last_runs[crawler_name] = last_crawl_time
+
+                    # Get schedule info
+                    schedule = crawler.get("Schedule", {})
+                    schedule_expr = schedule.get("ScheduleExpression") if schedule else None
+                    schedule_state = schedule.get("State") if schedule else None
+
+                    # Determine if crawler is stale (no schedule OR hasn't run in 90+ days)
+                    risk_flags = []
+                    days_since_last_run = None
+
+                    if last_crawl_time:
+                        if isinstance(last_crawl_time, datetime):
+                            last_dt = last_crawl_time
+                        else:
+                            last_dt = datetime.fromisoformat(
+                                str(last_crawl_time).replace("Z", "+00:00")
+                            )
+                        now = datetime.now(timezone.utc)
+                        days_since_last_run = (now - last_dt).days
+
+                        if days_since_last_run > 90:
+                            risk_flags.append("stale_crawler")
+
+                    # No schedule and not recently run = potentially abandoned
+                    if not schedule_expr and (
+                        days_since_last_run is None or days_since_last_run > 30
+                    ):
+                        risk_flags.append("no_schedule")
+
+                    # Never run
+                    if not last_crawl_time:
+                        risk_flags.append("never_run")
+
+                    tags = self._get_crawler_tags(crawler_name)
+                    ownership = self._infer_ownership(tags, crawler_name)
+
+                    assets.append(
+                        Asset(
+                            provider="aws",
+                            asset_type="glue_crawler",
+                            normalized_category=NormalizedCategory.DATA_PIPELINE,
+                            service="Glue",
+                            region=self.region,
+                            arn=f"arn:aws:glue:{self.region}::crawler/{crawler_name}",
+                            name=crawler_name,
+                            created_at=crawler.get("CreationTime").isoformat()
+                            if crawler.get("CreationTime")
+                            else None,
+                            tags=tags,
+                            risk_flags=risk_flags,
+                            ownership_confidence=ownership["confidence"],
+                            suggested_owner=ownership["owner"],
+                            last_activity_at=last_crawl_time.isoformat()
+                            if last_crawl_time
+                            else None,
+                            usage_metrics={
+                                "state": state,
+                                "last_crawl_status": last_crawl_status,
+                                "schedule_expression": schedule_expr,
+                                "schedule_state": schedule_state,
+                                "database_name": db_name,
+                                "days_since_last_run": days_since_last_run,
+                                "tables_created": last_crawl.get("TablesCreated", 0),
+                                "tables_updated": last_crawl.get("TablesUpdated", 0),
+                                "tables_deleted": last_crawl.get("TablesDeleted", 0),
+                            },
+                        )
+                    )
+
+        except ClientError as e:
+            print(f"Error collecting Glue crawlers: {e}")
+
+        return assets
+
+    def _collect_jobs(self) -> list[Asset]:
+        """Collect Glue ETL Jobs with run history."""
+        assets = []
+
+        try:
+            # List all jobs
+            paginator = self.glue_client.get_paginator("get_jobs")
+
+            for page in paginator.paginate():
+                for job in page.get("Jobs", []):
+                    job_name = job.get("Name", "")
+
+                    # Get last job run
+                    last_run = None
+                    last_run_status = None
+                    days_since_last_run = None
+
+                    try:
+                        runs_response = self.glue_client.get_job_runs(
+                            JobName=job_name, MaxResults=1
+                        )
+                        runs = runs_response.get("JobRuns", [])
+                        if runs:
+                            last_run = runs[0].get("StartedOn")
+                            last_run_status = runs[0].get("JobRunState", "UNKNOWN")
+
+                        if last_run:
+                            if isinstance(last_run, datetime):
+                                last_dt = last_run
+                            else:
+                                last_dt = datetime.fromisoformat(
+                                    str(last_run).replace("Z", "+00:00")
+                                )
+                            now = datetime.now(timezone.utc)
+                            days_since_last_run = (now - last_dt).days
+                    except ClientError:
+                        pass
+
+                    # Determine risk flags
+                    risk_flags = []
+                    if days_since_last_run is not None and days_since_last_run > 90:
+                        risk_flags.append("stale_job")
+                    if last_run is None:
+                        risk_flags.append("never_run")
+                    if last_run_status in ["FAILED", "ERROR", "TIMEOUT"]:
+                        risk_flags.append("failed_job")
+
+                    tags = self._get_job_tags(job_name)
+                    ownership = self._infer_ownership(tags, job_name)
+
+                    # Estimate cost based on DPU allocation
+                    allocated_capacity = (
+                        job.get("AllocatedCapacity", 0) or job.get("MaxCapacity", 0) or 2
+                    )
+                    # Glue ETL: ~$0.44/DPU-hour, assume average 1 hour run per day for active jobs
+                    estimated_monthly_cost = 0.0
+                    if days_since_last_run is not None and days_since_last_run < 30:
+                        # Active job, estimate based on recent usage
+                        runs_per_month = 30 if days_since_last_run < 7 else 4
+                        estimated_monthly_cost = allocated_capacity * 0.44 * runs_per_month
+
+                    assets.append(
+                        Asset(
+                            provider="aws",
+                            asset_type="glue_job",
+                            normalized_category=NormalizedCategory.DATA_PIPELINE,
+                            service="Glue",
+                            region=self.region,
+                            arn=f"arn:aws:glue:{self.region}::job/{job_name}",
+                            name=job_name,
+                            created_at=job.get("CreatedOn").isoformat()
+                            if job.get("CreatedOn")
+                            else None,
+                            tags=tags,
+                            risk_flags=risk_flags,
+                            ownership_confidence=ownership["confidence"],
+                            suggested_owner=ownership["owner"],
+                            last_activity_at=last_run.isoformat() if last_run else None,
+                            cost_estimate_usd=estimated_monthly_cost,
+                            usage_metrics={
+                                "job_type": job.get("Command", {}).get("Name", "unknown"),
+                                "glue_version": job.get("GlueVersion", "unknown"),
+                                "allocated_capacity": allocated_capacity,
+                                "max_retries": job.get("MaxRetries", 0),
+                                "timeout_minutes": job.get("Timeout"),
+                                "last_run_status": last_run_status,
+                                "days_since_last_run": days_since_last_run,
+                            },
+                        )
+                    )
+
+        except ClientError as e:
+            print(f"Error collecting Glue jobs: {e}")
+
+        return assets
+
+    def _collect_connections(self) -> list[Asset]:
+        """Collect Glue Connections (JDBC, etc.)."""
+        assets = []
+
+        try:
+            response = self.glue_client.get_connections()
+
+            for conn in response.get("ConnectionList", []):
+                conn_name = conn.get("Name", "")
+                conn_type = conn.get("ConnectionType", "UNKNOWN")
+
+                # Get connection properties for governance insights
+                conn_props = conn.get("ConnectionProperties", {})
+                jdbc_url = conn_props.get("JDBC_CONNECTION_URL", "")
+
+                # Detect external data sources
+                risk_flags = []
+                if "redshift" in jdbc_url.lower():
+                    # Redshift connection
+                    pass
+                elif "rds" in jdbc_url.lower() or "aurora" in jdbc_url.lower():
+                    # RDS/Aurora connection
+                    pass
+                elif jdbc_url and not any(x in jdbc_url.lower() for x in ["amazonaws.com", "aws"]):
+                    # External (non-AWS) database connection
+                    risk_flags.append("external_connection")
+
+                # Check if connection has last tested time
+                last_updated = conn.get("LastUpdatedTime")
+
+                assets.append(
+                    Asset(
+                        provider="aws",
+                        asset_type="glue_connection",
+                        normalized_category=NormalizedCategory.DATA_PIPELINE,
+                        service="Glue",
+                        region=self.region,
+                        arn=f"arn:aws:glue:{self.region}::connection/{conn_name}",
+                        name=conn_name,
+                        created_at=conn.get("CreationTime").isoformat()
+                        if conn.get("CreationTime")
+                        else None,
+                        risk_flags=risk_flags,
+                        last_activity_at=last_updated.isoformat() if last_updated else None,
+                        usage_metrics={
+                            "connection_type": conn_type,
+                            "jdbc_url_masked": self._mask_jdbc_url(jdbc_url) if jdbc_url else None,
+                            "physical_connection_requirements": bool(
+                                conn.get("PhysicalConnectionRequirements")
+                            ),
+                        },
+                    )
+                )
+
+        except ClientError as e:
+            print(f"Error collecting Glue connections: {e}")
+
+        return assets
+
+    def _collect_databases_and_tables(self) -> list[Asset]:
+        """Collect Glue databases and tables with improved activity tracking."""
         assets = []
 
         try:
```
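The staleness heuristic introduced above flags a crawler that has not run in more than 90 days, has no schedule and no run within 30 days, or has never run at all. A minimal standalone sketch of that classification (the function name and sample values are illustrative, not package API):

```python
from datetime import datetime, timedelta, timezone


def classify_crawler(last_crawl_time: datetime | None, schedule_expr: str | None) -> list[str]:
    """Reproduce the risk flags the new collector assigns to a crawler."""
    flags = []
    days = None
    if last_crawl_time:
        days = (datetime.now(timezone.utc) - last_crawl_time).days
        if days > 90:
            flags.append("stale_crawler")
    if not schedule_expr and (days is None or days > 30):
        flags.append("no_schedule")
    if not last_crawl_time:
        flags.append("never_run")
    return flags


old_run = datetime.now(timezone.utc) - timedelta(days=120)
print(classify_crawler(old_run, None))               # ['stale_crawler', 'no_schedule']
print(classify_crawler(None, "cron(0 12 * * ? *)"))  # ['never_run']
```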
```diff
@@ -32,17 +323,45 @@ class GlueCollector:
             for db_info in page.get("DatabaseList", []):
                 db_name = db_info["Name"]
 
+                # Get last activity from associated crawler
+                last_activity = None
+                crawler_name = self._db_to_crawler.get(db_name)
+                if crawler_name and crawler_name in self._crawler_last_runs:
+                    crawler_time = self._crawler_last_runs[crawler_name]
+                    if crawler_time:
+                        last_activity = (
+                            crawler_time.isoformat()
+                            if isinstance(crawler_time, datetime)
+                            else str(crawler_time)
+                        )
+
                 # Create database asset
                 tags = self._get_tags(f"database/{db_name}")
                 ownership = self._infer_ownership(tags, db_name)
 
+                # Count tables in this database
+                table_count = 0
+                try:
+                    table_paginator = self.glue_client.get_paginator("get_tables")
+                    for table_page in table_paginator.paginate(DatabaseName=db_name):
+                        table_count += len(table_page.get("TableList", []))
+                except ClientError:
+                    pass
+
+                # Detect stale databases (no tables or no recent crawler activity)
+                risk_flags = []
+                if table_count == 0:
+                    risk_flags.append("empty_database")
+                if not crawler_name:
+                    risk_flags.append("no_crawler")
+
                 assets.append(
                     Asset(
                         provider="aws",
                         asset_type="glue_database",
                         normalized_category=NormalizedCategory.DATA_CATALOG,
                         service="Glue",
-                        region=
+                        region=self.region,
                         arn=db_info.get("CatalogId", "") + "::" + db_name,
                         name=db_name,
                         created_at=(
```
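Database-level activity is attributed through the crawler mapping built earlier in collect(): a database with no associated crawler gets no last_activity and is flagged no_crawler. A toy illustration of the lookup (the dictionary contents are invented):

```python
from datetime import datetime, timezone

# Mirrors self._db_to_crawler and self._crawler_last_runs; values are invented.
db_to_crawler = {"sales_db": "sales-crawler"}
crawler_last_runs = {"sales-crawler": datetime(2024, 1, 5, tzinfo=timezone.utc)}


def last_activity_for(db_name: str) -> str | None:
    crawler = db_to_crawler.get(db_name)
    run = crawler_last_runs.get(crawler) if crawler else None
    return run.isoformat() if run else None


print(last_activity_for("sales_db"))   # 2024-01-05T00:00:00+00:00
print(last_activity_for("orphan_db"))  # None (would also be flagged no_crawler)
```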
```diff
@@ -51,13 +370,15 @@ class GlueCollector:
                             else None
                         ),
                         tags=tags,
+                        risk_flags=risk_flags,
                         ownership_confidence=ownership["confidence"],
                         suggested_owner=ownership["owner"],
-                        last_activity_at=
+                        last_activity_at=last_activity,
                         usage_metrics={
-                            "table_count":
-                            "
-                            "
+                            "table_count": table_count,
+                            "associated_crawler": crawler_name,
+                            "last_used": last_activity,
+                            "days_since_last_use": self._calculate_days_since(last_activity),
                         },
                     )
                 )
```
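The days_since_last_use metric comes from _calculate_days_since, defined later in this diff; it parses the stored ISO string, tolerating a trailing Z. Its behavior in brief (the sample timestamp is invented):

```python
from datetime import datetime, timezone


def days_since(ts: str | None) -> int | None:
    """Same arithmetic as _calculate_days_since in the hunks below."""
    if not ts:
        return None
    dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
    return (datetime.now(timezone.utc) - dt).days


print(days_since("2024-01-05T00:00:00Z"))  # age in whole days
print(days_since(None))                    # None
```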
```diff
@@ -71,19 +392,33 @@ class GlueCollector:
                     table_tags = self._get_tags(f"table/{db_name}/{table_name}")
                     table_ownership = self._infer_ownership(table_tags, table_name)
 
+                    # Get table update time as activity indicator
+                    table_updated = table_info.get("UpdateTime") or table_info.get(
+                        "CreateTime"
+                    )
+                    table_activity = (
+                        table_updated.isoformat() if table_updated else last_activity
+                    )
+
                     # Check if table is empty/unused
                     partition_count = len(table_info.get("PartitionKeys", []))
+                    storage = table_info.get("StorageDescriptor", {})
+
                     risk_flags = []
-                    if partition_count == 0 and not
+                    if partition_count == 0 and not storage:
                         risk_flags.append("empty_table")
 
+                    # Check for external tables (Spectrum)
+                    location = storage.get("Location", "") if storage else ""
+                    is_external = location.startswith("s3://") if location else False
+
                     assets.append(
                         Asset(
                             provider="aws",
                             asset_type="glue_table",
                             normalized_category=NormalizedCategory.DATA_CATALOG,
                             service="Glue",
-                            region=
+                            region=self.region,
                             arn=f"{db_info.get('CatalogId', '')}::{db_name}::{table_name}",
                             name=f"{db_name}.{table_name}",
                             created_at=(
```
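Two table-level signals are added here: a table whose StorageDescriptor carries an s3:// location is treated as external (Spectrum-style), and activity falls back from UpdateTime to CreateTime to the database's crawler time. A sketch with an invented GetTables entry:

```python
from datetime import datetime, timezone

# Shape mimics one entry of Glue's GetTables response; values are invented.
table_info = {
    "CreateTime": datetime(2023, 11, 2, tzinfo=timezone.utc),
    "PartitionKeys": [],
    "StorageDescriptor": {"Location": "s3://lake/raw/events/"},
}

storage = table_info.get("StorageDescriptor", {})
location = storage.get("Location", "") if storage else ""
print(location.startswith("s3://"))  # True -> is_external

table_updated = table_info.get("UpdateTime") or table_info.get("CreateTime")
print(table_updated.isoformat())     # no UpdateTime, so falls back to CreateTime
```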
```diff
@@ -95,11 +430,21 @@ class GlueCollector:
                             risk_flags=risk_flags,
                             ownership_confidence=table_ownership["confidence"],
                             suggested_owner=table_ownership["owner"],
-                            last_activity_at=
+                            last_activity_at=table_activity,
                             usage_metrics={
                                 "partition_count": partition_count,
-                                "
-                                "
+                                "is_external": is_external,
+                                "table_type": table_info.get("TableType", ""),
+                                "input_format": storage.get("InputFormat", "")
+                                if storage
+                                else "",
+                                "location": location[:50] + "..."
+                                if len(location) > 50
+                                else location,
+                                "last_used": table_activity,
+                                "days_since_last_use": self._calculate_days_since(
+                                    table_activity
+                                ),
                             },
                         )
                     )
```
```diff
@@ -120,6 +465,32 @@ class GlueCollector:
         except ClientError:
             return {}
 
+    def _get_crawler_tags(self, crawler_name: str) -> dict[str, str]:
+        """Get tags for a Glue crawler."""
+        try:
+            arn = f"arn:aws:glue:{self.region}:{self._get_account_id()}:crawler/{crawler_name}"
+            response = self.glue_client.get_tags(ResourceArn=arn)
+            return response.get("Tags", {})
+        except ClientError:
+            return {}
+
+    def _get_job_tags(self, job_name: str) -> dict[str, str]:
+        """Get tags for a Glue job."""
+        try:
+            arn = f"arn:aws:glue:{self.region}:{self._get_account_id()}:job/{job_name}"
+            response = self.glue_client.get_tags(ResourceArn=arn)
+            return response.get("Tags", {})
+        except ClientError:
+            return {}
+
+    def _get_account_id(self) -> str:
+        """Get AWS account ID."""
+        try:
+            sts = self.session.client("sts")
+            return sts.get_caller_identity()["Account"]
+        except ClientError:
+            return ""
+
     def _infer_ownership(self, tags: dict[str, str], name: str) -> dict[str, str]:
         """Infer ownership from tags."""
         owner = None
```
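Note that the tag helpers resolve the account ID via STS because glue.get_tags requires a fully qualified ARN, whereas the asset ARNs recorded in the earlier hunks leave the account field empty. A hypothetical formatter showing the shape the tag lookup builds:

```python
def glue_resource_arn(region: str, account_id: str, resource: str, name: str) -> str:
    """Illustrative only: the fully qualified ARN format used for tag lookups."""
    return f"arn:aws:glue:{region}:{account_id}:{resource}/{name}"


print(glue_resource_arn("us-east-1", "123456789012", "crawler", "sales-crawler"))
# arn:aws:glue:us-east-1:123456789012:crawler/sales-crawler
```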
```diff
@@ -131,18 +502,52 @@ class GlueCollector:
         elif "team" in tags:
             owner = tags["team"]
             confidence = "medium"
+        elif "Owner" in tags:
+            owner = tags["Owner"]
+            confidence = "high"
+        elif "Team" in tags:
+            owner = tags["Team"]
+            confidence = "medium"
 
         return {"owner": owner, "confidence": confidence}
 
+    def _mask_jdbc_url(self, jdbc_url: str) -> str:
+        """Mask sensitive parts of JDBC URL."""
+        import re
+
+        # Mask password in JDBC URL
+        masked = re.sub(r"password=[^&;]+", "password=***", jdbc_url, flags=re.IGNORECASE)
+        masked = re.sub(r":[^:@]+@", ":***@", masked)
+        return masked
+
+    def _calculate_days_since(self, timestamp: str | None) -> int | None:
+        """Calculate days since a timestamp."""
+        if not timestamp:
+            return None
+        try:
+            dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
+            now = datetime.now(timezone.utc)
+            return (now - dt).days
+        except Exception:
+            return None
+
     def get_usage_metrics(self, asset: Asset) -> dict[str, Any]:
         """Get usage metrics for Glue asset."""
         return asset.usage_metrics or {}
 
     def get_cost_estimate(self, asset: Asset) -> float:
         """Estimate cost for Glue asset."""
+        if asset.cost_estimate_usd:
+            return asset.cost_estimate_usd
+
         # Glue Data Catalog: $1 per 100,000 objects per month
-        # Tables and partitions count as objects
         if asset.asset_type == "glue_table":
-            # Approximate: $0.01 per table per month
             return 0.01
+        elif asset.asset_type == "glue_database":
+            return 0.005
+        elif asset.asset_type == "glue_crawler":
+            # Crawlers: $0.44/DPU-hour, assume minimal usage if not active
+            return 0.50  # Minimal monthly estimate
+        elif asset.asset_type == "glue_connection":
+            return 0.0  # Connections are free
         return 0.0
```
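The two regexes in _mask_jdbc_url cover query-string passwords and user:password@host credentials. A quick check with invented URLs (the patterns are copied from the diff):

```python
import re


def mask(url: str) -> str:
    masked = re.sub(r"password=[^&;]+", "password=***", url, flags=re.IGNORECASE)
    return re.sub(r":[^:@]+@", ":***@", masked)


print(mask("jdbc:postgresql://db.example.com:5432/analytics?user=etl&password=s3cret"))
# jdbc:postgresql://db.example.com:5432/analytics?user=etl&password=***
print(mask("jdbc:mysql://etl:hunter2@db.internal:3306/raw"))
# jdbc:mysql://etl:***@db.internal:3306/raw
```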
nuvu_scan/core/providers/aws/collectors/iam.py

```diff
@@ -25,16 +25,24 @@ class IAMCollector:
 
     def collect(self) -> list[Asset]:
         """Collect IAM roles with data-access permissions."""
+        import sys
+
         assets = []
 
         try:
             # List all IAM roles
+            print(" → Listing IAM roles...", file=sys.stderr)
             paginator = self.iam_client.get_paginator("list_roles")
             roles = []
 
             for page in paginator.paginate():
                 roles.extend(page.get("Roles", []))
 
+            print(
+                f" → Found {len(roles)} roles, checking data-access permissions...",
+                file=sys.stderr,
+            )
+            data_roles_count = 0
             for role in roles:
                 try:
                     role_name = role["RoleName"]
```
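The progress messages here go to stderr, presumably to keep stdout free for the scan's machine-readable output. A minimal illustration of that split (the payload line is invented):

```python
import sys

print('{"assets": []}')                            # stdout: report payload
print(" → Listing IAM roles...", file=sys.stderr)  # stderr: progress only
```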
```diff
@@ -126,6 +134,7 @@ class IAMCollector:
                     )
 
                     assets.append(asset)
+                    data_roles_count += 1
 
                 except ClientError as e:
                     # Skip roles we can't access
```