aws-inventory-manager 0.13.2__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,450 @@
+"""Resource name normalizer using rules and AI."""
+
+import json
+import logging
+import re
+import time
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+from .config import NormalizerConfig
+from .prompts import NORMALIZATION_SYSTEM_PROMPT
+
+
+@dataclass
+class NormalizationResult:
+    """Result of normalizing a resource name.
+
+    Attributes:
+        normalized_name: The semantic part after stripping auto-generated components
+        extracted_patterns: List of patterns that were stripped from the name
+        method: How normalization was determined ('tag:logical-id', 'tag:Name', 'pattern', 'none')
+        confidence: Confidence score (0.0-1.0) indicating reliability of the normalization
+    """
+
+    normalized_name: str
+    extracted_patterns: List[str] = field(default_factory=list)
+    method: str = "none"
+    confidence: float = 0.9
+
+logger = logging.getLogger(__name__)
+
+# Try to import openai, but don't fail if not installed
+try:
+    from openai import OpenAI
+
+    OPENAI_AVAILABLE = True
+except ImportError:
+    OPENAI_AVAILABLE = False
+    OpenAI = None
+
+
+class ResourceNormalizer:
+    """Normalize resource names using rules-based and AI approaches.
+
+    The normalizer first tries rules-based normalization for obvious cases,
+    then falls back to AI for ambiguous names.
+    """
+
+    def __init__(self, config: Optional[NormalizerConfig] = None):
+        """Initialize the normalizer.
+
+        Args:
+            config: Configuration for normalization. If None, loads from environment.
+        """
+        self.config = config or NormalizerConfig.from_env()
+        self._client: Optional[Any] = None
+        self._total_tokens = 0
+
+        # Compile regex patterns for performance
+        self._random_patterns = [re.compile(p) for p in self.config.random_patterns]
+
+    @property
+    def client(self) -> Optional[Any]:
+        """Lazy-init OpenAI client."""
+        if self._client is None and self.config.is_ai_enabled:
+            if not OPENAI_AVAILABLE:
+                logger.warning("OpenAI package not installed. Install with: pip install openai")
+                return None
+
+            self._client = OpenAI(
+                api_key=self.config.api_key,
+                base_url=self.config.base_url,
+                timeout=self.config.timeout_seconds,
+            )
+        return self._client
+
+    @property
+    def tokens_used(self) -> int:
+        """Total tokens used for AI normalization."""
+        return self._total_tokens
+
+    def normalize_resources(
+        self,
+        resources: List[Dict[str, Any]],
+        use_ai: bool = True,
+    ) -> Dict[str, str]:
+        """Normalize a list of resources.
+
+        Args:
+            resources: List of resource dicts with 'arn', 'name', 'tags', 'resource_type'
+            use_ai: Whether to use AI for ambiguous names
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        results: Dict[str, str] = {}
+        needs_ai: List[Dict[str, Any]] = []
+
+        # Phase 1: Try rules-based normalization
+        for resource in resources:
+            arn = resource.get("arn", "")
+            normalized = self._try_rules_based(resource)
+
+            if normalized:
+                results[arn] = normalized
+                logger.debug(f"Rules-based: {resource.get('name')} -> {normalized}")
+            else:
+                needs_ai.append(resource)
+
+        logger.info(
+            f"Normalization: {len(results)} via rules, {len(needs_ai)} need AI"
+        )
+
+        # Phase 2: AI normalization for ambiguous names
+        if needs_ai and use_ai and self.client:
+            ai_results = self._normalize_with_ai(needs_ai)
+            results.update(ai_results)
+            logger.info(f"AI normalized {len(ai_results)} resources")
+        elif needs_ai:
+            # Fallback: use lowercase name if AI not available
+            for resource in needs_ai:
+                arn = resource.get("arn", "")
+                name = resource.get("name", "")
+                results[arn] = self._basic_normalize(name)
+                logger.debug(f"Fallback: {name} -> {results[arn]}")
+
+        return results
+
+    def normalize_single(
+        self,
+        name: str,
+        resource_type: str,
+        tags: Optional[Dict[str, str]] = None,
+    ) -> NormalizationResult:
+        """Normalize a single resource and return detailed result.
+
+        This method is used by snapshot_store to compute normalized names
+        during snapshot save.
+
+        Args:
+            name: Physical resource name
+            resource_type: AWS resource type (e.g., 'AWS::Lambda::Function')
+            tags: Resource tags
+
+        Returns:
+            NormalizationResult with normalized_name, extracted_patterns, method, confidence
+        """
+        tags = tags or {}
+
+        # Priority 1: CloudFormation logical ID tag (most reliable)
+        logical_id = tags.get("aws:cloudformation:logical-id")
+        if logical_id:
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(logical_id),
+                extracted_patterns=[],
+                method="tag:logical-id",
+                confidence=1.0,
+            )
+
+        # Priority 2: Name tag (user-defined, stable)
+        name_tag = tags.get("Name")
+        if name_tag and not self._has_random_patterns(name_tag):
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(name_tag),
+                extracted_patterns=[],
+                method="tag:Name",
+                confidence=0.95,
+            )
+
+        # Priority 3: Check if entirely AWS-generated ID (subnet-xxx, vpc-xxx, etc.)
+        if self._is_aws_resource_id(name, resource_type):
+            # Can't normalize - needs Name tag for stable matching
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(name),
+                extracted_patterns=[name],
+                method="none",
+                confidence=0.0,  # Low confidence - needs tag for reliable matching
+            )
+
+        # Priority 4: Try to extract patterns from physical name
+        normalized, extracted = self._extract_patterns(name)
+        if extracted:
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(normalized),
+                extracted_patterns=extracted,
+                method="pattern",
+                confidence=0.8,
+            )
+
+        # Priority 5: Clean name - no normalization needed
+        return NormalizationResult(
+            normalized_name=self._basic_normalize(name),
+            extracted_patterns=[],
+            method="none",
+            confidence=0.9,  # Clean name, high confidence
+        )
+
+    def _is_aws_resource_id(self, name: str, resource_type: str) -> bool:
+        """Check if name is entirely an AWS-generated resource ID.
+
+        These IDs (subnet-xxx, vpc-xxx, vol-xxx, etc.) are stable but
+        provide no semantic meaning without a Name tag.
+        """
+        # Map resource types to their ID patterns
+        aws_id_patterns = {
+            "AWS::EC2::Subnet": r"^subnet-[a-f0-9]+$",
+            "AWS::EC2::VPC": r"^vpc-[a-f0-9]+$",
+            "AWS::EC2::SecurityGroup": r"^sg-[a-f0-9]+$",
+            "AWS::EC2::Volume": r"^vol-[a-f0-9]+$",
+            "AWS::EC2::Instance": r"^i-[a-f0-9]+$",
+            "AWS::EC2::InternetGateway": r"^igw-[a-f0-9]+$",
+            "AWS::EC2::RouteTable": r"^rtb-[a-f0-9]+$",
+            "AWS::EC2::NetworkAcl": r"^acl-[a-f0-9]+$",
+            "AWS::EC2::NetworkInterface": r"^eni-[a-f0-9]+$",
+            "AWS::EC2::NatGateway": r"^nat-[a-f0-9]+$",
+            "AWS::EC2::EIP": r"^eipalloc-[a-f0-9]+$",
+        }
+
+        pattern = aws_id_patterns.get(resource_type)
+        if pattern:
+            # AWS resource IDs are always lowercase hex, no IGNORECASE needed
+            return bool(re.match(pattern, name))
+        return False
+
+    def _extract_patterns(self, name: str) -> Tuple[str, List[str]]:
+        """Extract auto-generated patterns from name.
+
+        Strips common patterns like CloudFormation suffixes, account IDs,
+        regions, timestamps, etc.
+
+        Returns:
+            Tuple of (cleaned_name, list_of_extracted_patterns)
+        """
+        extracted = []
+        result = name
+
+        # Patterns to extract (ordered by specificity)
+        extraction_patterns = [
+            # CloudFormation suffix (uppercase alphanumeric, 8-13 chars at end)
+            (r"-[A-Z0-9]{8,13}$", "cfn_suffix"),
+            # Bedrock/Kendra suffix (underscore + lowercase alphanumeric)
+            (r"_[a-z0-9]{4,6}$", "bedrock_suffix"),
+            # Account ID (12 digits, with optional surrounding hyphens)
+            (r"-?\d{12}-?", "account_id"),
+            # Region (e.g., us-east-1, eu-west-2)
+            (r"-?(us|eu|ap|sa|ca|me|af)-(east|west|north|south|central|northeast|southeast)-\d-?", "region"),
+            # Hex suffix (8+ lowercase hex chars at end)
+            (r"-[a-f0-9]{8,}$", "hex_suffix"),
+            # Timestamp suffix (8-14 digits at end)
+            (r"-\d{8,14}$", "timestamp"),
+        ]
+
+        for pattern, _pattern_name in extraction_patterns:
+            # Note: Don't use IGNORECASE - CloudFormation suffixes are uppercase,
+            # and we want case-sensitive matching for accuracy
+            match = re.search(pattern, result)
+            if match:
+                extracted.append(match.group().strip("-"))
+                result = result[: match.start()] + result[match.end() :]
+
+        # Clean up trailing/leading separators
+        result = re.sub(r"^[-_]+|[-_]+$", "", result)
+        # Collapse multiple separators
+        result = re.sub(r"[-_]{2,}", "-", result)
+
+        return result, extracted
+
+    def _try_rules_based(self, resource: Dict[str, Any]) -> Optional[str]:
+        """Try to normalize using rules.
+
+        Priority:
+        1. CloudFormation logical ID tag
+        2. Name tag (if clean)
+        3. Physical name (if clean)
+
+        Returns None if name appears to have random patterns.
+        """
+        tags = resource.get("tags", {}) or {}
+        name = resource.get("name", "")
+
+        # 1. CloudFormation logical ID is the best canonical identifier
+        logical_id = tags.get("aws:cloudformation:logical-id")
+        if logical_id:
+            return self._basic_normalize(logical_id)
+
+        # 2. Name tag (if it looks clean)
+        name_tag = tags.get("Name")
+        if name_tag and not self._has_random_patterns(name_tag):
+            return self._basic_normalize(name_tag)
+
+        # 3. Physical name (if it looks clean)
+        if name and not self._has_random_patterns(name):
+            return self._basic_normalize(name)
+
+        # Name has random patterns - needs AI
+        return None
+
+    def _has_random_patterns(self, name: str) -> bool:
+        """Check if name contains random-looking patterns."""
+        if not name:
+            return True
+
+        for pattern in self._random_patterns:
+            if pattern.search(name):
+                return True
+        return False
+
+    def _basic_normalize(self, name: str) -> str:
+        """Basic string normalization without AI.
+
+        - Lowercase
+        - Replace underscores/spaces with hyphens
+        - Strip leading/trailing hyphens
+        """
+        if not name:
+            return ""
+
+        result = name.lower()
+        result = re.sub(r"[_\s]+", "-", result)
+        result = re.sub(r"-+", "-", result)
+        return result.strip("-")
+
+    def _normalize_with_ai(
+        self,
+        resources: List[Dict[str, Any]],
+    ) -> Dict[str, str]:
+        """Normalize resources using AI.
+
+        Batches resources and calls the AI API.
+
+        Args:
+            resources: Resources that need AI normalization
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        results: Dict[str, str] = {}
+
+        # Process in batches
+        for i in range(0, len(resources), self.config.max_batch_size):
+            batch = resources[i : i + self.config.max_batch_size]
+            batch_results = self._process_ai_batch(batch)
+            results.update(batch_results)
+
+        return results
+
+    def _process_ai_batch(
+        self,
+        resources: List[Dict[str, Any]],
+    ) -> Dict[str, str]:
+        """Process a single batch through the AI.
+
+        Args:
+            resources: Batch of resources
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        # Build the user prompt with resource details
+        resource_list = []
+        for r in resources:
+            item = {
+                "arn": r.get("arn", ""),
+                "name": r.get("name", ""),
+                "type": r.get("resource_type", ""),
+            }
+            # Include Name tag if present
+            tags = r.get("tags", {}) or {}
+            if tags.get("Name"):
+                item["name_tag"] = tags["Name"]
+            resource_list.append(item)
+
+        user_prompt = json.dumps({"resources": resource_list}, indent=2)
+
+        # Call the AI with retries
+        for attempt in range(self.config.max_retries):
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.config.model,
+                    messages=[
+                        {"role": "system", "content": NORMALIZATION_SYSTEM_PROMPT},
+                        {"role": "user", "content": user_prompt},
+                    ],
+                    temperature=0.1,  # Low for consistency
+                )
+
+                # Track token usage
+                if hasattr(response, "usage") and response.usage:
+                    self._total_tokens += response.usage.total_tokens
+
+                # Parse response
+                content = response.choices[0].message.content
+                return self._parse_ai_response(content, resources)
+
+            except Exception as e:
+                wait_time = 2**attempt
+                logger.warning(
+                    f"AI normalization attempt {attempt + 1} failed: {e}. "
+                    f"Retrying in {wait_time}s..."
+                )
+                if attempt < self.config.max_retries - 1:
+                    time.sleep(wait_time)
+
+        # All retries failed - use fallback
+        logger.error("AI normalization failed after all retries")
+        return {
+            r.get("arn", ""): self._basic_normalize(r.get("name", ""))
+            for r in resources
+        }
+
+    def _parse_ai_response(
+        self,
+        content: str,
+        resources: List[Dict[str, Any]],
+    ) -> Dict[str, str]:
+        """Parse AI response into ARN -> normalized_name mapping.
+
+        Args:
+            content: AI response content (JSON string)
+            resources: Original resources (for fallback)
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        try:
+            data = json.loads(content)
+            normalizations = data.get("normalizations", [])
+
+            results = {}
+            for norm in normalizations:
+                arn = norm.get("arn", "")
+                normalized_name = norm.get("normalized_name", "")
+                if arn and normalized_name:
+                    results[arn] = normalized_name
+
+            # Fallback for any missing
+            for r in resources:
+                arn = r.get("arn", "")
+                if arn and arn not in results:
+                    results[arn] = self._basic_normalize(r.get("name", ""))
+
+            return results
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse AI response as JSON: {e}")
+            logger.debug(f"Response content: {content[:500]}...")
+            return {
+                r.get("arn", ""): self._basic_normalize(r.get("name", ""))
+                for r in resources
+            }
@@ -0,0 +1,33 @@
+"""AI prompts for resource name normalization."""
+
+NORMALIZATION_SYSTEM_PROMPT = """You are an AWS resource name normalizer for cross-account infrastructure matching.
+
+Your job is to extract the "logical identity" from AWS resource names by removing:
+- Random suffixes (hex, alphanumeric, CloudFormation-generated)
+- AWS account IDs (12-digit numbers)
+- Region names (us-east-1, eu-west-2, etc.)
+- Stack name prefixes (MyStack-, Stack-, etc.)
+- AWS resource ID prefixes (subnet-, vpc-, vol-, i-, sg-, etc.)
+- Timestamps and dates embedded in names
+
+Keep the meaningful, purpose-identifying parts of the name.
+
+Rules:
+1. Output should be lowercase with hyphens (no underscores, no spaces)
+2. If the name is already clean and meaningful, return it as-is (lowercase)
+3. Preserve the semantic meaning - "policy-executor" not just "executor"
+4. For AWS-generated IDs (subnet-xxx, vpc-xxx), use the Name tag if provided
+5. Strip common AWS service prefixes that don't add meaning
+
+Examples:
+- "cloud-custodian-480738299408-policy-executor-abc123" → "cloud-custodian-policy-executor"
+- "AmazonBedrockExecutionRoleForKnowledgeBase_jnwn1" → "bedrock-knowledge-base-execution-role"
+- "MyStack-ProcessorLambda-XYZ789ABC" → "processor-lambda"
+- "daybreak-transcribe-processor" → "daybreak-transcribe-processor" (already clean)
+- "AWSServiceRoleForOrganizations" → "aws-service-role-organizations"
+- "d-9067239ebb_controllers" → "directory-controllers"
+- Resource with name "subnet-abc123def" and Name tag "Private-Subnet-AZ1" → "private-subnet-az1"
+
+Respond ONLY with valid JSON in this exact format:
+{"normalizations": [{"arn": "arn:aws:...", "normalized_name": "..."}]}
+"""
src/snapshot/capturer.py CHANGED
@@ -33,6 +33,7 @@ from .resource_collectors.codebuild import CodeBuildCollector
 from .resource_collectors.codepipeline import CodePipelineCollector
 from .resource_collectors.dynamodb import DynamoDBCollector
 from .resource_collectors.ec2 import EC2Collector
+from .resource_collectors.glue import GlueCollector
 from .resource_collectors.ecs import ECSCollector
 from .resource_collectors.efs_collector import EFSCollector
 from .resource_collectors.eks import EKSCollector
@@ -85,6 +86,7 @@ COLLECTOR_REGISTRY: List[Type[BaseResourceCollector]] = [
     CodePipelineCollector,
     CodeBuildCollector,
     BackupCollector,
+    GlueCollector,
 ]


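Registering the class is the whole integration: COLLECTOR_REGISTRY appears to be a list of collector classes that the capturer instantiates and runs. This diff does not show BaseResourceCollector's constructor, so the dispatch sketch below assumes a (session, region) signature purely for illustration:

from typing import List, Type

def collect_all(collector_classes: List[Type], session, region: str) -> list:
    """Hypothetical fan-out over COLLECTOR_REGISTRY (constructor args assumed)."""
    resources = []
    for cls in collector_classes:
        collector = cls(session=session, region=region)  # assumed signature
        resources.extend(collector.collect())
    return resources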
@@ -0,0 +1,199 @@
+"""AWS Glue resource collector."""
+
+from typing import List
+
+from ...models.resource import Resource
+from ...utils.hash import compute_config_hash
+from .base import BaseResourceCollector
+
+
+class GlueCollector(BaseResourceCollector):
+    """Collector for AWS Glue resources (databases, tables, crawlers, jobs)."""
+
+    @property
+    def service_name(self) -> str:
+        return "glue"
+
+    def collect(self) -> List[Resource]:
+        """Collect AWS Glue resources.
+
+        Returns:
+            List of Glue resources (databases, tables, crawlers, jobs)
+        """
+        resources = []
+        client = self._create_client()
+        account_id = self._get_account_id()
+
+        # Collect databases and tables
+        resources.extend(self._collect_databases(client, account_id))
+
+        # Collect crawlers
+        resources.extend(self._collect_crawlers(client, account_id))
+
+        # Collect jobs
+        resources.extend(self._collect_jobs(client, account_id))
+
+        # Collect connections
+        resources.extend(self._collect_connections(client, account_id))
+
+        self.logger.debug(f"Collected {len(resources)} Glue resources in {self.region}")
+        return resources
+
+    def _collect_databases(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue databases and their tables."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_databases")
+            for page in paginator.paginate():
+                for db in page.get("DatabaseList", []):
+                    db_name = db.get("Name")
+                    db_arn = f"arn:aws:glue:{self.region}:{account_id}:database/{db_name}"
+
+                    resource = Resource(
+                        arn=db_arn,
+                        resource_type="AWS::Glue::Database",
+                        name=db_name,
+                        region=self.region,
+                        tags={},  # Glue databases don't support tags directly
+                        config_hash=compute_config_hash(db),
+                        created_at=db.get("CreateTime"),
+                        raw_config=db,
+                    )
+                    resources.append(resource)
+
+                    # Collect tables for this database
+                    resources.extend(self._collect_tables(client, account_id, db_name))
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue databases in {self.region}: {e}")
+
+        return resources
+
+    def _collect_tables(self, client, account_id: str, database_name: str) -> List[Resource]:
+        """Collect tables for a specific database."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_tables")
+            for page in paginator.paginate(DatabaseName=database_name):
+                for table in page.get("TableList", []):
+                    table_name = table.get("Name")
+                    table_arn = f"arn:aws:glue:{self.region}:{account_id}:table/{database_name}/{table_name}"
+
+                    resource = Resource(
+                        arn=table_arn,
+                        resource_type="AWS::Glue::Table",
+                        name=f"{database_name}/{table_name}",
+                        region=self.region,
+                        tags={},  # Glue tables don't support tags directly
+                        config_hash=compute_config_hash(table),
+                        created_at=table.get("CreateTime"),
+                        raw_config=table,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.debug(f"Error collecting tables for database {database_name}: {e}")
+
+        return resources
+
+    def _collect_crawlers(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue crawlers."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_crawlers")
+            for page in paginator.paginate():
+                for crawler in page.get("Crawlers", []):
+                    crawler_name = crawler.get("Name")
+                    crawler_arn = f"arn:aws:glue:{self.region}:{account_id}:crawler/{crawler_name}"
+
+                    # Get tags for crawler
+                    tags = {}
+                    try:
+                        tag_response = client.get_tags(ResourceArn=crawler_arn)
+                        tags = tag_response.get("Tags", {})
+                    except Exception as e:
+                        self.logger.debug(f"Could not get tags for crawler {crawler_name}: {e}")
+
+                    resource = Resource(
+                        arn=crawler_arn,
+                        resource_type="AWS::Glue::Crawler",
+                        name=crawler_name,
+                        region=self.region,
+                        tags=tags,
+                        config_hash=compute_config_hash(crawler),
+                        created_at=crawler.get("CreationTime"),
+                        raw_config=crawler,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue crawlers in {self.region}: {e}")
+
+        return resources
+
+    def _collect_jobs(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue jobs."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_jobs")
+            for page in paginator.paginate():
+                for job in page.get("Jobs", []):
+                    job_name = job.get("Name")
+                    job_arn = f"arn:aws:glue:{self.region}:{account_id}:job/{job_name}"
+
+                    # Get tags for job
+                    tags = {}
+                    try:
+                        tag_response = client.get_tags(ResourceArn=job_arn)
+                        tags = tag_response.get("Tags", {})
+                    except Exception as e:
+                        self.logger.debug(f"Could not get tags for job {job_name}: {e}")
+
+                    resource = Resource(
+                        arn=job_arn,
+                        resource_type="AWS::Glue::Job",
+                        name=job_name,
+                        region=self.region,
+                        tags=tags,
+                        config_hash=compute_config_hash(job),
+                        created_at=job.get("CreatedOn"),
+                        raw_config=job,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue jobs in {self.region}: {e}")
+
+        return resources
+
+    def _collect_connections(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue connections."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_connections")
+            for page in paginator.paginate():
+                for conn in page.get("ConnectionList", []):
+                    conn_name = conn.get("Name")
+                    conn_arn = f"arn:aws:glue:{self.region}:{account_id}:connection/{conn_name}"
+
+                    resource = Resource(
+                        arn=conn_arn,
+                        resource_type="AWS::Glue::Connection",
+                        name=conn_name,
+                        region=self.region,
+                        tags={},  # Connections don't support tags
+                        config_hash=compute_config_hash(conn),
+                        created_at=conn.get("CreationTime"),
+                        raw_config=conn,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue connections in {self.region}: {e}")
+
+        return resources
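
Because every list call goes through a boto3 paginator, the collector's API handling can be exercised offline with botocore's Stubber. A minimal sketch of the get_databases path (dummy credentials, no network; the Resource construction is omitted since its model isn't shown in this diff):

import boto3
from botocore.stub import Stubber

# Dummy credentials: Stubber answers locally, nothing is sent to AWS.
client = boto3.client(
    "glue",
    region_name="us-east-1",
    aws_access_key_id="testing",
    aws_secret_access_key="testing",
)
stubber = Stubber(client)
stubber.add_response("get_databases", {"DatabaseList": [{"Name": "analytics"}]}, {})

with stubber:
    pages = client.get_paginator("get_databases").paginate()
    names = [db["Name"] for page in pages for db in page.get("DatabaseList", [])]

assert names == ["analytics"]  # the same iteration _collect_databases performs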