aws-inventory-manager 0.13.2__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of aws-inventory-manager might be problematic.
- {aws_inventory_manager-0.13.2.dist-info → aws_inventory_manager-0.16.0.dist-info}/METADATA +1 -1
- {aws_inventory_manager-0.13.2.dist-info → aws_inventory_manager-0.16.0.dist-info}/RECORD +20 -13
- src/cli/main.py +202 -3
- src/cloudtrail/__init__.py +5 -0
- src/cloudtrail/query.py +419 -0
- src/matching/__init__.py +6 -0
- src/matching/config.py +52 -0
- src/matching/normalizer.py +450 -0
- src/matching/prompts.py +33 -0
- src/snapshot/capturer.py +2 -0
- src/snapshot/resource_collectors/glue.py +199 -0
- src/storage/group_store.py +22 -8
- src/storage/resource_store.py +3 -0
- src/storage/schema.py +52 -1
- src/storage/snapshot_store.py +19 -2
- src/web/templates/pages/resources.html +3 -0
- {aws_inventory_manager-0.13.2.dist-info → aws_inventory_manager-0.16.0.dist-info}/LICENSE +0 -0
- {aws_inventory_manager-0.13.2.dist-info → aws_inventory_manager-0.16.0.dist-info}/WHEEL +0 -0
- {aws_inventory_manager-0.13.2.dist-info → aws_inventory_manager-0.16.0.dist-info}/entry_points.txt +0 -0
- {aws_inventory_manager-0.13.2.dist-info → aws_inventory_manager-0.16.0.dist-info}/top_level.txt +0 -0
src/matching/normalizer.py
ADDED

@@ -0,0 +1,450 @@
+"""Resource name normalizer using rules and AI."""
+
+import json
+import logging
+import re
+import time
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+from .config import NormalizerConfig
+from .prompts import NORMALIZATION_SYSTEM_PROMPT
+
+
+@dataclass
+class NormalizationResult:
+    """Result of normalizing a resource name.
+
+    Attributes:
+        normalized_name: The semantic part after stripping auto-generated components
+        extracted_patterns: List of patterns that were stripped from the name
+        method: How normalization was determined ('tag:logical-id', 'tag:Name', 'pattern', 'none')
+        confidence: Confidence score (0.0-1.0) indicating reliability of the normalization
+    """
+
+    normalized_name: str
+    extracted_patterns: List[str] = field(default_factory=list)
+    method: str = "none"
+    confidence: float = 0.9
+
+logger = logging.getLogger(__name__)
+
+# Try to import openai, but don't fail if not installed
+try:
+    from openai import OpenAI
+
+    OPENAI_AVAILABLE = True
+except ImportError:
+    OPENAI_AVAILABLE = False
+    OpenAI = None
+
+
+class ResourceNormalizer:
+    """Normalize resource names using rules-based and AI approaches.
+
+    The normalizer first tries rules-based normalization for obvious cases,
+    then falls back to AI for ambiguous names.
+    """
+
+    def __init__(self, config: Optional[NormalizerConfig] = None):
+        """Initialize the normalizer.
+
+        Args:
+            config: Configuration for normalization. If None, loads from environment.
+        """
+        self.config = config or NormalizerConfig.from_env()
+        self._client: Optional[Any] = None
+        self._total_tokens = 0
+
+        # Compile regex patterns for performance
+        self._random_patterns = [re.compile(p) for p in self.config.random_patterns]
+
+    @property
+    def client(self) -> Optional[Any]:
+        """Lazy-init OpenAI client."""
+        if self._client is None and self.config.is_ai_enabled:
+            if not OPENAI_AVAILABLE:
+                logger.warning("OpenAI package not installed. Install with: pip install openai")
+                return None
+
+            self._client = OpenAI(
+                api_key=self.config.api_key,
+                base_url=self.config.base_url,
+                timeout=self.config.timeout_seconds,
+            )
+        return self._client
+
+    @property
+    def tokens_used(self) -> int:
+        """Total tokens used for AI normalization."""
+        return self._total_tokens
+
+    def normalize_resources(
+        self,
+        resources: List[Dict[str, Any]],
+        use_ai: bool = True,
+    ) -> Dict[str, str]:
+        """Normalize a list of resources.
+
+        Args:
+            resources: List of resource dicts with 'arn', 'name', 'tags', 'resource_type'
+            use_ai: Whether to use AI for ambiguous names
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        results: Dict[str, str] = {}
+        needs_ai: List[Dict[str, Any]] = []
+
+        # Phase 1: Try rules-based normalization
+        for resource in resources:
+            arn = resource.get("arn", "")
+            normalized = self._try_rules_based(resource)
+
+            if normalized:
+                results[arn] = normalized
+                logger.debug(f"Rules-based: {resource.get('name')} -> {normalized}")
+            else:
+                needs_ai.append(resource)
+
+        logger.info(
+            f"Normalization: {len(results)} via rules, {len(needs_ai)} need AI"
+        )
+
+        # Phase 2: AI normalization for ambiguous names
+        if needs_ai and use_ai and self.client:
+            ai_results = self._normalize_with_ai(needs_ai)
+            results.update(ai_results)
+            logger.info(f"AI normalized {len(ai_results)} resources")
+        elif needs_ai:
+            # Fallback: use lowercase name if AI not available
+            for resource in needs_ai:
+                arn = resource.get("arn", "")
+                name = resource.get("name", "")
+                results[arn] = self._basic_normalize(name)
+                logger.debug(f"Fallback: {name} -> {results[arn]}")
+
+        return results
+
+    def normalize_single(
+        self,
+        name: str,
+        resource_type: str,
+        tags: Optional[Dict[str, str]] = None,
+    ) -> NormalizationResult:
+        """Normalize a single resource and return detailed result.
+
+        This method is used by snapshot_store to compute normalized names
+        during snapshot save.
+
+        Args:
+            name: Physical resource name
+            resource_type: AWS resource type (e.g., 'AWS::Lambda::Function')
+            tags: Resource tags
+
+        Returns:
+            NormalizationResult with normalized_name, extracted_patterns, method, confidence
+        """
+        tags = tags or {}
+
+        # Priority 1: CloudFormation logical ID tag (most reliable)
+        logical_id = tags.get("aws:cloudformation:logical-id")
+        if logical_id:
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(logical_id),
+                extracted_patterns=[],
+                method="tag:logical-id",
+                confidence=1.0,
+            )
+
+        # Priority 2: Name tag (user-defined, stable)
+        name_tag = tags.get("Name")
+        if name_tag and not self._has_random_patterns(name_tag):
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(name_tag),
+                extracted_patterns=[],
+                method="tag:Name",
+                confidence=0.95,
+            )
+
+        # Priority 3: Check if entirely AWS-generated ID (subnet-xxx, vpc-xxx, etc.)
+        if self._is_aws_resource_id(name, resource_type):
+            # Can't normalize - needs Name tag for stable matching
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(name),
+                extracted_patterns=[name],
+                method="none",
+                confidence=0.0,  # Low confidence - needs tag for reliable matching
+            )
+
+        # Priority 4: Try to extract patterns from physical name
+        normalized, extracted = self._extract_patterns(name)
+        if extracted:
+            return NormalizationResult(
+                normalized_name=self._basic_normalize(normalized),
+                extracted_patterns=extracted,
+                method="pattern",
+                confidence=0.8,
+            )
+
+        # Priority 5: Clean name - no normalization needed
+        return NormalizationResult(
+            normalized_name=self._basic_normalize(name),
+            extracted_patterns=[],
+            method="none",
+            confidence=0.9,  # Clean name, high confidence
+        )
+
+    def _is_aws_resource_id(self, name: str, resource_type: str) -> bool:
+        """Check if name is entirely an AWS-generated resource ID.
+
+        These IDs (subnet-xxx, vpc-xxx, vol-xxx, etc.) are stable but
+        provide no semantic meaning without a Name tag.
+        """
+        # Map resource types to their ID patterns
+        aws_id_patterns = {
+            "AWS::EC2::Subnet": r"^subnet-[a-f0-9]+$",
+            "AWS::EC2::VPC": r"^vpc-[a-f0-9]+$",
+            "AWS::EC2::SecurityGroup": r"^sg-[a-f0-9]+$",
+            "AWS::EC2::Volume": r"^vol-[a-f0-9]+$",
+            "AWS::EC2::Instance": r"^i-[a-f0-9]+$",
+            "AWS::EC2::InternetGateway": r"^igw-[a-f0-9]+$",
+            "AWS::EC2::RouteTable": r"^rtb-[a-f0-9]+$",
+            "AWS::EC2::NetworkAcl": r"^acl-[a-f0-9]+$",
+            "AWS::EC2::NetworkInterface": r"^eni-[a-f0-9]+$",
+            "AWS::EC2::NatGateway": r"^nat-[a-f0-9]+$",
+            "AWS::EC2::EIP": r"^eipalloc-[a-f0-9]+$",
+        }
+
+        pattern = aws_id_patterns.get(resource_type)
+        if pattern:
+            # AWS resource IDs are always lowercase hex, no IGNORECASE needed
+            return bool(re.match(pattern, name))
+        return False
+
+    def _extract_patterns(self, name: str) -> Tuple[str, List[str]]:
+        """Extract auto-generated patterns from name.
+
+        Strips common patterns like CloudFormation suffixes, account IDs,
+        regions, timestamps, etc.
+
+        Returns:
+            Tuple of (cleaned_name, list_of_extracted_patterns)
+        """
+        extracted = []
+        result = name
+
+        # Patterns to extract (ordered by specificity)
+        extraction_patterns = [
+            # CloudFormation suffix (uppercase alphanumeric, 8-13 chars at end)
+            (r"-[A-Z0-9]{8,13}$", "cfn_suffix"),
+            # Bedrock/Kendra suffix (underscore + lowercase alphanumeric)
+            (r"_[a-z0-9]{4,6}$", "bedrock_suffix"),
+            # Account ID (12 digits, with optional surrounding hyphens)
+            (r"-?\d{12}-?", "account_id"),
+            # Region (e.g., us-east-1, eu-west-2)
+            (r"-?(us|eu|ap|sa|ca|me|af)-(east|west|north|south|central|northeast|southeast)-\d-?", "region"),
+            # Hex suffix (8+ lowercase hex chars at end)
+            (r"-[a-f0-9]{8,}$", "hex_suffix"),
+            # Timestamp suffix (8-14 digits at end)
+            (r"-\d{8,14}$", "timestamp"),
+        ]
+
+        for pattern, _pattern_name in extraction_patterns:
+            # Note: Don't use IGNORECASE - CloudFormation suffixes are uppercase,
+            # and we want case-sensitive matching for accuracy
+            match = re.search(pattern, result)
+            if match:
+                extracted.append(match.group().strip("-"))
+                result = result[: match.start()] + result[match.end() :]
+
+        # Clean up trailing/leading separators
+        result = re.sub(r"^[-_]+|[-_]+$", "", result)
+        # Collapse multiple separators
+        result = re.sub(r"[-_]{2,}", "-", result)
+
+        return result, extracted
+
+    def _try_rules_based(self, resource: Dict[str, Any]) -> Optional[str]:
+        """Try to normalize using rules.
+
+        Priority:
+        1. CloudFormation logical ID tag
+        2. Name tag (if clean)
+        3. Physical name (if clean)
+
+        Returns None if name appears to have random patterns.
+        """
+        tags = resource.get("tags", {}) or {}
+        name = resource.get("name", "")
+
+        # 1. CloudFormation logical ID is the best canonical identifier
+        logical_id = tags.get("aws:cloudformation:logical-id")
+        if logical_id:
+            return self._basic_normalize(logical_id)
+
+        # 2. Name tag (if it looks clean)
+        name_tag = tags.get("Name")
+        if name_tag and not self._has_random_patterns(name_tag):
+            return self._basic_normalize(name_tag)
+
+        # 3. Physical name (if it looks clean)
+        if name and not self._has_random_patterns(name):
+            return self._basic_normalize(name)
+
+        # Name has random patterns - needs AI
+        return None
+
+    def _has_random_patterns(self, name: str) -> bool:
+        """Check if name contains random-looking patterns."""
+        if not name:
+            return True
+
+        for pattern in self._random_patterns:
+            if pattern.search(name):
+                return True
+        return False
+
+    def _basic_normalize(self, name: str) -> str:
+        """Basic string normalization without AI.
+
+        - Lowercase
+        - Replace underscores/spaces with hyphens
+        - Strip leading/trailing hyphens
+        """
+        if not name:
+            return ""
+
+        result = name.lower()
+        result = re.sub(r"[_\s]+", "-", result)
+        result = re.sub(r"-+", "-", result)
+        return result.strip("-")
+
+    def _normalize_with_ai(
+        self,
+        resources: List[Dict[str, Any]],
+    ) -> Dict[str, str]:
+        """Normalize resources using AI.
+
+        Batches resources and calls the AI API.
+
+        Args:
+            resources: Resources that need AI normalization
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        results: Dict[str, str] = {}
+
+        # Process in batches
+        for i in range(0, len(resources), self.config.max_batch_size):
+            batch = resources[i : i + self.config.max_batch_size]
+            batch_results = self._process_ai_batch(batch)
+            results.update(batch_results)
+
+        return results
+
+    def _process_ai_batch(
+        self,
+        resources: List[Dict[str, Any]],
+    ) -> Dict[str, str]:
+        """Process a single batch through the AI.
+
+        Args:
+            resources: Batch of resources
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        # Build the user prompt with resource details
+        resource_list = []
+        for r in resources:
+            item = {
+                "arn": r.get("arn", ""),
+                "name": r.get("name", ""),
+                "type": r.get("resource_type", ""),
+            }
+            # Include Name tag if present
+            tags = r.get("tags", {}) or {}
+            if tags.get("Name"):
+                item["name_tag"] = tags["Name"]
+            resource_list.append(item)
+
+        user_prompt = json.dumps({"resources": resource_list}, indent=2)
+
+        # Call the AI with retries
+        for attempt in range(self.config.max_retries):
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.config.model,
+                    messages=[
+                        {"role": "system", "content": NORMALIZATION_SYSTEM_PROMPT},
+                        {"role": "user", "content": user_prompt},
+                    ],
+                    temperature=0.1,  # Low for consistency
+                )
+
+                # Track token usage
+                if hasattr(response, "usage") and response.usage:
+                    self._total_tokens += response.usage.total_tokens
+
+                # Parse response
+                content = response.choices[0].message.content
+                return self._parse_ai_response(content, resources)
+
+            except Exception as e:
+                wait_time = 2**attempt
+                logger.warning(
+                    f"AI normalization attempt {attempt + 1} failed: {e}. "
+                    f"Retrying in {wait_time}s..."
+                )
+                if attempt < self.config.max_retries - 1:
+                    time.sleep(wait_time)
+
+        # All retries failed - use fallback
+        logger.error("AI normalization failed after all retries")
+        return {
+            r.get("arn", ""): self._basic_normalize(r.get("name", ""))
+            for r in resources
+        }
+
+    def _parse_ai_response(
+        self,
+        content: str,
+        resources: List[Dict[str, Any]],
+    ) -> Dict[str, str]:
+        """Parse AI response into ARN -> normalized_name mapping.
+
+        Args:
+            content: AI response content (JSON string)
+            resources: Original resources (for fallback)
+
+        Returns:
+            Dict mapping ARN -> normalized_name
+        """
+        try:
+            data = json.loads(content)
+            normalizations = data.get("normalizations", [])
+
+            results = {}
+            for norm in normalizations:
+                arn = norm.get("arn", "")
+                normalized_name = norm.get("normalized_name", "")
+                if arn and normalized_name:
+                    results[arn] = normalized_name
+
+            # Fallback for any missing
+            for r in resources:
+                arn = r.get("arn", "")
+                if arn and arn not in results:
+                    results[arn] = self._basic_normalize(r.get("name", ""))
+
+            return results
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse AI response as JSON: {e}")
+            logger.debug(f"Response content: {content[:500]}...")
+            return {
+                r.get("arn", ""): self._basic_normalize(r.get("name", ""))
+                for r in resources
+            }
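The tiered fallback logic above is easiest to see on concrete inputs. Below is a minimal usage sketch, not part of the diff: the resource names are hypothetical, and it assumes NormalizerConfig.from_env() succeeds without AI credentials, so only the rules-based tiers run.

from src.matching.normalizer import ResourceNormalizer

normalizer = ResourceNormalizer()  # assumes from_env() tolerates missing AI settings

# Priority 1: the CloudFormation logical-id tag wins (method="tag:logical-id").
result = normalizer.normalize_single(
    name="mystack-processor-AB12CD34EF",
    resource_type="AWS::Lambda::Function",
    tags={"aws:cloudformation:logical-id": "ProcessorLambda"},
)
assert result.normalized_name == "processorlambda"  # _basic_normalize only lowercases
assert result.confidence == 1.0

# Priority 3: a bare AWS-generated ID cannot be matched without a Name tag.
result = normalizer.normalize_single("subnet-0a1b2c3d", "AWS::EC2::Subnet")
assert result.method == "none" and result.confidence == 0.0

# Priority 4: a trailing hex suffix is stripped by _extract_patterns.
result = normalizer.normalize_single("orders-api-4f9e8d7c6b5a", "AWS::Lambda::Function")
assert (result.normalized_name, result.method) == ("orders-api", "pattern")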
src/matching/prompts.py
ADDED

@@ -0,0 +1,33 @@
+"""AI prompts for resource name normalization."""
+
+NORMALIZATION_SYSTEM_PROMPT = """You are an AWS resource name normalizer for cross-account infrastructure matching.
+
+Your job is to extract the "logical identity" from AWS resource names by removing:
+- Random suffixes (hex, alphanumeric, CloudFormation-generated)
+- AWS account IDs (12-digit numbers)
+- Region names (us-east-1, eu-west-2, etc.)
+- Stack name prefixes (MyStack-, Stack-, etc.)
+- AWS resource ID prefixes (subnet-, vpc-, vol-, i-, sg-, etc.)
+- Timestamps and dates embedded in names
+
+Keep the meaningful, purpose-identifying parts of the name.
+
+Rules:
+1. Output should be lowercase with hyphens (no underscores, no spaces)
+2. If the name is already clean and meaningful, return it as-is (lowercase)
+3. Preserve the semantic meaning - "policy-executor" not just "executor"
+4. For AWS-generated IDs (subnet-xxx, vpc-xxx), use the Name tag if provided
+5. Strip common AWS service prefixes that don't add meaning
+
+Examples:
+- "cloud-custodian-480738299408-policy-executor-abc123" → "cloud-custodian-policy-executor"
+- "AmazonBedrockExecutionRoleForKnowledgeBase_jnwn1" → "bedrock-knowledge-base-execution-role"
+- "MyStack-ProcessorLambda-XYZ789ABC" → "processor-lambda"
+- "daybreak-transcribe-processor" → "daybreak-transcribe-processor" (already clean)
+- "AWSServiceRoleForOrganizations" → "aws-service-role-organizations"
+- "d-9067239ebb_controllers" → "directory-controllers"
+- Resource with name "subnet-abc123def" and Name tag "Private-Subnet-AZ1" → "private-subnet-az1"
+
+Respond ONLY with valid JSON in this exact format:
+{"normalizations": [{"arn": "arn:aws:...", "normalized_name": "..."}]}
+"""
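The prompt pins down a strict JSON contract on both sides of the call. A small round-trip sketch follows (the ARN and names are made up): the request shape mirrors what _process_ai_batch serializes, and the reply shape is exactly what _parse_ai_response expects.

import json

request = {
    "resources": [
        {
            "arn": "arn:aws:lambda:us-east-1:123456789012:function:MyStack-Proc-AB12CD34EF",
            "name": "MyStack-Proc-AB12CD34EF",
            "type": "AWS::Lambda::Function",
        }
    ]
}
user_prompt = json.dumps(request, indent=2)  # becomes the user message

# A well-formed model reply per the prompt's final instruction:
reply = (
    '{"normalizations": [{"arn": "arn:aws:lambda:us-east-1:123456789012:'
    'function:MyStack-Proc-AB12CD34EF", "normalized_name": "proc"}]}'
)
parsed = json.loads(reply)
assert parsed["normalizations"][0]["normalized_name"] == "proc"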
src/snapshot/capturer.py
CHANGED

@@ -33,6 +33,7 @@ from .resource_collectors.codebuild import CodeBuildCollector
 from .resource_collectors.codepipeline import CodePipelineCollector
 from .resource_collectors.dynamodb import DynamoDBCollector
 from .resource_collectors.ec2 import EC2Collector
+from .resource_collectors.glue import GlueCollector
 from .resource_collectors.ecs import ECSCollector
 from .resource_collectors.efs_collector import EFSCollector
 from .resource_collectors.eks import EKSCollector

@@ -85,6 +86,7 @@ COLLECTOR_REGISTRY: List[Type[BaseResourceCollector]] = [
     CodePipelineCollector,
     CodeBuildCollector,
     BackupCollector,
+    GlueCollector,
 ]
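Appending GlueCollector to COLLECTOR_REGISTRY is all the wiring the new collector needs. Below is a hedged sketch of how such a registry is typically consumed; the driver function, constructor signature, and arguments are illustrative assumptions, not code from this package.

from typing import List, Type

def run_collectors(registry: List[Type], session, region: str) -> list:
    """Hypothetical driver loop: instantiate each registered collector
    and concatenate whatever it collects."""
    resources = []
    for collector_cls in registry:
        collector = collector_cls(session=session, region=region)  # assumed constructor
        resources.extend(collector.collect())
    return resources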
src/snapshot/resource_collectors/glue.py
ADDED

@@ -0,0 +1,199 @@
+"""AWS Glue resource collector."""
+
+from typing import List
+
+from ...models.resource import Resource
+from ...utils.hash import compute_config_hash
+from .base import BaseResourceCollector
+
+
+class GlueCollector(BaseResourceCollector):
+    """Collector for AWS Glue resources (databases, tables, crawlers, jobs)."""
+
+    @property
+    def service_name(self) -> str:
+        return "glue"
+
+    def collect(self) -> List[Resource]:
+        """Collect AWS Glue resources.
+
+        Returns:
+            List of Glue resources (databases, tables, crawlers, jobs)
+        """
+        resources = []
+        client = self._create_client()
+        account_id = self._get_account_id()
+
+        # Collect databases and tables
+        resources.extend(self._collect_databases(client, account_id))
+
+        # Collect crawlers
+        resources.extend(self._collect_crawlers(client, account_id))
+
+        # Collect jobs
+        resources.extend(self._collect_jobs(client, account_id))
+
+        # Collect connections
+        resources.extend(self._collect_connections(client, account_id))
+
+        self.logger.debug(f"Collected {len(resources)} Glue resources in {self.region}")
+        return resources
+
+    def _collect_databases(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue databases and their tables."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_databases")
+            for page in paginator.paginate():
+                for db in page.get("DatabaseList", []):
+                    db_name = db.get("Name")
+                    db_arn = f"arn:aws:glue:{self.region}:{account_id}:database/{db_name}"
+
+                    resource = Resource(
+                        arn=db_arn,
+                        resource_type="AWS::Glue::Database",
+                        name=db_name,
+                        region=self.region,
+                        tags={},  # Glue databases don't support tags directly
+                        config_hash=compute_config_hash(db),
+                        created_at=db.get("CreateTime"),
+                        raw_config=db,
+                    )
+                    resources.append(resource)
+
+                    # Collect tables for this database
+                    resources.extend(self._collect_tables(client, account_id, db_name))
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue databases in {self.region}: {e}")
+
+        return resources
+
+    def _collect_tables(self, client, account_id: str, database_name: str) -> List[Resource]:
+        """Collect tables for a specific database."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_tables")
+            for page in paginator.paginate(DatabaseName=database_name):
+                for table in page.get("TableList", []):
+                    table_name = table.get("Name")
+                    table_arn = f"arn:aws:glue:{self.region}:{account_id}:table/{database_name}/{table_name}"
+
+                    resource = Resource(
+                        arn=table_arn,
+                        resource_type="AWS::Glue::Table",
+                        name=f"{database_name}/{table_name}",
+                        region=self.region,
+                        tags={},  # Glue tables don't support tags directly
+                        config_hash=compute_config_hash(table),
+                        created_at=table.get("CreateTime"),
+                        raw_config=table,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.debug(f"Error collecting tables for database {database_name}: {e}")
+
+        return resources
+
+    def _collect_crawlers(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue crawlers."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_crawlers")
+            for page in paginator.paginate():
+                for crawler in page.get("Crawlers", []):
+                    crawler_name = crawler.get("Name")
+                    crawler_arn = f"arn:aws:glue:{self.region}:{account_id}:crawler/{crawler_name}"
+
+                    # Get tags for crawler
+                    tags = {}
+                    try:
+                        tag_response = client.get_tags(ResourceArn=crawler_arn)
+                        tags = tag_response.get("Tags", {})
+                    except Exception as e:
+                        self.logger.debug(f"Could not get tags for crawler {crawler_name}: {e}")
+
+                    resource = Resource(
+                        arn=crawler_arn,
+                        resource_type="AWS::Glue::Crawler",
+                        name=crawler_name,
+                        region=self.region,
+                        tags=tags,
+                        config_hash=compute_config_hash(crawler),
+                        created_at=crawler.get("CreationTime"),
+                        raw_config=crawler,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue crawlers in {self.region}: {e}")
+
+        return resources
+
+    def _collect_jobs(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue jobs."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_jobs")
+            for page in paginator.paginate():
+                for job in page.get("Jobs", []):
+                    job_name = job.get("Name")
+                    job_arn = f"arn:aws:glue:{self.region}:{account_id}:job/{job_name}"
+
+                    # Get tags for job
+                    tags = {}
+                    try:
+                        tag_response = client.get_tags(ResourceArn=job_arn)
+                        tags = tag_response.get("Tags", {})
+                    except Exception as e:
+                        self.logger.debug(f"Could not get tags for job {job_name}: {e}")
+
+                    resource = Resource(
+                        arn=job_arn,
+                        resource_type="AWS::Glue::Job",
+                        name=job_name,
+                        region=self.region,
+                        tags=tags,
+                        config_hash=compute_config_hash(job),
+                        created_at=job.get("CreatedOn"),
+                        raw_config=job,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue jobs in {self.region}: {e}")
+
+        return resources
+
+    def _collect_connections(self, client, account_id: str) -> List[Resource]:
+        """Collect Glue connections."""
+        resources = []
+
+        try:
+            paginator = client.get_paginator("get_connections")
+            for page in paginator.paginate():
+                for conn in page.get("ConnectionList", []):
+                    conn_name = conn.get("Name")
+                    conn_arn = f"arn:aws:glue:{self.region}:{account_id}:connection/{conn_name}"
+
+                    resource = Resource(
+                        arn=conn_arn,
+                        resource_type="AWS::Glue::Connection",
+                        name=conn_name,
+                        region=self.region,
+                        tags={},  # Connections don't support tags
+                        config_hash=compute_config_hash(conn),
+                        created_at=conn.get("CreationTime"),
+                        raw_config=conn,
+                    )
+                    resources.append(resource)
+
+        except Exception as e:
+            self.logger.error(f"Error collecting Glue connections in {self.region}: {e}")
+
+        return resources
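Each _collect_* method above follows the same shape: a boto3 paginator over one Glue API, per-item ARN construction, and a broad except so one failing call doesn't abort the snapshot. Below is a standalone sketch of that pagination pattern against boto3 directly; the region is a placeholder and credentials come from the default boto3 chain.

import boto3

client = boto3.client("glue", region_name="us-east-1")  # placeholder region

names = []
paginator = client.get_paginator("get_crawlers")
for page in paginator.paginate():
    for crawler in page.get("Crawlers", []):
        names.append(crawler.get("Name"))
print(f"Found {len(names)} Glue crawlers")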