awslabs.terraform-mcp-server 1.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. awslabs/__init__.py +17 -0
  2. awslabs/terraform_mcp_server/__init__.py +17 -0
  3. awslabs/terraform_mcp_server/impl/resources/__init__.py +25 -0
  4. awslabs/terraform_mcp_server/impl/resources/terraform_aws_provider_resources_listing.py +66 -0
  5. awslabs/terraform_mcp_server/impl/resources/terraform_awscc_provider_resources_listing.py +69 -0
  6. awslabs/terraform_mcp_server/impl/tools/__init__.py +33 -0
  7. awslabs/terraform_mcp_server/impl/tools/execute_terraform_command.py +223 -0
  8. awslabs/terraform_mcp_server/impl/tools/execute_terragrunt_command.py +320 -0
  9. awslabs/terraform_mcp_server/impl/tools/run_checkov_scan.py +376 -0
  10. awslabs/terraform_mcp_server/impl/tools/search_aws_provider_docs.py +691 -0
  11. awslabs/terraform_mcp_server/impl/tools/search_awscc_provider_docs.py +641 -0
  12. awslabs/terraform_mcp_server/impl/tools/search_specific_aws_ia_modules.py +458 -0
  13. awslabs/terraform_mcp_server/impl/tools/search_user_provided_module.py +349 -0
  14. awslabs/terraform_mcp_server/impl/tools/utils.py +572 -0
  15. awslabs/terraform_mcp_server/models/__init__.py +49 -0
  16. awslabs/terraform_mcp_server/models/models.py +381 -0
  17. awslabs/terraform_mcp_server/scripts/generate_aws_provider_resources.py +1240 -0
  18. awslabs/terraform_mcp_server/scripts/generate_awscc_provider_resources.py +1039 -0
  19. awslabs/terraform_mcp_server/scripts/scrape_aws_terraform_best_practices.py +143 -0
  20. awslabs/terraform_mcp_server/server.py +440 -0
  21. awslabs/terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md +3125 -0
  22. awslabs/terraform_mcp_server/static/AWS_PROVIDER_RESOURCES.md +3833 -0
  23. awslabs/terraform_mcp_server/static/AWS_TERRAFORM_BEST_PRACTICES.md +2523 -0
  24. awslabs/terraform_mcp_server/static/MCP_INSTRUCTIONS.md +142 -0
  25. awslabs/terraform_mcp_server/static/TERRAFORM_WORKFLOW_GUIDE.md +330 -0
  26. awslabs/terraform_mcp_server/static/__init__.py +38 -0
  27. awslabs_terraform_mcp_server-1.0.14.dist-info/METADATA +166 -0
  28. awslabs_terraform_mcp_server-1.0.14.dist-info/RECORD +30 -0
  29. awslabs_terraform_mcp_server-1.0.14.dist-info/WHEEL +4 -0
  30. awslabs_terraform_mcp_server-1.0.14.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1039 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Script to generate AWSCC provider resources markdown for the Terraform Expert MCP server.
16
+
17
+ This script scrapes the Terraform AWSCC provider documentation using Playwright
18
+ and generates a comprehensive markdown file listing all AWS service categories,
19
+ resources, and data sources.
20
+
21
+ The generated markdown is saved to the static directory for use by the MCP server.
22
+
23
+ Usage:
24
+ python generate_awscc_provider_resources.py [--max-categories N] [--output PATH]
25
+
26
+ Options:
27
+ --max-categories N Limit to N categories (default: all)
28
+ --output PATH Output file path (default: awslabs/terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md under the repository root)
29
+ --no-fallback Don't use fallback data if scraping fails
30
+ """
31
+
32
+ import argparse
33
+ import asyncio
34
+ import os
35
+ import re
36
+ import sys
37
+ import tempfile
38
+ import time
39
+ from bs4 import BeautifulSoup, Tag
40
+ from bs4.element import PageElement, ResultSet
41
+ from bs4.filter import SoupStrainer
42
+ from datetime import datetime
43
+ from loguru import logger
44
+ from pathlib import Path
45
+ from typing import Any, Optional, TypeVar
46
+
47
+
48
# Generic type variable declared alongside the BeautifulSoup typing helpers below.
# NOTE(review): not referenced by any helper visible in this part of the file -
# confirm it is still needed before removing.
T = TypeVar('T')
50
+
51
+
52
def ensure_tag(element: Optional[PageElement]) -> Optional[Tag]:
    """Narrow a BeautifulSoup page element to a ``Tag``.

    Args:
        element: Any page element (or ``None``) returned by a BeautifulSoup lookup.

    Returns:
        ``element`` itself when it is a ``Tag`` instance, otherwise ``None``.
    """
    return element if isinstance(element, Tag) else None
57
+
58
+
59
def safe_find(element: Any, *args: Any, **kwargs: Any) -> Optional[Tag]:
    """Run ``Tag.find`` without failing on non-Tag inputs.

    Args:
        element: Object to search in; anything that is not a ``Tag`` yields ``None``.
        *args: Positional arguments forwarded to ``Tag.find``.
        **kwargs: Keyword arguments forwarded to ``Tag.find``.

    Returns:
        The first match when it is a ``Tag``, otherwise ``None``.
    """
    if isinstance(element, Tag):
        match = element.find(*args, **kwargs)
        # Only hand back real tags; text nodes and misses both become None.
        if isinstance(match, Tag):
            return match
    return None
65
+
66
+
67
def safe_find_all(element: Any, *args: Any, **kwargs: Any) -> ResultSet:
    """Run ``Tag.find_all`` without failing on non-Tag inputs.

    Args:
        element: Object to search in.
        *args: Positional arguments forwarded to ``Tag.find_all``.
        **kwargs: Keyword arguments forwarded to ``Tag.find_all``.

    Returns:
        The ``find_all`` result for a ``Tag``; an empty ``ResultSet`` for anything else.
    """
    if isinstance(element, Tag):
        return element.find_all(*args, **kwargs)
    # Non-Tag input: return an empty result so callers can iterate unconditionally.
    return ResultSet(SoupStrainer(), [])
72
+
73
+
74
def safe_get_text(element: Any, strip: bool = False) -> str:
    """Return the text of an element, tolerating objects without ``get_text``.

    Args:
        element: Object to read text from.
        strip: Forwarded to ``get_text`` as its ``strip`` argument.

    Returns:
        ``element.get_text(strip=strip)`` when the object has a ``get_text``
        attribute; an empty string for ``None``; ``str(element)`` otherwise.
    """
    if not hasattr(element, 'get_text'):
        if element is None:
            return ''
        return str(element)
    return element.get_text(strip=strip)
79
+
80
+
81
# Playwright is an optional dependency. When it cannot be imported the name is
# bound to None and the scraper uses the pre-defined fallback data instead.
try:
    from playwright.async_api import async_playwright
except ImportError:
    async_playwright = None

# Put the directory three levels above this script at the front of sys.path
# so that terraform_mcp_server can be imported from it.
script_dir = Path(__file__).resolve().parent
repo_root = script_dir.parent.parent.parent
sys.path.insert(0, str(repo_root))


# Route all log output to stderr with backtraces and variable diagnostics enabled.
logger.configure(
    handlers=[
        dict(
            sink=sys.stderr,
            backtrace=True,
            diagnose=True,
            format='<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>',
        )
    ]
)

# USE_PLAYWRIGHT env var: '1', 'true' or 'yes' (case-insensitive) enables scraping;
# any other value goes straight to the fallback data. Defaults to enabled.
USE_PLAYWRIGHT = os.getenv('USE_PLAYWRIGHT', '1').lower() in {'1', 'true', 'yes'}
# Page timeout in milliseconds (20 seconds), kept short so failures surface quickly
NAVIGATION_TIMEOUT = 20000
# Where the generated markdown is written by default
DEFAULT_OUTPUT_PATH = repo_root.joinpath(
    'awslabs', 'terraform_mcp_server', 'static', 'AWSCC_PROVIDER_RESOURCES.md'
)
# Landing page of the AWSCC provider documentation
AWSCC_PROVIDER_URL = 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs'
116
+
117
+
118
# Breadcrumb element that normally renders the text "Version X.Y.Z".
_VERSION_BREADCRUMB_SELECTOR = 'body > div.provider-view > div.provider-nav > nav.bread-crumbs.is-light > div > div > ul > li:nth-child(4) > span'

# Reads the version text from the version dropdown button, or null when absent.
_VERSION_DROPDOWN_JS = """
() => {
    const versionEl = document.querySelector('.version-dropdown button span');
    return versionEl ? versionEl.innerText.trim() : null;
}
"""

# Tries several selectors and returns the first text that mentions "Version".
# NOTE(review): 'span:contains("Version")' is jQuery syntax, not CSS; querySelector
# presumably throws on it and the try/catch swallows the error - confirm and drop.
_VERSION_ALTERNATIVES_JS = """
() => {
    // Try different selectors that might contain the version
    const selectors = [
        '.version-dropdown button span',
        '.dropdown-trigger button span',
        'span:contains("Version")'
    ];
    for (const selector of selectors) {
        try {
            const el = document.querySelector(selector);
            if (el && el.innerText.includes('Version')) {
                return el.innerText.trim();
            }
        } catch (e) {}
    }
    return null;
}
"""

# Removes the cookie consent banner node from the DOM.
_REMOVE_CONSENT_BANNER_JS = """() => {
    const banner = document.getElementById('consent-banner');
    if (banner) banner.remove();
    return true;
}"""

# Removes every element whose id or class mentions banner/overlay/popup.
_REMOVE_OVERLAYS_JS = """() => {
    // Remove common overlay patterns
    document.querySelectorAll('[id*="banner"],[id*="overlay"],[id*="popup"],[class*="banner"],[class*="overlay"],[class*="popup"]')
        .forEach(el => el.remove());
    return true;
}"""

# Buttons that may dismiss the consent banner, most specific first.
_DISMISS_BUTTON_SELECTORS = (
    'button.hds-button:has-text("Dismiss")',
    'button.hds-button .hds-button__text:has-text("Dismiss")',
    'button.hds-button--color-primary',
)

# Candidate containers for the documentation navigation menu, in order of preference.
_MENU_SELECTORS = (
    '.provider-docs-menu-content',
    'nav',
    '.docs-nav',
    'aside',
    'ul.nav',
    'div[role="navigation"]',
)

# Menu entries that are not service categories.
_SKIPPED_CATEGORY_NAMES = ('guides', 'functions', 'awscc provider')

_CATEGORY_TITLE_CLASS = 'menu-list-category-link-title'


def _fallback_result():
    """Build the result returned whenever live scraping is unavailable or fails."""
    return {'categories': get_fallback_resource_data(), 'version': 'unknown'}


def _absolute_url(href: Any) -> Any:
    """Prefix site-relative hrefs with the registry origin; return others unchanged."""
    if isinstance(href, str) and href.startswith('/'):
        return f'https://registry.terraform.io{href}'
    return href


def _looks_like_resource(name: Any, href: Any) -> bool:
    """Return True for an ``awscc_`` link name that does not contain ``_data_``."""
    return (
        isinstance(name, str) and name.startswith('awscc_') and '_data_' not in name.lower()
    )


def _looks_like_data_source(name: Any, href: Any) -> bool:
    """Return True for an ``awscc_`` link whose URL or name contains ``data``.

    NOTE(review): this substring test is the original heuristic and is loose -
    any name or URL containing "data" matches. Kept unchanged; consider a
    stricter URL-based check once the registry link format is confirmed.
    """
    if not isinstance(name, str) or not name.startswith('awscc_'):
        return False
    return (isinstance(href, str) and 'data' in href.lower()) or 'data' in name.lower()


async def _extract_provider_version(page: Any) -> str:
    """Read the provider version from the rendered page.

    Returns:
        The numeric version string, or ``'unknown'`` when no version number
        could be extracted. Unlike the previous inline code this never
        returns ``None`` or raw non-numeric text.
    """
    try:
        logger.info('Attempting to extract AWSCC provider version')
        version_element = await page.query_selector(_VERSION_BREADCRUMB_SELECTOR)
        if version_element:
            version_text = await version_element.inner_text()
            logger.debug(f'Found version element with text: {version_text}')
            match = re.search(r'Version\s+([0-9.]+)', version_text)
            if match:
                logger.info(f'Extracted AWSCC provider version: {match.group(1)}')
                return match.group(1)
            logger.debug("Regex pattern didn't match, trying JavaScript approach")
            raw_text = await page.evaluate(_VERSION_DROPDOWN_JS)
            origin = 'JavaScript'
        else:
            logger.debug('Specific version selector not found, trying alternative selectors')
            raw_text = await page.evaluate(_VERSION_ALTERNATIVES_JS)
            origin = 'alternative selector'

        # The JavaScript helpers return null when nothing is found; treat that
        # (and any text without a number) as "unknown" instead of leaking it.
        match = re.search(r'([0-9.]+)', raw_text) if isinstance(raw_text, str) else None
        if match:
            logger.info(f'Extracted AWSCC provider version via {origin}: {match.group(1)}')
            return match.group(1)
    except Exception as version_error:
        logger.warning(f'Error extracting AWSCC provider version: {version_error}')
    return 'unknown'


async def _dismiss_consent_banner(page: Any) -> None:
    """Best-effort removal of the cookie consent banner; never raises."""
    logger.info('Checking for cookie consent banner')
    try:
        if not await page.query_selector('#consent-banner'):
            return
        logger.info('Cookie consent banner detected, attempting to dismiss')

        for selector in _DISMISS_BUTTON_SELECTORS:
            try:
                button = await page.query_selector(selector)
                if not button:
                    continue
                logger.info(f'Found dismiss button with selector: {selector}')
                await button.click()
                logger.info('Clicked the dismiss button')

                # Give the banner a moment to disappear before re-checking
                await asyncio.sleep(1)
                if not await page.query_selector('#consent-banner'):
                    logger.info('Banner successfully dismissed')
                    break
            except Exception as button_error:
                logger.warning(f'Failed to click button {selector}: {button_error}')

        # Clicking did not help: delete the node directly
        if await page.query_selector('#consent-banner'):
            logger.info('Attempting to remove banner via JavaScript')
            try:
                await page.evaluate(_REMOVE_CONSENT_BANNER_JS)
                logger.info('Removed banner using JavaScript')
            except Exception as js_error:
                logger.warning(f'Failed to remove banner via JavaScript: {js_error}')
    except Exception as banner_error:
        logger.warning(f'Error handling consent banner: {banner_error}')


async def _remove_reappeared_banner(page: Any) -> None:
    """Delete the consent banner if it showed up again; errors are only logged."""
    try:
        if await page.query_selector('#consent-banner'):
            logger.info('Cookie consent banner detected again, removing via JavaScript')
            await page.evaluate(_REMOVE_CONSENT_BANNER_JS)
    except Exception as banner_error:
        logger.debug(f'Ignoring error during extra banner check: {banner_error}')


async def _wait_for_menu(page: Any) -> None:
    """Wait (5s per selector) until one of the candidate menu containers exists."""
    for selector in _MENU_SELECTORS:
        try:
            logger.info(f'Trying to locate element with selector: {selector}')
            await page.wait_for_selector(selector, timeout=5000)
            logger.info(f'Found element with selector: {selector}')
            return
        except Exception as se:
            logger.warning(f"Selector '{selector}' not found: {se}")


def _save_debug_html(content: str) -> None:
    """Write the rendered HTML to a persistent temp file for debugging.

    The file is intentionally not deleted. Failure to write it is logged and
    ignored so that a debugging aid cannot abort the scrape.
    """
    try:
        with tempfile.NamedTemporaryFile(
            prefix='terraform_awscc_debug_playwright_',
            suffix='.html',
            mode='w',
            encoding='utf-8',
            delete=False,
        ) as temp_file:
            temp_file.write(content)
            debug_file_path = temp_file.name
        logger.debug(f'Saved rendered HTML content to {debug_file_path}')
    except OSError as save_error:
        logger.warning(f'Could not save rendered HTML for debugging: {save_error}')


def _locate_menu(soup: Any) -> Optional[Tag]:
    """Find the navigation menu element in the parsed landing page.

    Tries the known selectors first and then falls back to the div/nav/ul
    element containing the most links (more than 10). Returns ``None`` when
    nothing qualifies.
    """
    menu = soup.select_one('.provider-docs-menu-content')
    if menu is not None:
        return menu
    logger.warning("Couldn't find the .provider-docs-menu-content element, trying alternatives")

    for selector in _MENU_SELECTORS:
        menu = soup.select_one(selector)
        if menu is not None:
            logger.info(f'Found menu content with selector: {selector}')
            return menu

    logger.warning("Still couldn't find navigation using standard selectors")
    best_elem = None
    best_link_count = 10  # an element needs more than 10 links to qualify
    for elem in safe_find_all(soup, ['div', 'nav', 'ul']):
        link_count = len(safe_find_all(elem, 'a'))
        # Strict '>' keeps the first element on ties, as a stable sort would
        if link_count > best_link_count:
            best_elem, best_link_count = elem, link_count
    if best_elem is not None:
        logger.info(f'Using element with {best_link_count} links as menu')
    return best_elem


async def _click_category(page: Any, category_name: str, href: Any) -> None:
    """Click a category entry, retrying up to three times.

    Prefers the ``a[href=...]`` selector when an href is known and falls back
    to a text selector. Between failed attempts, overlay-like elements are
    removed from the page.

    Raises:
        Exception: the last click error when every attempt failed.
    """
    escaped_name = category_name.replace("'", "\\'")
    text_selector = f"text='{escaped_name}'"
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            if href:
                href_selector = f"a[href='{href}']"
                try:
                    await page.click(href_selector, timeout=8000)
                    logger.debug(f'Clicked category using href selector: {href_selector}')
                    return
                except Exception as href_error:
                    logger.warning(f'Failed to click using href, trying text: {href_error}')
            await page.click(text_selector, timeout=8000)
            return
        except Exception as click_error:
            logger.warning(
                f'Click attempt {attempt} failed for {category_name}: {click_error}'
            )
            if attempt >= max_attempts:
                logger.error(
                    f'Failed to click category {category_name} after {max_attempts} attempts'
                )
                raise

            # Something may be covering the link: clear overlays before retrying
            try:
                await page.evaluate(_REMOVE_OVERLAYS_JS)
                await asyncio.sleep(0.5)
            except Exception as overlay_error:
                logger.debug(f'Ignoring error during overlay removal: {overlay_error}')


def _find_section(menu_list: Tag, title: str, matches: Any) -> Optional[Tag]:
    """Locate the ``<ul>`` holding the links of one section of a category.

    First looks for the title span with text ``title`` and takes the
    ``ul.menu-list`` inside its parent ``<li>``. If that fails, returns the
    first ``<ul>`` containing a link for which ``matches(name, href)`` is true.
    """
    for span in safe_find_all(menu_list, 'span', class_=_CATEGORY_TITLE_CLASS):
        if safe_get_text(span, strip=True) == title:
            section = safe_find(ensure_tag(span.find_parent('li')), 'ul', class_='menu-list')
            if section is not None:
                return section
            break

    for ul in safe_find_all(menu_list, 'ul'):
        for link in safe_find_all(ul, 'a'):
            href = link.get('href', '') if isinstance(link, Tag) else ''
            if matches(safe_get_text(link, strip=True), href):
                return ul
    return None


def _collect_entries(section: Tag, matches: Any, entry_type: str) -> list:
    """Turn the links of a section into ``{'name', 'url', 'type'}`` dictionaries.

    Links without an href or text, and links rejected by ``matches(name, href)``,
    are skipped.
    """
    # Prefer the li.menu-list-link items; fall back to bare anchors
    items = safe_find_all(section, 'li', class_='menu-list-link') or safe_find_all(
        section, 'a'
    )
    entries = []
    for item in items:
        is_anchor = isinstance(item, Tag) and item.name == 'a'
        link = item if is_anchor else safe_find(item, 'a')
        if link is None:
            continue
        href = link.get('href')
        name = safe_get_text(link, strip=True)
        if not href or not name or not matches(name, href):
            continue
        entries.append({'name': name, 'url': _absolute_url(href), 'type': entry_type})
    return entries


def _extract_category_entries(html: str, category_name: str) -> Optional[tuple]:
    """Extract resources and data sources of one expanded category.

    Args:
        html: Page HTML captured after the category was clicked.
        category_name: Exact title text of the category.

    Returns:
        ``(resources, data_sources)`` lists, or ``None`` when the category's
        menu could not be located in the HTML (a warning is logged).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # The DOM changes after clicking, so find the category again by its title
    title_span = None
    for span in safe_find_all(soup, 'span', class_=_CATEGORY_TITLE_CLASS):
        if safe_get_text(span, strip=True) == category_name:
            title_span = span
            break
    if title_span is None:
        logger.warning(f'Could not find clicked category {category_name} in updated DOM')
        return None

    parent_li = ensure_tag(title_span.find_parent('li'))
    if parent_li is None:
        logger.warning(f'Could not find parent LI for category {category_name}')
        return None

    menu_list = safe_find(parent_li, 'ul', class_='menu-list')
    if menu_list is None:
        logger.warning(f'Could not find menu-list for category {category_name}')
        return None

    resources = []
    resource_section = _find_section(menu_list, 'Resources', _looks_like_resource)
    if resource_section is not None:
        resources = _collect_entries(resource_section, _looks_like_resource, 'resource')

    data_sources = []
    data_section = _find_section(menu_list, 'Data Sources', _looks_like_data_source)
    if data_section is not None:
        data_sources = _collect_entries(data_section, _looks_like_data_source, 'data_source')

    # Last resort: classify every awscc_ link in the category by its URL
    if not resources and not data_sources:
        for link in safe_find_all(menu_list, 'a'):
            href = link.get('href', '') if isinstance(link, Tag) else ''
            name = safe_get_text(link, strip=True)
            if not isinstance(name, str) or not name.startswith('awscc_'):
                continue
            is_data_source = isinstance(href, str) and 'data' in href.lower()
            entry = {
                'name': name,
                'url': _absolute_url(href),
                'type': 'data_source' if is_data_source else 'resource',
            }
            (data_sources if is_data_source else resources).append(entry)

    return resources, data_sources


async def _scrape_categories(browser: Any) -> Optional[tuple]:
    """Scrape the provider docs with an already launched browser.

    Returns:
        ``(categories, provider_version)`` on success, or ``None`` when the
        page could not be loaded or no navigation menu was found (the caller
        then uses fallback data). The browser is not closed here.
    """
    context = await browser.new_context(
        viewport={'width': 1280, 'height': 800},
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    )
    page = await context.new_page()
    page.set_default_timeout(NAVIGATION_TIMEOUT)

    logger.info(
        f'Navigating to Terraform AWSCC provider documentation (timeout: {NAVIGATION_TIMEOUT}ms)'
    )
    try:
        # 'domcontentloaded' instead of 'networkidle'
        await page.goto(AWSCC_PROVIDER_URL, wait_until='domcontentloaded')
        logger.info('Basic page loaded successfully')
    except Exception as nav_error:
        logger.error(f'Error during navigation: {nav_error}')
        return None

    logger.info('Waiting for page to render completely')
    # Small fixed delay to let JavaScript finish rendering
    await asyncio.sleep(2)

    provider_version = await _extract_provider_version(page)
    await _dismiss_consent_banner(page)
    await _wait_for_menu(page)

    logger.info('Extracting page content')
    content = await page.content()
    _save_debug_html(content)

    menu_content = _locate_menu(BeautifulSoup(content, 'html.parser'))
    if menu_content is None:
        logger.error("Couldn't find any navigation element, using fallback data")
        return None

    category_titles = menu_content.select('.menu-list-category-link-title')
    if not category_titles:
        logger.error("Couldn't find any .menu-list-category-link-title elements")
        return None
    logger.info(f'Found {len(category_titles)} category titles')

    categories_to_process = []
    for category_el in category_titles:
        category_name = category_el.get_text(strip=True)
        if category_name.lower() in _SKIPPED_CATEGORY_NAMES:
            logger.debug(f'Skipping category: {category_name}')
            continue
        logger.debug(f'Will process category: {category_name}')
        categories_to_process.append((category_name, category_el))

    # Optional limit for testing/development
    max_categories = int(os.environ.get('MAX_CATEGORIES', '999'))
    if len(categories_to_process) > max_categories:
        logger.info(
            f'Limiting to {max_categories} categories (from {len(categories_to_process)})'
        )
        categories_to_process = categories_to_process[:max_categories]

    # Create entries only after applying the limit so that skipped categories
    # do not show up as empty entries in the result.
    categories = {name: {'resources': [], 'data_sources': []} for name, _ in categories_to_process}

    total = len(categories_to_process)
    logger.info(f'Processing {total} categories with click interaction')

    for position, (category_name, category_el) in enumerate(categories_to_process, start=1):
        try:
            # Use the href of the enclosing <a>, when there is one, to target the click
            parent_a = category_el.parent
            href = None
            if parent_a is not None and parent_a.name == 'a':
                href = parent_a.get('href')

            logger.info(f'[{position}/{total}] Clicking on category: {category_name}')
            await _remove_reappeared_banner(page)
            await _click_category(page, category_name, href)

            # Wait briefly for the expanded content to render
            await asyncio.sleep(0.3)

            try:
                extracted = _extract_category_entries(await page.content(), category_name)
                if extracted is None:
                    continue
                resources, data_sources = extracted
                categories[category_name]['resources'].extend(resources)
                categories[category_name]['data_sources'].extend(data_sources)
                logger.info(
                    f'Category {category_name}: found {len(resources)} resources, {len(data_sources)} data sources'
                )
            except Exception as extract_error:
                logger.error(f'Error extracting resources for {category_name}: {extract_error}')

        except Exception as click_error:
            # A category that cannot be opened is skipped; the others still run
            logger.warning(f'Error interacting with category {category_name}: {click_error}')

    return categories, provider_version


async def fetch_awscc_provider_page():
    """Fetch the AWSCC provider documentation page using Playwright.

    A headless Chromium browser renders the JavaScript-driven Terraform
    Registry page; every category in the navigation menu is clicked open and
    its resource and data source links are collected.

    Pre-defined fallback data is returned when:
        - the USE_PLAYWRIGHT environment variable disables scraping,
        - Playwright is not installed,
        - the page or its navigation menu cannot be loaded, or
        - any unexpected error occurs while scraping.

    Returns:
        A dictionary containing:
            - 'categories': mapping of category name to
              ``{'resources': [...], 'data_sources': [...]}``, where each item
              is a dict with 'name', 'url' and 'type' keys
            - 'version': AWSCC provider version string (e.g., "1.36.0"), or
              'unknown' when it could not be determined
    """
    if not USE_PLAYWRIGHT or async_playwright is None:
        reason = 'USE_PLAYWRIGHT=0' if not USE_PLAYWRIGHT else 'Playwright is not installed'
        logger.info(f'Skipping Playwright and using pre-defined resource structure ({reason})')
        return _fallback_result()

    logger.info('Starting browser to extract AWSCC provider resources structure')
    start_time = time.time()

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox'],
            )
            try:
                scraped = await _scrape_categories(browser)
            finally:
                # Close the browser on every path, including errors
                await browser.close()
    except Exception as e:
        logger.error(f'Error extracting AWSCC provider resources: {str(e)}')
        return _fallback_result()

    if scraped is None:
        return _fallback_result()
    categories, provider_version = scraped

    service_count = len(categories)
    resource_count = sum(len(cat['resources']) for cat in categories.values())
    data_source_count = sum(len(cat['data_sources']) for cat in categories.values())
    duration = time.time() - start_time
    logger.info(
        f'Extracted {service_count} service categories with {resource_count} resources and {data_source_count} data sources in {duration:.2f} seconds'
    )

    if service_count > 0:
        return {'categories': categories, 'version': provider_version}

    logger.warning('No categories found, using fallback data')
    return _fallback_result()
835
+
836
+
837
def get_fallback_resource_data():
    """Return a small hard-coded AWSCC catalogue for use when scraping fails.

    Returns:
        A dictionary with two categories, 'Resources' and 'Data Sources'.
        Each category maps to a dict holding a 'resources' list and a
        'data_sources' list; every entry in those lists is a dict with
        'name', 'url' and 'type' keys.
    """
    logger.warning('Using pre-defined resource structure as fallback')

    docs_root = 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs'

    def build_entries(suffixes, url_segment, entry_type):
        # Every entry is named 'awscc_<suffix>' and documented at
        # '<docs_root>/<url_segment>/<suffix>'.
        return [
            {
                'name': f'awscc_{suffix}',
                'url': f'{docs_root}/{url_segment}/{suffix}',
                'type': entry_type,
            }
            for suffix in suffixes
        ]

    resource_suffixes = [
        'accessanalyzer_analyzer',
        'acmpca_certificate',
        'acmpca_certificate_authority',
        'acmpca_certificate_authority_activation',
        'acmpca_permission',
        # Add more resources as needed
    ]
    data_source_suffixes = [
        'accessanalyzer_analyzer',
        'accessanalyzer_analyzers',
        # Add more data sources as needed
    ]

    # The fallback groups everything under two top-level categories:
    # one holding only resources and one holding only data sources.
    return {
        'Resources': {
            'resources': build_entries(resource_suffixes, 'resources', 'resource'),
            'data_sources': [],
        },
        'Data Sources': {
            'resources': [],
            'data_sources': build_entries(data_source_suffixes, 'data-sources', 'data_source'),
        },
    }
897
+
898
+
899
def parse_arguments():
    """Build the command line parser for this script and parse the arguments.

    Returns:
        The parsed ``argparse.Namespace`` with the attributes
        ``max_categories`` (int, default 999), ``output`` (Path, default
        ``DEFAULT_OUTPUT_PATH``) and ``no_fallback`` (bool flag).
    """
    cli = argparse.ArgumentParser(
        description='Generate AWSCC provider resources markdown for the Terraform Expert MCP server.'
    )

    # (flag, keyword arguments) pairs, registered in declaration order.
    option_specs = (
        (
            '--max-categories',
            {
                'type': int,
                'default': 999,
                'help': 'Limit to N categories (default: all)',
            },
        ),
        (
            '--output',
            {
                'type': Path,
                'default': DEFAULT_OUTPUT_PATH,
                'help': f'Output file path (default: {DEFAULT_OUTPUT_PATH})',
            },
        ),
        (
            '--no-fallback',
            {
                'action': 'store_true',
                'help': "Don't use fallback data if scraping fails",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
922
+
923
+
924
async def main():
    """Generate the AWSCC provider resources markdown file.

    Parses the command line, exports the chosen options through environment
    variables, fetches the provider data, renders it as markdown and writes
    the result to the requested output path.

    Returns:
        0 when the markdown file was written, 1 if any exception was raised
        while fetching, rendering or writing (the error is printed to stderr).
    """
    started_at = datetime.now()

    # Parse command line arguments
    args = parse_arguments()

    print('Generating AWSCC provider resources markdown...')
    print(f'Output path: {args.output}')
    # 999 is the parser default and is reported as "all"
    max_categories_label = args.max_categories if args.max_categories < 999 else 'all'
    print(f'Max categories: {max_categories_label}')

    # Publish the category limit through the environment
    os.environ['MAX_CATEGORIES'] = str(args.max_categories)

    # Publish the fallback behaviour through the environment
    if args.no_fallback:
        os.environ['USE_PLAYWRIGHT'] = '1'
        print('Using live scraping without fallback')

    try:
        # Fetch AWSCC provider data using the existing implementation
        result = await fetch_awscc_provider_page()

        # Newer results wrap the categories together with a version string;
        # anything else is treated as the bare categories mapping.
        has_envelope = isinstance(result, dict) and 'categories' in result and 'version' in result
        if has_envelope:
            categories, provider_version = result['categories'], result['version']
        else:
            categories, provider_version = result, 'unknown'

        # Category names in alphabetical order
        category_names = sorted(categories.keys())

        # Overall totals
        total_resources = sum(len(entry['resources']) for entry in categories.values())
        total_data_sources = sum(len(entry['data_sources']) for entry in categories.values())
        category_total = len(categories)

        print(
            f'Found {category_total} categories, {total_resources} resources, and {total_data_sources} data sources'
        )

        # Document header
        generated_on = datetime.now().strftime('%B %d, %Y %H:%M:%S')
        lines = [
            '# AWSCC Provider Resources Listing',
            f'\nAWSCC Provider Version: {provider_version}',
            f'\nLast updated: {generated_on}',
            f'\nFound {total_resources} resources and {total_data_sources} data sources across {category_total} AWSCC service categories.\n',
        ]

        # Table of contents generation is currently disabled:
        # markdown.append('## Table of Contents')
        # for category in sorted_categories:
        #     sanitized_category = (
        #         category.replace(' ', '-').replace('(', '').replace(')', '').lower()
        #     )
        #     markdown.append(f'- [{category}](#{sanitized_category})')
        # markdown.append('')

        # One section per category
        for name in category_names:
            entry = categories[name]
            # Parentheses are stripped from the heading text
            heading = name.replace('(', '').replace(')', '')
            lines.append(f'## {heading}')

            resources = entry['resources']
            data_sources = entry['data_sources']

            # Category summary
            lines.append(
                f'\n*{len(resources)} resources and {len(data_sources)} data sources*\n'
            )

            # Resources, sorted by name, only when there are any
            if resources:
                lines.append('### Resources')
                lines.extend(
                    f'- [{item["name"]}]({item["url"]})'
                    for item in sorted(resources, key=lambda item: item['name'])
                )

            # Data sources, sorted by name, only when there are any
            if data_sources:
                lines.append('\n### Data Sources')
                lines.extend(
                    f'- [{item["name"]}]({item["url"]})'
                    for item in sorted(data_sources, key=lambda item: item['name'])
                )

            lines.append('')  # Blank line between categories

        # Generation metadata footer
        elapsed_seconds = (datetime.now() - started_at).total_seconds()
        lines.extend(
            [
                '---',
                '*This document was generated automatically by the AWSCC Provider Resources Generator script.*',
                f'*Generation time: {elapsed_seconds:.2f} seconds*',
            ]
        )

        # Make sure the target directory exists before writing
        args.output.parent.mkdir(parents=True, exist_ok=True)

        document = '\n'.join(lines)
        with open(args.output, 'w', encoding='utf-8') as out_file:
            out_file.write(document)

        print(f'Successfully generated markdown file at: {args.output}')
        print(f'Generation completed in {elapsed_seconds:.2f} seconds')
        return 0

    except Exception as e:
        print(f'Error generating AWSCC provider resources: {e!s}', file=sys.stderr)
        return 1
1036
+
1037
+
1038
if __name__ == '__main__':
    # Run the async entry point to completion and use its return value
    # as the process exit status.
    exit_status = asyncio.run(main())
    sys.exit(exit_status)